block8_amd64.s (7010B)
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
TEXT ·block8(SB), 4, $0-40
    MOVQ state+0(FP), BX
    MOVQ base+8(FP), SI
    MOVQ bufs+16(FP), AX
    MOVQ cache+24(FP), CX
    MOVQ n+32(FP), DX
    MOVQ ·avx256md5consts+0(SB), DI

    // Align cache (which is stack allocated by the compiler)
    // to a 256 bit boundary (ymm register alignment)
    // The cache8 type is deliberately oversized to permit this.
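    // (Equivalent to CX = (CX + 31) &^ 31: adding 31 and then clearing the
    // low five bits rounds the pointer up to the next 32-byte multiple, and
    // masking only the low byte (CL) suffices because alignment depends
    // solely on those low bits.)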
    ADDQ $31, CX
    ANDB $-32, CL

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp Y8
#define tmp2 Y9

#define mask Y10
#define off Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig BX
#define cache CX
#define count DX
#define base SI
#define consts DI

#define prepmask \
    VPXOR mask, mask, mask \
    VPCMPGTD mask, off, mask

#define prep(index) \
    VMOVAPD mask, rtmp2 \
    VPGATHERDD rtmp2, index*4(base)(off*1), mem

#define load(index) \
    VMOVAPD index*32(cache), mem

#define store(index) \
    VMOVAPD mem, index*32(cache)

#define roll(shift, a) \
    VPSLLD $shift, a, rtmp1 \
    VPSRLD $32-shift, a, a \
    VPOR rtmp1, a, a

#define ROUND1(a, b, c, d, index, const, shift) \
    VPXOR c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    prep(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND1load(a, b, c, d, index, const, shift) \
    VXORPD c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    load(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND2(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp2, tmp2 \
    VANDNPD c, tmp, tmp \
    load(index) \
    VPOR tmp, tmp2, tmp2 \
    VMOVAPD c, tmp \
    VPADDD tmp2, a, a \
    VMOVAPD c, tmp2 \
    roll(shift,a) \
    VPADDD b, a, a

#define ROUND3(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    load(index) \
    VPXOR d, tmp, tmp \
    VPXOR b, tmp, tmp \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD b, tmp \
    VPADDD b, a, a

#define ROUND4(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPOR b, tmp, tmp \
    VPXOR c, tmp, tmp \
    VPADDD tmp, a, a \
    load(index) \
    roll(shift,a) \
    VPXOR c, ones, tmp \
    VPADDD b, a, a

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 32(dig), b
    VMOVUPD 64(dig), c
    VMOVUPD 96(dig), d

    // load source buffer offsets
    VMOVUPD (AX), off

    prepmask
    VPCMPEQD ones, ones, ones

loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

    prep(0)
    VMOVAPD d, tmp
    store(0)

    ROUND1(a,b,c,d, 1,0x00, 7)
    store(1)
    ROUND1(d,a,b,c, 2,0x01,12)
    store(2)
    ROUND1(c,d,a,b, 3,0x02,17)
    store(3)
    ROUND1(b,c,d,a, 4,0x03,22)
    store(4)
    ROUND1(a,b,c,d, 5,0x04, 7)
    store(5)
    ROUND1(d,a,b,c, 6,0x05,12)
    store(6)
    ROUND1(c,d,a,b, 7,0x06,17)
    store(7)
    ROUND1(b,c,d,a, 8,0x07,22)
    store(8)
    ROUND1(a,b,c,d, 9,0x08, 7)
    store(9)
    ROUND1(d,a,b,c,10,0x09,12)
    store(10)
    ROUND1(c,d,a,b,11,0x0a,17)
    store(11)
    ROUND1(b,c,d,a,12,0x0b,22)
    store(12)
    ROUND1(a,b,c,d,13,0x0c, 7)
    store(13)
    ROUND1(d,a,b,c,14,0x0d,12)
    store(14)
    ROUND1(c,d,a,b,15,0x0e,17)
    store(15)
    ROUND1load(b,c,d,a, 1,0x0f,22)

    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, 6,0x10, 5)
    ROUND2(d,a,b,c,11,0x11, 9)
    ROUND2(c,d,a,b, 0,0x12,14)
    ROUND2(b,c,d,a, 5,0x13,20)
    ROUND2(a,b,c,d,10,0x14, 5)
    ROUND2(d,a,b,c,15,0x15, 9)
    ROUND2(c,d,a,b, 4,0x16,14)
    ROUND2(b,c,d,a, 9,0x17,20)
    ROUND2(a,b,c,d,14,0x18, 5)
    ROUND2(d,a,b,c, 3,0x19, 9)
    ROUND2(c,d,a,b, 8,0x1a,14)
    ROUND2(b,c,d,a,13,0x1b,20)
    ROUND2(a,b,c,d, 2,0x1c, 5)
    ROUND2(d,a,b,c, 7,0x1d, 9)
    ROUND2(c,d,a,b,12,0x1e,14)
    ROUND2(b,c,d,a, 0,0x1f,20)

    load(5)
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, 8,0x20, 4)
    ROUND3(d,a,b,c,11,0x21,11)
    ROUND3(c,d,a,b,14,0x22,16)
    ROUND3(b,c,d,a, 1,0x23,23)
    ROUND3(a,b,c,d, 4,0x24, 4)
    ROUND3(d,a,b,c, 7,0x25,11)
    ROUND3(c,d,a,b,10,0x26,16)
    ROUND3(b,c,d,a,13,0x27,23)
    ROUND3(a,b,c,d, 0,0x28, 4)
    ROUND3(d,a,b,c, 3,0x29,11)
    ROUND3(c,d,a,b, 6,0x2a,16)
    ROUND3(b,c,d,a, 9,0x2b,23)
    ROUND3(a,b,c,d,12,0x2c, 4)
    ROUND3(d,a,b,c,15,0x2d,11)
    ROUND3(c,d,a,b, 2,0x2e,16)
    ROUND3(b,c,d,a, 0,0x2f,23)

    load(0)
    VPXOR d, ones, tmp

    ROUND4(a,b,c,d, 7,0x30, 6)
    ROUND4(d,a,b,c,14,0x31,10)
    ROUND4(c,d,a,b, 5,0x32,15)
    ROUND4(b,c,d,a,12,0x33,21)
    ROUND4(a,b,c,d, 3,0x34, 6)
    ROUND4(d,a,b,c,10,0x35,10)
    ROUND4(c,d,a,b, 1,0x36,15)
    ROUND4(b,c,d,a, 8,0x37,21)
    ROUND4(a,b,c,d,15,0x38, 6)
    ROUND4(d,a,b,c, 6,0x39,10)
    ROUND4(c,d,a,b,13,0x3a,15)
    ROUND4(b,c,d,a, 4,0x3b,21)
    ROUND4(a,b,c,d,11,0x3c, 6)
    ROUND4(d,a,b,c, 2,0x3d,10)
    ROUND4(c,d,a,b, 9,0x3e,15)
    ROUND4(b,c,d,a, 0,0x3f,21)

    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

    LEAQ 64(base), base
    SUBQ $64, count
    JNE loop

    VMOVUPD a, (dig)
    VMOVUPD b, 32(dig)
    VMOVUPD c, 64(dig)
    VMOVUPD d, 96(dig)

    VZEROUPPER
    RET
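
// Notes on the round macros above: ROUND1 computes the MD5 F round,
// d ^ (b & (c ^ d)); ROUND2 computes G, (d & b) | (~d & c); ROUND3 computes
// H, b ^ c ^ d; and ROUND4 computes I, c ^ (b | ~d). Each macro operates on
// eight 32-bit lanes at once: the message words of the eight input buffers
// are gathered with VPGATHERDD on the first pass and replayed from the
// aligned cache in the later rounds.
//
// The Go-side declaration for this routine lives elsewhere in the package.
// As a minimal sketch (an assumption for illustration, not necessarily the
// package's exact wording), an assembly body like this one is normally
// paired with a stub such as:
//
//     //go:noescape
//     func block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
//
// where state points at the eight digests' a, b, c and d words laid out as
// four consecutive 8-lane vectors (loaded and stored at (dig) through
// 96(dig)), bufs holds the eight per-buffer offsets relative to base that
// feed the gathers, and n is the number of bytes to process, a multiple of
// the 64-byte MD5 block size as implied by the SUBQ $64 loop above.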