gtsocial-umbx

block16_amd64.s (5503B)
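
This file is the 16-way parallel AVX512 MD5 block routine from MinIO's
md5-simd library (per the copyright header below). As a reading aid, here is
a plausible Go-side declaration for ·block16 matching its 40-byte argument
frame; the parameter names mirror the FP references in the assembly, while
the concrete Go types are assumptions, not taken from this file:

	//go:noescape
	func block16(state *uint32, base uintptr, ptrs *uint32, mask uint64, n int)

state points at four 64-byte rows holding a/b/c/d for all 16 lanes,
base+ptrs[i] addresses lane i's input, mask selects the active lanes, and n
is the number of bytes to hash per lane (a multiple of 64).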


// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//+build !noasm,!appengine,gc

// This is the AVX512 implementation of the MD5 block function (16-way parallel)

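// prep gathers message word `index` (one 32-bit word per lane) from the 16
// source buffers addressed by base+ptrs. VPGATHERDD consumes its completion
// mask, so the lane mask is refreshed from kmask into ktmp before each gather.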
#define prep(index) \
	KMOVQ      kmask, ktmp                      \
	VPGATHERDD index*4(base)(ptrs*1), ktmp, mem

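// One step of MD5 round 1 for all 16 lanes. On entry tmp holds c XOR d, so
// VPTERNLOGD $0x6C computes F(b,c,d) = (b AND c) OR (NOT b AND d) in a single
// instruction. The gather of a later message word is interleaved to hide its
// latency, and the trailing VMOVAPD seeds tmp for the next step, whose
// leading VPXORQ completes the new "c XOR d".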
#define ROUND1(a, b, c, d, index, const, shift) \
	VPXORQ     c, tmp, tmp            \
	VPADDD     64*const(consts), a, a \
	VPADDD     mem, a, a              \
	VPTERNLOGD $0x6C, b, d, tmp       \
	prep(index)                       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    c, tmp                 \
	VPADDD     b, a, a

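// Identical to ROUND1 but without the interleaved gather: the final step of
// round 1 consumes word 15, which is already sitting in mem.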
#define ROUND1noload(a, b, c, d, const, shift) \
	VPXORQ     c, tmp, tmp            \
	VPADDD     64*const(consts), a, a \
	VPADDD     mem, a, a              \
	VPTERNLOGD $0x6C, b, d, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    c, tmp                 \
	VPADDD     b, a, a

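// One step of MD5 round 2. tmp enters holding d: VANDNPD forms (NOT d) AND c,
// and VPTERNLOGD $0xEC folds in b to leave G(b,c,d) = (b AND d) OR (c AND NOT d)
// in tmp2. The message words now come from the cache registers (zreg) filled
// during round 1.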
#define ROUND2(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VANDNPD    c, tmp, tmp            \
	VPTERNLOGD $0xEC, b, tmp, tmp2    \
	VMOVAPD    c, tmp                 \
	VPADDD     tmp2, a, a             \
	VMOVAPD    c, tmp2                \
	VPROLD     $shift, a, a           \
	VPADDD     b, a, a

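// One step of MD5 round 3. $0x96 is the three-way XOR truth table, so with
// tmp entering as c this computes H(b,c,d) = b XOR c XOR d; VMOVAPD b, tmp
// seeds the next step, where b takes the role of c.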
#define ROUND3(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VPTERNLOGD $0x96, b, d, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    b, tmp                 \
	VPADDD     b, a, a

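// One step of MD5 round 4. tmp enters holding NOT d, and $0x36 computes
// I(b,c,d) = c XOR (b OR NOT d); the trailing VPXORQ against the all-ones
// register prepares NOT c, which is the next step's "NOT d".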
#define ROUND4(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VPTERNLOGD $0x36, b, c, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VPXORQ     c, ones, tmp           \
	VPADDD     b, a, a

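// block16 hashes n bytes from each of 16 input buffers in parallel. The frame
// holds 40 bytes of arguments (see the FP offsets below); textflag 4 is NOSPLIT.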
TEXT ·block16(SB), 4, $0-40

	MOVQ  state+0(FP), BX
	MOVQ  base+8(FP), SI
	MOVQ  ptrs+16(FP), AX
	KMOVQ mask+24(FP), K1
	MOVQ  n+32(FP), DX
	MOVQ  ·avx512md5consts+0(SB), DI

#define a Z0
#define b Z1
#define c Z2
#define d Z3

#define sa Z4
#define sb Z5
#define sc Z6
#define sd Z7

#define tmp       Z8
#define tmp2      Z9
#define ptrs     Z10
#define ones     Z12
#define mem      Z15

#define kmask  K1
#define ktmp   K3

// ----------------------------------------------------------
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------

#define dig    BX
#define count  DX
#define base   SI
#define consts DI

	// load digest into state registers
	VMOVUPD (dig), a
	VMOVUPD 0x40(dig), b
	VMOVUPD 0x80(dig), c
	VMOVUPD 0xc0(dig), d

	// load source pointers
	VMOVUPD 0x00(AX), ptrs

	MOVQ         $-1, AX
	VPBROADCASTQ AX, ones

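	// each loop iteration hashes one 64-byte block per lane; the state
	// entering the block is saved in sa..sd for the feed-forward at the end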
loop:
	VMOVAPD a, sa
	VMOVAPD b, sb
	VMOVAPD c, sc
	VMOVAPD d, sd

	prep(0)
	VMOVAPD d, tmp
	VMOVAPD mem, Z16

	ROUND1(a,b,c,d, 1,0x00, 7)
	VMOVAPD mem, Z17
	ROUND1(d,a,b,c, 2,0x01,12)
	VMOVAPD mem, Z18
	ROUND1(c,d,a,b, 3,0x02,17)
	VMOVAPD mem, Z19
	ROUND1(b,c,d,a, 4,0x03,22)
	VMOVAPD mem, Z20
	ROUND1(a,b,c,d, 5,0x04, 7)
	VMOVAPD mem, Z21
	ROUND1(d,a,b,c, 6,0x05,12)
	VMOVAPD mem, Z22
	ROUND1(c,d,a,b, 7,0x06,17)
	VMOVAPD mem, Z23
	ROUND1(b,c,d,a, 8,0x07,22)
	VMOVAPD mem, Z24
	ROUND1(a,b,c,d, 9,0x08, 7)
	VMOVAPD mem, Z25
	ROUND1(d,a,b,c,10,0x09,12)
	VMOVAPD mem, Z26
	ROUND1(c,d,a,b,11,0x0a,17)
	VMOVAPD mem, Z27
	ROUND1(b,c,d,a,12,0x0b,22)
	VMOVAPD mem, Z28
	ROUND1(a,b,c,d,13,0x0c, 7)
	VMOVAPD mem, Z29
	ROUND1(d,a,b,c,14,0x0d,12)
	VMOVAPD mem, Z30
	ROUND1(c,d,a,b,15,0x0e,17)
	VMOVAPD mem, Z31

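	// all 16 message words are now cached in Z16-Z31 (word 15 is still in
	// mem), so rounds 2-4 need no further gathers from the input buffers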
	ROUND1noload(b,c,d,a, 0x0f,22)

	VMOVAPD d, tmp
	VMOVAPD d, tmp2

	ROUND2(a,b,c,d, Z17,0x10, 5)
	ROUND2(d,a,b,c, Z22,0x11, 9)
	ROUND2(c,d,a,b, Z27,0x12,14)
	ROUND2(b,c,d,a, Z16,0x13,20)
	ROUND2(a,b,c,d, Z21,0x14, 5)
	ROUND2(d,a,b,c, Z26,0x15, 9)
	ROUND2(c,d,a,b, Z31,0x16,14)
	ROUND2(b,c,d,a, Z20,0x17,20)
	ROUND2(a,b,c,d, Z25,0x18, 5)
	ROUND2(d,a,b,c, Z30,0x19, 9)
	ROUND2(c,d,a,b, Z19,0x1a,14)
	ROUND2(b,c,d,a, Z24,0x1b,20)
	ROUND2(a,b,c,d, Z29,0x1c, 5)
	ROUND2(d,a,b,c, Z18,0x1d, 9)
	ROUND2(c,d,a,b, Z23,0x1e,14)
	ROUND2(b,c,d,a, Z28,0x1f,20)

	VMOVAPD c, tmp

	ROUND3(a,b,c,d, Z21,0x20, 4)
	ROUND3(d,a,b,c, Z24,0x21,11)
	ROUND3(c,d,a,b, Z27,0x22,16)
	ROUND3(b,c,d,a, Z30,0x23,23)
	ROUND3(a,b,c,d, Z17,0x24, 4)
	ROUND3(d,a,b,c, Z20,0x25,11)
	ROUND3(c,d,a,b, Z23,0x26,16)
	ROUND3(b,c,d,a, Z26,0x27,23)
	ROUND3(a,b,c,d, Z29,0x28, 4)
	ROUND3(d,a,b,c, Z16,0x29,11)
	ROUND3(c,d,a,b, Z19,0x2a,16)
	ROUND3(b,c,d,a, Z22,0x2b,23)
	ROUND3(a,b,c,d, Z25,0x2c, 4)
	ROUND3(d,a,b,c, Z28,0x2d,11)
	ROUND3(c,d,a,b, Z31,0x2e,16)
	ROUND3(b,c,d,a, Z18,0x2f,23)

	VPXORQ d, ones, tmp

	ROUND4(a,b,c,d, Z16,0x30, 6)
	ROUND4(d,a,b,c, Z23,0x31,10)
	ROUND4(c,d,a,b, Z30,0x32,15)
	ROUND4(b,c,d,a, Z21,0x33,21)
	ROUND4(a,b,c,d, Z28,0x34, 6)
	ROUND4(d,a,b,c, Z19,0x35,10)
	ROUND4(c,d,a,b, Z26,0x36,15)
	ROUND4(b,c,d,a, Z17,0x37,21)
	ROUND4(a,b,c,d, Z24,0x38, 6)
	ROUND4(d,a,b,c, Z31,0x39,10)
	ROUND4(c,d,a,b, Z22,0x3a,15)
	ROUND4(b,c,d,a, Z29,0x3b,21)
	ROUND4(a,b,c,d, Z20,0x3c, 6)
	ROUND4(d,a,b,c, Z27,0x3d,10)
	ROUND4(c,d,a,b, Z18,0x3e,15)
	ROUND4(b,c,d,a, Z25,0x3f,21)

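	// feed-forward: add the state saved at the top of the block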
	VPADDD sa, a, a
	VPADDD sb, b, b
	VPADDD sc, c, c
	VPADDD sd, d, d

	LEAQ 64(base), base
	SUBQ $64, count
	JNE  loop

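	// store the updated digests in the same four-row layout used by the load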
	VMOVUPD a, (dig)
	VMOVUPD b, 0x40(dig)
	VMOVUPD c, 0x80(dig)
	VMOVUPD d, 0xc0(dig)

	VZEROUPPER
	RET