block16_amd64.s (5503B)
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//+build !noasm,!appengine,gc

// This is the AVX512 implementation of the MD5 block function (16-way parallel)
//
// One MD5 stream is processed per 32-bit lane of a 512-bit ZMM register, so a
// single pass over a 64-byte block advances 16 independent digests at once.
// The per-step message words are fetched with masked gathers (one dword per
// lane) and cached in Z16-Z31 during round 1 so that rounds 2-4 never touch
// memory again for that block.

// Gather the 32-bit message word number `index` of the current 64-byte block
// for all 16 lanes into `mem`. Each lane's address is base + ptrs[lane] +
// index*4. The lane mask is copied from kmask into ktmp first because
// VPGATHERDD consumes (zeroes) its mask register as lanes complete.
#define prep(index) \
	KMOVQ      kmask, ktmp                       \
	VPGATHERDD index*4(base)(ptrs*1), ktmp, mem

// One step of MD5 round 1 across all 16 lanes:
//   a += F(b,c,d) + msg + K[const];  a = ROL32(a, shift);  a += b
// Invariant: tmp holds c^d on entry (seeded by the trailing VMOVAPD of the
// previous step XORed with c here), so VPTERNLOGD $0x6C evaluates
// F(b,c,d) = d ^ (b & (c ^ d)) in a single instruction. The gather of the
// NEXT message word (prep) is interleaved mid-step to hide gather latency.
// K[const] lives at a 64-byte stride in the consts table (16 lanes x 4 bytes).
#define ROUND1(a, b, c, d, index, const, shift) \
	VPXORQ     c, tmp, tmp            \
	VPADDD     64*const(consts), a, a \
	VPADDD     mem, a, a              \
	VPTERNLOGD $0x6C, b, d, tmp       \
	prep(index)                       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    c, tmp                 \
	VPADDD     b, a, a

// Final round-1 step (message word 15): identical to ROUND1 but with no
// interleaved gather, since all 16 words of this block are already in flight.
#define ROUND1noload(a, b, c, d, const, shift) \
	VPXORQ     c, tmp, tmp            \
	VPADDD     64*const(consts), a, a \
	VPADDD     mem, a, a              \
	VPTERNLOGD $0x6C, b, d, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    c, tmp                 \
	VPADDD     b, a, a

// One step of MD5 round 2:
//   a += G(b,c,d) + zreg + K[const];  a = ROL32(a, shift);  a += b
// Invariant: tmp and tmp2 both hold d on entry (seeded before the round and
// re-seeded with c — the next step's d — at the end of each step).
// VANDNPD forms tmp = ~d & c, then VPTERNLOGD $0xEC combines it into
// tmp2 = (b & d) | (~d & c) = G(b,c,d). zreg is the message word cached
// during round 1.
#define ROUND2(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VANDNPD    c, tmp, tmp            \
	VPTERNLOGD $0xEC, b, tmp, tmp2    \
	VMOVAPD    c, tmp                 \
	VPADDD     tmp2, a, a             \
	VMOVAPD    c, tmp2                \
	VPROLD     $shift, a, a           \
	VPADDD     b, a, a

// One step of MD5 round 3:
//   a += H(b,c,d) + zreg + K[const];  a = ROL32(a, shift);  a += b
// Invariant: tmp holds c on entry (seeded before the round, then re-seeded
// with b, which becomes the next step's c). VPTERNLOGD $0x96 is the
// three-input XOR, giving H(b,c,d) = b ^ c ^ d directly.
#define ROUND3(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VPTERNLOGD $0x96, b, d, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VMOVAPD    b, tmp                 \
	VPADDD     b, a, a

// One step of MD5 round 4:
//   a += I(b,c,d) + zreg + K[const];  a = ROL32(a, shift);  a += b
// Invariant: tmp holds ~d on entry (seeded with VPXORQ against `ones` before
// the round, and refreshed at the end of each step with ~c — the next step's
// ~d). VPTERNLOGD $0x36 then yields I(b,c,d) = c ^ (b | ~d). The `d` argument
// is unused here; it is kept only so call sites rotate uniformly.
#define ROUND4(a, b, c, d, zreg, const, shift) \
	VPADDD     64*const(consts), a, a \
	VPADDD     zreg, a, a             \
	VPTERNLOGD $0x36, b, c, tmp       \
	VPADDD     tmp, a, a              \
	VPROLD     $shift, a, a           \
	VPXORQ     c, ones, tmp           \
	VPADDD     b, a, a

// func block16(state *..., base ..., ptrs *..., mask uint64, n int)
// (flag 4 = NOSPLIT; 40 bytes of arguments, no locals)
//
//   state+0(FP) - interleaved digest: 4 words (a,b,c,d), each stored as
//                 16 lanes x 4 bytes = 64 bytes (offsets 0x00/0x40/0x80/0xc0)
//   base+8(FP)  - common base address for the 16 source buffers
//   ptrs+16(FP) - pointer to 16 x 32-bit per-lane offsets relative to base
//   mask+24(FP) - lane mask for the gathers (loaded into K1)
//   n+32(FP)    - number of bytes to process; assumed to be a positive
//                 multiple of 64 — the loop only terminates when the count
//                 reaches exactly zero (TODO confirm caller guarantees this)
TEXT ·block16(SB), 4, $0-40

	MOVQ  state+0(FP), BX
	MOVQ  base+8(FP), SI
	MOVQ  ptrs+16(FP), AX
	KMOVQ mask+24(FP), K1
	MOVQ  n+32(FP), DX
	MOVQ  ·avx512md5consts+0(SB), DI  // table of 64 round constants, each pre-broadcast to 16 lanes (64 bytes apart)

// MD5 working state, one stream per dword lane
#define a Z0
#define b Z1
#define c Z2
#define d Z3

// saved copy of (a,b,c,d) for the end-of-block addition
#define sa Z4
#define sb Z5
#define sc Z6
#define sd Z7

#define tmp Z8
#define tmp2 Z9
#define ptrs Z10
#define ones Z12
#define mem Z15

#define kmask K1
#define ktmp K3

// ----------------------------------------------------------
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------

#define dig BX
#define count DX
#define base SI
#define consts DI

	// load digest into state registers
	VMOVUPD (dig), a
	VMOVUPD 0x40(dig), b
	VMOVUPD 0x80(dig), c
	VMOVUPD 0xc0(dig), d

	// load source pointers
	VMOVUPD 0x00(AX), ptrs

	// ones = all-1s in every lane; used to compute bitwise NOT via XOR in round 4
	MOVQ         $-1, AX
	VPBROADCASTQ AX, ones

loop:
	// snapshot the incoming state; it is added back after the 64 steps
	VMOVAPD a, sa
	VMOVAPD b, sb
	VMOVAPD c, sc
	VMOVAPD d, sd

	// kick off the gather of message word 0 and seed the round-1 tmp
	// invariant (tmp = d, so the first VPXORQ makes it c^d)
	prep(0)
	VMOVAPD d, tmp
	VMOVAPD mem, Z16

	// Round 1: words 0..15 in order; each step also gathers the next word
	// and caches the one just consumed into Z16..Z31 for rounds 2-4.
	ROUND1(a,b,c,d, 1,0x00, 7)
	VMOVAPD mem, Z17
	ROUND1(d,a,b,c, 2,0x01,12)
	VMOVAPD mem, Z18
	ROUND1(c,d,a,b, 3,0x02,17)
	VMOVAPD mem, Z19
	ROUND1(b,c,d,a, 4,0x03,22)
	VMOVAPD mem, Z20
	ROUND1(a,b,c,d, 5,0x04, 7)
	VMOVAPD mem, Z21
	ROUND1(d,a,b,c, 6,0x05,12)
	VMOVAPD mem, Z22
	ROUND1(c,d,a,b, 7,0x06,17)
	VMOVAPD mem, Z23
	ROUND1(b,c,d,a, 8,0x07,22)
	VMOVAPD mem, Z24
	ROUND1(a,b,c,d, 9,0x08, 7)
	VMOVAPD mem, Z25
	ROUND1(d,a,b,c,10,0x09,12)
	VMOVAPD mem, Z26
	ROUND1(c,d,a,b,11,0x0a,17)
	VMOVAPD mem, Z27
	ROUND1(b,c,d,a,12,0x0b,22)
	VMOVAPD mem, Z28
	ROUND1(a,b,c,d,13,0x0c, 7)
	VMOVAPD mem, Z29
	ROUND1(d,a,b,c,14,0x0d,12)
	VMOVAPD mem, Z30
	ROUND1(c,d,a,b,15,0x0e,17)
	VMOVAPD mem, Z31

	ROUND1noload(b,c,d,a, 0x0f,22)

	// Round 2: seed tmp = tmp2 = d (the ROUND2 invariant), then use the
	// cached words in the MD5 round-2 order (word index (1+5i) mod 16).
	VMOVAPD d, tmp
	VMOVAPD d, tmp2

	ROUND2(a,b,c,d, Z17,0x10, 5)
	ROUND2(d,a,b,c, Z22,0x11, 9)
	ROUND2(c,d,a,b, Z27,0x12,14)
	ROUND2(b,c,d,a, Z16,0x13,20)
	ROUND2(a,b,c,d, Z21,0x14, 5)
	ROUND2(d,a,b,c, Z26,0x15, 9)
	ROUND2(c,d,a,b, Z31,0x16,14)
	ROUND2(b,c,d,a, Z20,0x17,20)
	ROUND2(a,b,c,d, Z25,0x18, 5)
	ROUND2(d,a,b,c, Z30,0x19, 9)
	ROUND2(c,d,a,b, Z19,0x1a,14)
	ROUND2(b,c,d,a, Z24,0x1b,20)
	ROUND2(a,b,c,d, Z29,0x1c, 5)
	ROUND2(d,a,b,c, Z18,0x1d, 9)
	ROUND2(c,d,a,b, Z23,0x1e,14)
	ROUND2(b,c,d,a, Z28,0x1f,20)

	// Round 3: seed tmp = c (the ROUND3 invariant); word order (5+3i) mod 16.
	VMOVAPD c, tmp

	ROUND3(a,b,c,d, Z21,0x20, 4)
	ROUND3(d,a,b,c, Z24,0x21,11)
	ROUND3(c,d,a,b, Z27,0x22,16)
	ROUND3(b,c,d,a, Z30,0x23,23)
	ROUND3(a,b,c,d, Z17,0x24, 4)
	ROUND3(d,a,b,c, Z20,0x25,11)
	ROUND3(c,d,a,b, Z23,0x26,16)
	ROUND3(b,c,d,a, Z26,0x27,23)
	ROUND3(a,b,c,d, Z29,0x28, 4)
	ROUND3(d,a,b,c, Z16,0x29,11)
	ROUND3(c,d,a,b, Z19,0x2a,16)
	ROUND3(b,c,d,a, Z22,0x2b,23)
	ROUND3(a,b,c,d, Z25,0x2c, 4)
	ROUND3(d,a,b,c, Z28,0x2d,11)
	ROUND3(c,d,a,b, Z31,0x2e,16)
	ROUND3(b,c,d,a, Z18,0x2f,23)

	// Round 4: seed tmp = ~d (the ROUND4 invariant); word order (7i) mod 16.
	VPXORQ d, ones, tmp

	ROUND4(a,b,c,d, Z16,0x30, 6)
	ROUND4(d,a,b,c, Z23,0x31,10)
	ROUND4(c,d,a,b, Z30,0x32,15)
	ROUND4(b,c,d,a, Z21,0x33,21)
	ROUND4(a,b,c,d, Z28,0x34, 6)
	ROUND4(d,a,b,c, Z19,0x35,10)
	ROUND4(c,d,a,b, Z26,0x36,15)
	ROUND4(b,c,d,a, Z17,0x37,21)
	ROUND4(a,b,c,d, Z24,0x38, 6)
	ROUND4(d,a,b,c, Z31,0x39,10)
	ROUND4(c,d,a,b, Z22,0x3a,15)
	ROUND4(b,c,d,a, Z29,0x3b,21)
	ROUND4(a,b,c,d, Z20,0x3c, 6)
	ROUND4(d,a,b,c, Z27,0x3d,10)
	ROUND4(c,d,a,b, Z18,0x3e,15)
	ROUND4(b,c,d,a, Z25,0x3f,21)

	// add the block's result back into the saved state (MD5 Davies-Meyer step)
	VPADDD sa, a, a
	VPADDD sb, b, b
	VPADDD sc, c, c
	VPADDD sd, d, d

	// advance all 16 streams by one 64-byte block and loop until count hits 0
	LEAQ 64(base), base
	SUBQ $64, count
	JNE  loop

	// store the updated digests back in interleaved form
	VMOVUPD a, (dig)
	VMOVUPD b, 0x40(dig)
	VMOVUPD c, 0x80(dig)
	VMOVUPD d, 0xc0(dig)

	VZEROUPPER
	RET