block8_amd64.s (7010B)
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
TEXT ·block8(SB), 4, $0-40
    MOVQ state+0(FP), BX
    MOVQ base+8(FP), SI
    MOVQ bufs+16(FP), AX
    MOVQ cache+24(FP), CX
    MOVQ n+32(FP), DX
    MOVQ ·avx256md5consts+0(SB), DI

    // Align cache (which is stack allocated by the compiler)
    // to a 256 bit boundary (ymm register alignment)
    // The cache8 type is deliberately oversized to permit this.
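    // (Equivalent to CX = (CX + 31) &^ 31: adding 31 and then clearing the
    // low five bits rounds the pointer up to the next 32-byte multiple, and
    // masking only the low byte (CL) suffices because alignment depends
    // solely on those low bits.)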
    ADDQ $31, CX
    ANDB $-32, CL

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp Y8
#define tmp2 Y9

#define mask Y10
#define off Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig BX
#define cache CX
#define count DX
#define base SI
#define consts DI

#define prepmask \
    VPXOR mask, mask, mask \
    VPCMPGTD mask, off, mask

#define prep(index) \
    VMOVAPD mask, rtmp2 \
    VPGATHERDD rtmp2, index*4(base)(off*1), mem

#define load(index) \
    VMOVAPD index*32(cache), mem

#define store(index) \
    VMOVAPD mem, index*32(cache)

#define roll(shift, a) \
    VPSLLD $shift, a, rtmp1 \
    VPSRLD $32-shift, a, a \
    VPOR rtmp1, a, a

#define ROUND1(a, b, c, d, index, const, shift) \
    VPXOR c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    prep(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND1load(a, b, c, d, index, const, shift) \
    VXORPD c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    load(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND2(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp2, tmp2 \
    VANDNPD c, tmp, tmp \
    load(index) \
    VPOR tmp, tmp2, tmp2 \
    VMOVAPD c, tmp \
    VPADDD tmp2, a, a \
    VMOVAPD c, tmp2 \
    roll(shift,a) \
    VPADDD b, a, a

#define ROUND3(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    load(index) \
    VPXOR d, tmp, tmp \
    VPXOR b, tmp, tmp \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD b, tmp \
    VPADDD b, a, a

#define ROUND4(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPOR b, tmp, tmp \
    VPXOR c, tmp, tmp \
    VPADDD tmp, a, a \
    load(index) \
    roll(shift,a) \
    VPXOR c, ones, tmp \
    VPADDD b, a, a

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 32(dig), b
    VMOVUPD 64(dig), c
    VMOVUPD 96(dig), d

    // load source buffer offsets
    VMOVUPD (AX), off

    prepmask
    VPCMPEQD ones, ones, ones

loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

    prep(0)
    VMOVAPD d, tmp
    store(0)

    ROUND1(a,b,c,d, 1,0x00, 7)
    store(1)
    ROUND1(d,a,b,c, 2,0x01,12)
    store(2)
    ROUND1(c,d,a,b, 3,0x02,17)
    store(3)
    ROUND1(b,c,d,a, 4,0x03,22)
    store(4)
    ROUND1(a,b,c,d, 5,0x04, 7)
    store(5)
    ROUND1(d,a,b,c, 6,0x05,12)
    store(6)
    ROUND1(c,d,a,b, 7,0x06,17)
    store(7)
    ROUND1(b,c,d,a, 8,0x07,22)
    store(8)
    ROUND1(a,b,c,d, 9,0x08, 7)
    store(9)
    ROUND1(d,a,b,c,10,0x09,12)
    store(10)
    ROUND1(c,d,a,b,11,0x0a,17)
    store(11)
    ROUND1(b,c,d,a,12,0x0b,22)
    store(12)
    ROUND1(a,b,c,d,13,0x0c, 7)
    store(13)
    ROUND1(d,a,b,c,14,0x0d,12)
    store(14)
    ROUND1(c,d,a,b,15,0x0e,17)
    store(15)
    ROUND1load(b,c,d,a, 1,0x0f,22)

    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, 6,0x10, 5)
    ROUND2(d,a,b,c,11,0x11, 9)
    ROUND2(c,d,a,b, 0,0x12,14)
    ROUND2(b,c,d,a, 5,0x13,20)
    ROUND2(a,b,c,d,10,0x14, 5)
    ROUND2(d,a,b,c,15,0x15, 9)
    ROUND2(c,d,a,b, 4,0x16,14)
    ROUND2(b,c,d,a, 9,0x17,20)
    ROUND2(a,b,c,d,14,0x18, 5)
    ROUND2(d,a,b,c, 3,0x19, 9)
    ROUND2(c,d,a,b, 8,0x1a,14)
    ROUND2(b,c,d,a,13,0x1b,20)
    ROUND2(a,b,c,d, 2,0x1c, 5)
    ROUND2(d,a,b,c, 7,0x1d, 9)
    ROUND2(c,d,a,b,12,0x1e,14)
    ROUND2(b,c,d,a, 0,0x1f,20)

    load(5)
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, 8,0x20, 4)
    ROUND3(d,a,b,c,11,0x21,11)
    ROUND3(c,d,a,b,14,0x22,16)
    ROUND3(b,c,d,a, 1,0x23,23)
    ROUND3(a,b,c,d, 4,0x24, 4)
    ROUND3(d,a,b,c, 7,0x25,11)
    ROUND3(c,d,a,b,10,0x26,16)
    ROUND3(b,c,d,a,13,0x27,23)
    ROUND3(a,b,c,d, 0,0x28, 4)
    ROUND3(d,a,b,c, 3,0x29,11)
    ROUND3(c,d,a,b, 6,0x2a,16)
    ROUND3(b,c,d,a, 9,0x2b,23)
    ROUND3(a,b,c,d,12,0x2c, 4)
    ROUND3(d,a,b,c,15,0x2d,11)
    ROUND3(c,d,a,b, 2,0x2e,16)
    ROUND3(b,c,d,a, 0,0x2f,23)

    load(0)
    VPXOR d, ones, tmp

    ROUND4(a,b,c,d, 7,0x30, 6)
    ROUND4(d,a,b,c,14,0x31,10)
    ROUND4(c,d,a,b, 5,0x32,15)
    ROUND4(b,c,d,a,12,0x33,21)
    ROUND4(a,b,c,d, 3,0x34, 6)
    ROUND4(d,a,b,c,10,0x35,10)
    ROUND4(c,d,a,b, 1,0x36,15)
    ROUND4(b,c,d,a, 8,0x37,21)
    ROUND4(a,b,c,d,15,0x38, 6)
    ROUND4(d,a,b,c, 6,0x39,10)
    ROUND4(c,d,a,b,13,0x3a,15)
    ROUND4(b,c,d,a, 4,0x3b,21)
    ROUND4(a,b,c,d,11,0x3c, 6)
    ROUND4(d,a,b,c, 2,0x3d,10)
    ROUND4(c,d,a,b, 9,0x3e,15)
    ROUND4(b,c,d,a, 0,0x3f,21)

    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

    LEAQ 64(base), base
    SUBQ $64, count
    JNE loop

    VMOVUPD a, (dig)
    VMOVUPD b, 32(dig)
    VMOVUPD c, 64(dig)
    VMOVUPD d, 96(dig)

    VZEROUPPER
    RET
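
// Notes on the round macros above: ROUND1 computes the MD5 F round,
// d ^ (b & (c ^ d)); ROUND2 computes G, (d & b) | (~d & c); ROUND3 computes
// H, b ^ c ^ d; and ROUND4 computes I, c ^ (b | ~d). Each macro operates on
// eight 32-bit lanes at once: the message words of the eight input buffers
// are gathered with VPGATHERDD on the first pass and replayed from the
// aligned cache in the later rounds.
//
// The Go-side declaration for this routine lives elsewhere in the package.
// As a minimal sketch (an assumption for illustration, not necessarily the
// package's exact wording), an assembly body like this one is normally
// paired with a stub such as:
//
//     //go:noescape
//     func block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
//
// where state points at the eight digests' a, b, c and d words laid out as
// four consecutive 8-lane vectors (loaded and stored at (dig) through
// 96(dig)), bufs holds the eight per-buffer offsets relative to base that
// feed the gathers, and n is the number of bytes to process, a multiple of
// the 64-byte MD5 block size as implied by the SUBQ $64 loop above.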