blake2b_amd64.s (8585B)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"

// iv0..iv3 hold the eight 64-bit BLAKE2b initialization-vector words, two
// words per 16-byte constant (iv0 = IV[0..1], ..., iv3 = IV[6..7]). They seed
// the lower half of the working state (v8..v15) for every block.
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16

DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16

DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16

DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16

// c40 is a PSHUFB byte-selection mask: destination byte i of each 64-bit lane
// takes source byte (i+3) mod 8, i.e. each lane is rotated right by 24 bits.
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

// c48 is a PSHUFB byte-selection mask: destination byte i of each 64-bit lane
// takes source byte (i+2) mod 8, i.e. each lane is rotated right by 16 bits.
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

// SHUFFLE re-arranges state rows so that the next HALF_ROUND works on the
// diagonals instead of the columns. Each row is a pair of XMM registers
// holding four 64-bit words:
//   (v2, v3): [b0 b1 b2 b3] -> [b1 b2 b3 b0]   (rotate by one lane)
//   (v4, v5): [c0 c1 c2 c3] -> [c2 c3 c0 c1]   (swap the two registers)
//   (v6, v7): [d0 d1 d2 d3] -> [d3 d0 d1 d2]   (rotate by one lane, other way)
// t1 and t2 are scratch registers and are clobbered. Only the high half of t2
// is ever consumed, so its incoming value does not matter.
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

// SHUFFLE_INV undoes SHUFFLE: it is the same instruction sequence with the
// roles of the (v2, v3) and (v6, v7) rows exchanged, which restores the
// column layout after a diagonal HALF_ROUND. t1 and t2 are clobbered.
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

// HALF_ROUND applies the BLAKE2b mixing function to four lanes at once.
//   v0..v7  working state, two 64-bit words per register
//           (rows a = v0,v1; b = v2,v3; c = v4,v5; d = v6,v7)
//   m0, m1  message words added in the first half of the mix
//   m2, m3  message words added in the second half of the mix
//   t0      scratch register, clobbered
//   c40/c48 registers holding the PSHUFB rotation masks
// Steps per lane: a += m + b; d = (d ^ a) >>> 32 (PSHUFD $0xB1);
// c += d; b = (b ^ c) >>> 24 (c40); a += m + b; d = (d ^ a) >>> 16 (c48);
// c += d; b = (b ^ c) >>> 63, computed as (b + b) ^ (b >> 63).
// t0 may alias m3: m3 is read for the last time before t0 is first written.
// m0..m3 may be memory operands; they are used as PADDQ sources, which is why
// the caller keeps its spill area 16-byte aligned.
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	PADDQ  m0, v0;        \
	PADDQ  m1, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFD $0xB1, v6, v6; \
	PSHUFD $0xB1, v7, v7; \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	PSHUFB c40, v2;       \
	PSHUFB c40, v3;       \
	PADDQ  m2, v0;        \
	PADDQ  m3, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFB c48, v6;       \
	PSHUFB c48, v7;       \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	MOVOU  v2, t0;        \
	PADDQ  v2, t0;        \
	PSRLQ  $63, v2;       \
	PXOR   t0, v2;        \
	MOVOU  v3, t0;        \
	PADDQ  v3, t0;        \
	PSRLQ  $63, v3;       \
	PXOR   t0, v3

// LOAD_MSG gathers eight 64-bit message words from the block at src into four
// registers: m0 = (src[i0], src[i1]), m1 = (src[i2], src[i3]),
// m2 = (src[i4], src[i5]), m3 = (src[i6], src[i7]). Indices are word indices
// (scaled by 8 here); the first listed word lands in the low lane.
#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
	MOVQ   i0*8(src), m0;     \
	PINSRQ $1, i1*8(src), m0; \
	MOVQ   i2*8(src), m1;     \
	PINSRQ $1, i3*8(src), m1; \
	MOVQ   i4*8(src), m2;     \
	PINSRQ $1, i5*8(src), m2; \
	MOVQ   i6*8(src), m3;     \
	PINSRQ $1, i7*8(src), m3

// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
// hashBlocksSSE4 compresses every 128-byte block of blocks into the chaining
// value h, advancing the 128-bit byte counter c by 128 per block, and stores
// the updated h and c back through their pointers. flag is XORed into the low
// word of iv3 when the working state is initialized.
//
// Precondition: len(blocks) must be a non-zero multiple of 128. The loop is
// bottom-tested (SUBQ $128 / JNE), so it has no guard for an empty or
// partial-block slice.
//
// Register use:
//   AX = h, BX = c, CX = flag, SI = current block, DI = bytes remaining
//   R8:R9    = counter (low:high)
//   R10      = 16-byte aligned scratch area inside the frame
//   X12, X15 = h[0..1], h[2..3], kept in registers across blocks
//   X13, X14 = c40, c48 rotation masks
//   X0..X7   = working state v0..v15; X8..X11 = message words / scratch
//
// Scratch layout (offsets from R10):
//     0       iv3 ^ (flag || 0)
//    16.. 79  message words of round 0, first half  (reused by round 10)
//    80..143  message words of round 0, second half (reused by round 10)
//   144..207  message words of round 1, first half  (reused by round 11)
//   208..271  message words of round 1, second half (reused by round 11)
TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// Round SP up to the next 16-byte boundary so MOVO and memory-operand
	// PADDQ can be used on the scratch area.
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	MOVOU ·iv3<>(SB), X0
	MOVO  X0, 0(R10)
	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)

	MOVOU ·c40<>(SB), X13
	MOVOU ·c48<>(SB), X14

	MOVOU 0(AX), X12
	MOVOU 16(AX), X15

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	// 128-bit counter += 128. The low word wrapped iff it is now below 128 as
	// an UNSIGNED value, so the carry test must use JCC (unsigned >=). A
	// signed JGE would treat any low word >= 2^63 as negative and bump the
	// high word on every block.
	ADDQ $128, R8
	CMPQ R8, $128
	JCC  noinc
	INCQ R9

noinc:
	MOVQ   R8, X8
	PINSRQ $1, R9, X8    // X8 = (counter low, counter high)

	// Initialize the working state: v0..v7 = h, v8..v15 = IV, with the
	// counter folded into v12..v13 and the flag into v14.
	MOVO  X12, X0
	MOVO  X15, X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·iv0<>(SB), X4
	MOVOU ·iv1<>(SB), X5
	MOVOU ·iv2<>(SB), X6

	PXOR X8, X6
	MOVO 0(R10), X7

	// Round 0. The gathered message words are also spilled for round 10.
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
	MOVO X8, 16(R10)
	MOVO X9, 32(R10)
	MOVO X10, 48(R10)
	MOVO X11, 64(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
	MOVO X8, 80(R10)
	MOVO X9, 96(R10)
	MOVO X10, 112(R10)
	MOVO X11, 128(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 1. The gathered message words are also spilled for round 11.
	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
	MOVO X8, 144(R10)
	MOVO X9, 160(R10)
	MOVO X10, 176(R10)
	MOVO X11, 192(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
	MOVO X8, 208(R10)
	MOVO X9, 224(R10)
	MOVO X10, 240(R10)
	MOVO X11, 256(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 2.
	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 3.
	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 4.
	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 5.
	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 6.
	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 7.
	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 8.
	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 9.
	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 10: same message schedule as round 0, read back from the stack.
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 11: same message schedule as round 1, read back from the stack.
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Feed-forward: h[i] ^= v[i] ^ v[i+8]. h[0..3] stay in X12/X15; h[4..7]
	// are reloaded from and written back to memory on every block.
	MOVOU 32(AX), X10
	MOVOU 48(AX), X11
	PXOR  X0, X12
	PXOR  X1, X15
	PXOR  X2, X10
	PXOR  X3, X11
	PXOR  X4, X12
	PXOR  X5, X15
	PXOR  X6, X10
	PXOR  X7, X11
	MOVOU X10, 32(AX)
	MOVOU X11, 48(AX)

	// Advance to the next block; loop until no bytes remain.
	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	// Write back the part of h kept in registers and the updated counter.
	MOVOU X12, 0(AX)
	MOVOU X15, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	RET