blake2bAVX2_amd64.s (24279B)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.7 && amd64 && gc && !purego
// +build go1.7,amd64,gc,!purego

#include "textflag.h"

DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

// ROUND_AVX2 performs one full BLAKE2b round on the state held one row per
// register in Y0-Y3. The 64-bit rotations are built from cheaper operations:
// VPSHUFD $-79 rotates each lane right by 32, VPSHUFB with c40/c48 rotates
// right by 24/16, and the add/shift/xor sequence rotates right by 63.
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ m0, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m1, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y1_Y1; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y3_Y3; \
	VPADDQ m2, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m3, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y3_Y3; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y1_Y1

#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X13(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X14(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X15(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X11(6*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8); \
	VPSHUFD $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8); \
	VMOVDQU 11*8(SI), X12; \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	VMOVQ_SI_X13(8*8); \
	VMOVQ_SI_X11(2*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X11(13*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
	VMOVQ_SI_X15(6*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X15(10*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X13(7*8); \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0; \
	VPSHUFD $0x4E, 8*8(SI), X11; \
	VPINSRQ_1_SI_X14(6*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
	VMOVQ_SI_X15_0; \
	VMOVQ_SI_X11(6*8); \
	VPINSRQ_1_SI_X15(4*8); \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8); \
	VMOVQ_SI_X11(11*8); \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8); \
	VMOVDQU 12*8(SI), X14; \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	VMOVQ_SI_X15(2*8); \
	VMOVDQU 4*8(SI), X11; \
	VPINSRQ_1_SI_X15(7*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
	VMOVQ_SI_X13(2*8); \
	VPSHUFD $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8); \
	VMOVQ_SI_X11(12*8); \
	VPINSRQ_1_SI_X15(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y15, Y15

// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// DX = 32-byte aligned scratch area on the stack for VMOVDQA spills.
	MOVQ SP, DX
	ADDQ $31, DX
	ANDQ $~31, DX

	MOVQ CX, 16(DX)
	XORQ CX, CX
	MOVQ CX, 24(DX)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9
	MOVQ R9, 8(DX)

loop:
	// Increment the 128-bit message counter by one 128-byte block.
	ADDQ $128, R8
	MOVQ R8, 0(DX)
	CMPQ R8, $128
	JGE  noinc
	INCQ R9
	MOVQ R9, 8(DX)

noinc:
	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR   0(DX), Y7, Y3

	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	VMOVDQA Y12, 32(DX)
	VMOVDQA Y13, 64(DX)
	VMOVDQA Y14, 96(DX)
	VMOVDQA Y15, 128(DX)
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	VMOVDQA Y12, 160(DX)
	VMOVDQA Y13, 192(DX)
	VMOVDQA Y14, 224(DX)
	VMOVDQA Y15, 256(DX)

	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)

	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	RET

#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

#define SHUFFLE_AVX() \
	VMOVDQA X6, X13; \
	VMOVDQA X2, X14; \
	VMOVDQA X4, X6; \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4; \
	VMOVDQA X6, X5; \
	VPUNPCKHQDQ_X15_X7_X6; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X13_X7; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VPUNPCKHQDQ_X15_X2_X2; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3; \

#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13; \
	VMOVDQA X4, X14; \
	VPUNPCKLQDQ_X2_X2_X15; \
	VMOVDQA X5, X4; \
	VPUNPCKHQDQ_X15_X3_X2; \
	VMOVDQA X14, X5; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VMOVDQA X6, X14; \
	VPUNPCKHQDQ_X15_X13_X3; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X6_X6; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7; \

#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ m0, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m1, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPSHUFB c40, v2, v2; \
	VPSHUFB c40, v3, v3; \
	VPADDQ m2, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m3, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFB c48, v6, v6; \
	VPSHUFB c48, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPADDQ v2, v2, t0; \
	VPSRLQ $63, v2, v2; \
	VPXOR t0, v2, v2; \
	VPADDQ v3, v3, t0; \
	VPSRLQ $63, v3, v3; \
	VPXOR t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X13(i2*8); \
	VMOVQ_SI_X14(i4*8); \
	VMOVQ_SI_X15(i6*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(1*8); \
	VMOVQ_SI_X15(5*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8); \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(7*8); \
	VPINSRQ_1_SI_X13(5*8); \
	VPINSRQ_1_SI_X14(2*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12; \
	VMOVQ_SI_X13(5*8); \
	VMOVQ_SI_X14(8*8); \
	VMOVQ_SI_X15(2*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0; \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(6*8); \
	VMOVQ_SI_X15_0; \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8); \
	VMOVQ_SI_X13(2*8); \
	VMOVQ_SI_X14_0; \
	VMOVQ_SI_X15(4*8); \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(11*8); \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X13(8*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ 0*8(SI), X12; \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ 7*8(SI), X14; \
	MOVQ 2*8(SI), X15; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12; \
	MOVQ 11*8(SI), X13; \
	MOVQ 15*8(SI), X14; \
	MOVQ 3*8(SI), X15; \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X14(9*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12; \
	MOVQ 8*8(SI), X13; \
	MOVQ 0*8(SI), X14; \
	MOVQ 6*8(SI), X15; \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8); \
	VPINSRQ_1_SI_X14(4*8); \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12; \
	MOVQ 1*8(SI), X13; \
	MOVQ 2*8(SI), X14; \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12; \
	MOVQ 3*8(SI), X13; \
	MOVQ 11*8(SI), X14; \
	MOVQ 12*8(SI), X15; \
	VPINSRQ_1_SI_X12(9*8); \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// R10 = 16-byte aligned scratch area on the stack for VMOVDQA spills.
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(R10)
	XORQ    CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR   X15, X6, X6
	VMOVDQA 0(R10), X7

	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	VMOVDQA X12, 16(R10)
	VMOVDQA X13, 32(R10)
	VMOVDQA X14, 48(R10)
	VMOVDQA X15, 64(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	VMOVDQA X12, 80(R10)
	VMOVDQA X13, 96(R10)
	VMOVDQA X14, 112(R10)
	VMOVDQA X15, 128(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
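	// SHUFFLE_AVX_INV below undoes the row rotation applied by SHUFFLE_AVX,
	// completing one full BLAKE2b round (column half plus diagonal half).
	// The message vectors spilled to 16(R10)..256(R10) above are reloaded
	// for the last two rounds at the end of the loop body.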
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	VMOVDQA X12, 144(R10)
	VMOVDQA X13, 160(R10)
	VMOVDQA X14, 176(R10)
	VMOVDQA X15, 192(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	VMOVDQA X12, 208(R10)
	VMOVDQA X13, 224(R10)
	VMOVDQA X14, 240(R10)
	VMOVDQA X15, 256(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
	SHUFFLE_AVX_INV()

	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR   X0, X10, X10
	VPXOR   X1, X11, X11
	VPXOR   X2, X14, X14
	VPXOR   X3, X15, X15
	VPXOR   X4, X10, X10
	VPXOR   X5, X11, X11
	VPXOR   X6, X14, X2
	VPXOR   X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	VZEROUPPER

	RET
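
// For reference, a minimal sketch of how the Go side might declare and
// dispatch to these routines. Only the function signatures are taken from the
// comments above; the wrapper name, the use of golang.org/x/sys/cpu for
// feature detection, and the pure-Go fallback hashBlocksGeneric are
// illustrative assumptions:
//
//	import "golang.org/x/sys/cpu"
//
//	//go:noescape
//	func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
//	//go:noescape
//	func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
//	func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
//		switch {
//		case cpu.X86.HasAVX2:
//			hashBlocksAVX2(h, c, flag, blocks)
//		case cpu.X86.HasAVX:
//			hashBlocksAVX(h, c, flag, blocks)
//		default:
//			hashBlocksGeneric(h, c, flag, blocks) // assumed pure-Go fallback
//		}
//	}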