gtsocial-umbx

blake2bAVX2_amd64.s (24279B)


      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:build go1.7 && amd64 && gc && !purego
      6 // +build go1.7,amd64,gc,!purego
      7 
      8 #include "textflag.h"
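
// This file implements the BLAKE2b compression function for amd64 in two
// flavours: hashBlocksAVX2 (256-bit registers) and hashBlocksAVX (128-bit
// registers). Each call consumes whole 128-byte message blocks, updating the
// hash state h and the 128-bit byte counter c.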
      9 
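// ·AVX2_iv0/·AVX2_iv1 (and the 128-bit ·AVX_iv0..·AVX_iv3 copies below) hold
// the eight BLAKE2b IV words. ·AVX2_c40/·AVX2_c48 and ·AVX_c40/·AVX_c48 are
// VPSHUFB byte-shuffle masks that rotate every 64-bit lane right by 24 and
// 16 bits (i.e. left by 40 and 48, hence the names).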
     10 DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
     11 DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
     12 DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
     13 DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
     14 GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
     15 
     16 DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
     17 DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
     18 DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
     19 DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
     20 GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
     21 
     22 DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
     23 DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
     24 DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
     25 DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
     26 GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
     27 
     28 DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
     29 DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
     30 DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
     31 DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
     32 GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
     33 
     34 DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
     35 DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
     36 GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
     37 
     38 DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
     39 DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
     40 GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
     41 
     42 DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
     43 DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
     44 GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
     45 
     46 DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
     47 DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
     48 GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
     49 
     50 DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
     51 DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
     52 GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
     53 
     54 DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
     55 DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
     56 GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
     57 
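// Hand-assembled VPERMQ ymm, ymm, imm8 encodings, presumably because the
// assembler targeted by the go1.7 build tag did not accept the mnemonic.
// The immediates 0x39, 0x4E and 0x93 rotate the four 64-bit lanes of a Y
// register by one, two and three positions; ROUND_AVX2 uses them to move the
// state between column and diagonal form.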
     58 #define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
     59 #define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
     60 #define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
     61 #define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
     62 #define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
     63 
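// ROUND_AVX2 performs one BLAKE2b round on the state held one row per
// register in Y0..Y3: G is applied to the four columns, rows b/c/d are
// rotated across lanes with VPERMQ so the diagonals line up, G is applied
// again, and the rotation is undone. The 32-, 24-, 16- and 63-bit rotations
// use VPSHUFD $-79, VPSHUFB c40, VPSHUFB c48 and the add/shift/xor sequence.
// As a scalar sketch (hypothetical Go, using math/bits), one G evaluation is:
//	a += b + m0; d = bits.RotateLeft64(d^a, -32); c += d; b = bits.RotateLeft64(b^c, -24)
//	a += b + m1; d = bits.RotateLeft64(d^a, -16); c += d; b = bits.RotateLeft64(b^c, -63)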
     64 #define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
     65 	VPADDQ  m0, Y0, Y0;   \
     66 	VPADDQ  Y1, Y0, Y0;   \
     67 	VPXOR   Y0, Y3, Y3;   \
     68 	VPSHUFD $-79, Y3, Y3; \
     69 	VPADDQ  Y3, Y2, Y2;   \
     70 	VPXOR   Y2, Y1, Y1;   \
     71 	VPSHUFB c40, Y1, Y1;  \
     72 	VPADDQ  m1, Y0, Y0;   \
     73 	VPADDQ  Y1, Y0, Y0;   \
     74 	VPXOR   Y0, Y3, Y3;   \
     75 	VPSHUFB c48, Y3, Y3;  \
     76 	VPADDQ  Y3, Y2, Y2;   \
     77 	VPXOR   Y2, Y1, Y1;   \
     78 	VPADDQ  Y1, Y1, t;    \
     79 	VPSRLQ  $63, Y1, Y1;  \
     80 	VPXOR   t, Y1, Y1;    \
     81 	VPERMQ_0x39_Y1_Y1;    \
     82 	VPERMQ_0x4E_Y2_Y2;    \
     83 	VPERMQ_0x93_Y3_Y3;    \
     84 	VPADDQ  m2, Y0, Y0;   \
     85 	VPADDQ  Y1, Y0, Y0;   \
     86 	VPXOR   Y0, Y3, Y3;   \
     87 	VPSHUFD $-79, Y3, Y3; \
     88 	VPADDQ  Y3, Y2, Y2;   \
     89 	VPXOR   Y2, Y1, Y1;   \
     90 	VPSHUFB c40, Y1, Y1;  \
     91 	VPADDQ  m3, Y0, Y0;   \
     92 	VPADDQ  Y1, Y0, Y0;   \
     93 	VPXOR   Y0, Y3, Y3;   \
     94 	VPSHUFB c48, Y3, Y3;  \
     95 	VPADDQ  Y3, Y2, Y2;   \
     96 	VPXOR   Y2, Y1, Y1;   \
     97 	VPADDQ  Y1, Y1, t;    \
     98 	VPSRLQ  $63, Y1, Y1;  \
     99 	VPXOR   t, Y1, Y1;    \
    100 	VPERMQ_0x39_Y3_Y3;    \
    101 	VPERMQ_0x4E_Y2_Y2;    \
    102 	VPERMQ_0x93_Y1_Y1
    103 
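// Hand-encoded loads: VMOVQ off(SI), Xn and VPINSRQ $1, off(SI), Xn, Xn for
// X11..X15, plus VMOVQ R8, X15 / VPINSRQ $1, R9, X15, X15 used for the block
// counter. The *_0 forms encode a zero offset (no displacement byte); the
// parameterized forms take an 8-bit displacement, and the *_0 forms are used
// instead whenever a message index is 0.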
    104 #define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
    105 #define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
    106 #define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
    107 #define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
    108 #define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
    109 
    110 #define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
    111 #define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
    112 #define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
    113 #define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
    114 #define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
    115 
    116 #define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
    117 #define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
    118 #define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
    119 #define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
    120 #define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
    121 
    122 #define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
    123 #define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
    124 #define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
    125 #define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
    126 #define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
    127 
    128 #define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
    129 #define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
    130 
    131 // load msg: Y12 = (i0, i1, i2, i3)
    132 // i0, i1, i2, i3 must not be 0
    133 #define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
    134 	VMOVQ_SI_X12(i0*8);           \
    135 	VMOVQ_SI_X11(i2*8);           \
    136 	VPINSRQ_1_SI_X12(i1*8);       \
    137 	VPINSRQ_1_SI_X11(i3*8);       \
    138 	VINSERTI128 $1, X11, Y12, Y12
    139 
    140 // load msg: Y13 = (i0, i1, i2, i3)
    141 // i0, i1, i2, i3 must not be 0
    142 #define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
    143 	VMOVQ_SI_X13(i0*8);           \
    144 	VMOVQ_SI_X11(i2*8);           \
    145 	VPINSRQ_1_SI_X13(i1*8);       \
    146 	VPINSRQ_1_SI_X11(i3*8);       \
    147 	VINSERTI128 $1, X11, Y13, Y13
    148 
    149 // load msg: Y14 = (i0, i1, i2, i3)
    150 // i0, i1, i2, i3 must not be 0
    151 #define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
    152 	VMOVQ_SI_X14(i0*8);           \
    153 	VMOVQ_SI_X11(i2*8);           \
    154 	VPINSRQ_1_SI_X14(i1*8);       \
    155 	VPINSRQ_1_SI_X11(i3*8);       \
    156 	VINSERTI128 $1, X11, Y14, Y14
    157 
    158 // load msg: Y15 = (i0, i1, i2, i3)
    159 // i0, i1, i2, i3 must not be 0
    160 #define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
    161 	VMOVQ_SI_X15(i0*8);           \
    162 	VMOVQ_SI_X11(i2*8);           \
    163 	VPINSRQ_1_SI_X15(i1*8);       \
    164 	VPINSRQ_1_SI_X11(i3*8);       \
    165 	VINSERTI128 $1, X11, Y15, Y15
    166 
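// Each LOAD_MSG_AVX2_<...> macro below gathers the sixteen 64-bit message
// words of a block into Y12..Y15 in the order given by that round's sigma
// permutation (the indices appear in the macro name, four per register).
// Permutations containing index 0 or adjacent indices use the *_0 forms,
// VMOVDQU or VPSHUFD instead of the generic loaders above.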
    167 #define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
    168 	VMOVQ_SI_X12_0;                   \
    169 	VMOVQ_SI_X11(4*8);                \
    170 	VPINSRQ_1_SI_X12(2*8);            \
    171 	VPINSRQ_1_SI_X11(6*8);            \
    172 	VINSERTI128 $1, X11, Y12, Y12;    \
    173 	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
    174 	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
    175 	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
    176 
    177 #define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
    178 	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
    179 	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
    180 	VMOVQ_SI_X11(11*8);              \
    181 	VPSHUFD     $0x4E, 0*8(SI), X14; \
    182 	VPINSRQ_1_SI_X11(5*8);           \
    183 	VINSERTI128 $1, X11, Y14, Y14;   \
    184 	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
    185 
    186 #define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
    187 	VMOVQ_SI_X11(5*8);              \
    188 	VMOVDQU     11*8(SI), X12;      \
    189 	VPINSRQ_1_SI_X11(15*8);         \
    190 	VINSERTI128 $1, X11, Y12, Y12;  \
    191 	VMOVQ_SI_X13(8*8);              \
    192 	VMOVQ_SI_X11(2*8);              \
    193 	VPINSRQ_1_SI_X13_0;             \
    194 	VPINSRQ_1_SI_X11(13*8);         \
    195 	VINSERTI128 $1, X11, Y13, Y13;  \
    196 	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
    197 	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
    198 
    199 #define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
    200 	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
    201 	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
    202 	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
    203 	VMOVQ_SI_X15(6*8);               \
    204 	VMOVQ_SI_X11_0;                  \
    205 	VPINSRQ_1_SI_X15(10*8);          \
    206 	VPINSRQ_1_SI_X11(8*8);           \
    207 	VINSERTI128 $1, X11, Y15, Y15
    208 
    209 #define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
    210 	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
    211 	VMOVQ_SI_X13_0;                  \
    212 	VMOVQ_SI_X11(4*8);               \
    213 	VPINSRQ_1_SI_X13(7*8);           \
    214 	VPINSRQ_1_SI_X11(15*8);          \
    215 	VINSERTI128 $1, X11, Y13, Y13;   \
    216 	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
    217 	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
    218 
    219 #define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
    220 	VMOVQ_SI_X12(2*8);                \
    221 	VMOVQ_SI_X11_0;                   \
    222 	VPINSRQ_1_SI_X12(6*8);            \
    223 	VPINSRQ_1_SI_X11(8*8);            \
    224 	VINSERTI128 $1, X11, Y12, Y12;    \
    225 	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
    226 	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
    227 	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
    228 
    229 #define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
    230 	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
    231 	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
    232 	VMOVQ_SI_X14_0;                   \
    233 	VPSHUFD     $0x4E, 8*8(SI), X11;  \
    234 	VPINSRQ_1_SI_X14(6*8);            \
    235 	VINSERTI128 $1, X11, Y14, Y14;    \
    236 	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
    237 
    238 #define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
    239 	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
    240 	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
    241 	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
    242 	VMOVQ_SI_X15_0;                  \
    243 	VMOVQ_SI_X11(6*8);               \
    244 	VPINSRQ_1_SI_X15(4*8);           \
    245 	VPINSRQ_1_SI_X11(10*8);          \
    246 	VINSERTI128 $1, X11, Y15, Y15
    247 
    248 #define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
    249 	VMOVQ_SI_X12(6*8);              \
    250 	VMOVQ_SI_X11(11*8);             \
    251 	VPINSRQ_1_SI_X12(14*8);         \
    252 	VPINSRQ_1_SI_X11_0;             \
    253 	VINSERTI128 $1, X11, Y12, Y12;  \
    254 	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
    255 	VMOVQ_SI_X11(1*8);              \
    256 	VMOVDQU     12*8(SI), X14;      \
    257 	VPINSRQ_1_SI_X11(10*8);         \
    258 	VINSERTI128 $1, X11, Y14, Y14;  \
    259 	VMOVQ_SI_X15(2*8);              \
    260 	VMOVDQU     4*8(SI), X11;       \
    261 	VPINSRQ_1_SI_X15(7*8);          \
    262 	VINSERTI128 $1, X11, Y15, Y15
    263 
    264 #define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
    265 	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
    266 	VMOVQ_SI_X13(2*8);               \
    267 	VPSHUFD     $0x4E, 5*8(SI), X11; \
    268 	VPINSRQ_1_SI_X13(4*8);           \
    269 	VINSERTI128 $1, X11, Y13, Y13;   \
    270 	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
    271 	VMOVQ_SI_X15(11*8);              \
    272 	VMOVQ_SI_X11(12*8);              \
    273 	VPINSRQ_1_SI_X15(14*8);          \
    274 	VPINSRQ_1_SI_X11_0;              \
    275 	VINSERTI128 $1, X11, Y15, Y15
    276 
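// hashBlocksAVX2 compresses the 128-byte blocks in blocks into the hash
// state h, adding 128 to the counter c per block; flag is XORed into v14
// (the f0 word) for every block processed. The caller is expected to pass a
// whole number of blocks.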
    277 // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
    278 TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
    279 	MOVQ h+0(FP), AX
    280 	MOVQ c+8(FP), BX
    281 	MOVQ flag+16(FP), CX
    282 	MOVQ blocks_base+24(FP), SI
    283 	MOVQ blocks_len+32(FP), DI
    284 
    285 	MOVQ SP, DX
    286 	ADDQ $31, DX
    287 	ANDQ $~31, DX
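	// DX now points to a 32-byte-aligned scratch area: 0(DX)..31(DX) holds
	// the (t0, t1, f0, 0) block XORed into IV[4..7] to form v[12..15], and
	// 32(DX)..287(DX) holds the message vectors spilled by the first two
	// rounds.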
    288 
    289 	MOVQ CX, 16(DX)
    290 	XORQ CX, CX
    291 	MOVQ CX, 24(DX)
    292 
    293 	VMOVDQU ·AVX2_c40<>(SB), Y4
    294 	VMOVDQU ·AVX2_c48<>(SB), Y5
    295 
    296 	VMOVDQU 0(AX), Y8
    297 	VMOVDQU 32(AX), Y9
    298 	VMOVDQU ·AVX2_iv0<>(SB), Y6
    299 	VMOVDQU ·AVX2_iv1<>(SB), Y7
    300 
    301 	MOVQ 0(BX), R8
    302 	MOVQ 8(BX), R9
    303 	MOVQ R9, 8(DX)
    304 
    305 loop:
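	// Add the 128-byte block size to the counter in R9:R8, carrying into the
	// high word on wraparound, and store it at 0(DX)/8(DX) for the XOR below.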
    306 	ADDQ $128, R8
    307 	MOVQ R8, 0(DX)
    308 	CMPQ R8, $128
    309 	JGE  noinc
    310 	INCQ R9
    311 	MOVQ R9, 8(DX)
    312 
    313 noinc:
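	// v[0..7] = h (Y0, Y1), v[8..11] = IV[0..3] (Y2),
	// v[12..15] = IV[4..7] ^ (t0, t1, f0, 0) (Y3).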
    314 	VMOVDQA Y8, Y0
    315 	VMOVDQA Y9, Y1
    316 	VMOVDQA Y6, Y2
    317 	VPXOR   0(DX), Y7, Y3
    318 
    319 	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
    320 	VMOVDQA Y12, 32(DX)
    321 	VMOVDQA Y13, 64(DX)
    322 	VMOVDQA Y14, 96(DX)
    323 	VMOVDQA Y15, 128(DX)
    324 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    325 	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
    326 	VMOVDQA Y12, 160(DX)
    327 	VMOVDQA Y13, 192(DX)
    328 	VMOVDQA Y14, 224(DX)
    329 	VMOVDQA Y15, 256(DX)
    330 
    331 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    332 	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
    333 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    334 	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
    335 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    336 	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
    337 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    338 	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
    339 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    340 	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
    341 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    342 	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
    343 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    344 	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
    345 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    346 	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
    347 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    348 
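	// Rounds 11 and 12 reuse the message vectors spilled by rounds 1 and 2,
	// since BLAKE2b's sigma schedule repeats sigma[0] and sigma[1] there.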
    349 	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
    350 	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
    351 
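	// Feed forward: h[0..3] ^= v[0..3] ^ v[8..11],
	// h[4..7] ^= v[4..7] ^ v[12..15].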
    352 	VPXOR Y0, Y8, Y8
    353 	VPXOR Y1, Y9, Y9
    354 	VPXOR Y2, Y8, Y8
    355 	VPXOR Y3, Y9, Y9
    356 
    357 	LEAQ 128(SI), SI
    358 	SUBQ $128, DI
    359 	JNE  loop
    360 
    361 	MOVQ R8, 0(BX)
    362 	MOVQ R9, 8(BX)
    363 
    364 	VMOVDQU Y8, 0(AX)
    365 	VMOVDQU Y9, 32(AX)
    366 	VZEROUPPER
    367 
    368 	RET
    369 
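// Hand-encoded VPUNPCKLQDQ/VPUNPCKHQDQ forms (interleave the low or high
// 64-bit halves of two X registers) used by the SHUFFLE macros below, with
// X15 as the scratch register.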
    370 #define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
    371 #define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
    372 #define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
    373 #define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
    374 #define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
    375 
    376 #define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
    377 #define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
    378 #define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
    379 #define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
    380 #define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
    381 #define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
    382 #define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
    383 #define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
    384 
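// SHUFFLE_AVX rotates the 64-bit words of rows b (X2, X3), c (X4, X5) and
// d (X6, X7) within their four-word rows so that the next HALF_ROUND_AVX
// computes G along the diagonals; SHUFFLE_AVX_INV restores the column layout.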
    385 #define SHUFFLE_AVX() \
    386 	VMOVDQA X6, X13;         \
    387 	VMOVDQA X2, X14;         \
    388 	VMOVDQA X4, X6;          \
    389 	VPUNPCKLQDQ_X13_X13_X15; \
    390 	VMOVDQA X5, X4;          \
    391 	VMOVDQA X6, X5;          \
    392 	VPUNPCKHQDQ_X15_X7_X6;   \
    393 	VPUNPCKLQDQ_X7_X7_X15;   \
    394 	VPUNPCKHQDQ_X15_X13_X7;  \
    395 	VPUNPCKLQDQ_X3_X3_X15;   \
    396 	VPUNPCKHQDQ_X15_X2_X2;   \
    397 	VPUNPCKLQDQ_X14_X14_X15; \
    398 	VPUNPCKHQDQ_X15_X3_X3;   \
    399 
    400 #define SHUFFLE_AVX_INV() \
    401 	VMOVDQA X2, X13;         \
    402 	VMOVDQA X4, X14;         \
    403 	VPUNPCKLQDQ_X2_X2_X15;   \
    404 	VMOVDQA X5, X4;          \
    405 	VPUNPCKHQDQ_X15_X3_X2;   \
    406 	VMOVDQA X14, X5;         \
    407 	VPUNPCKLQDQ_X3_X3_X15;   \
    408 	VMOVDQA X6, X14;         \
    409 	VPUNPCKHQDQ_X15_X13_X3;  \
    410 	VPUNPCKLQDQ_X7_X7_X15;   \
    411 	VPUNPCKHQDQ_X15_X6_X6;   \
    412 	VPUNPCKLQDQ_X14_X14_X15; \
    413 	VPUNPCKHQDQ_X15_X7_X7;   \
    414 
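// HALF_ROUND_AVX applies G once to all four columns (or, after SHUFFLE_AVX,
// all four diagonals) of the state held in X register pairs: a = (v0, v1),
// b = (v2, v3), c = (v4, v5), d = (v6, v7). Rotations by 32, 24, 16 and 63
// bits use VPSHUFD $-79, VPSHUFB c40, VPSHUFB c48 and the add/shift/xor
// sequence, as in the AVX2 path.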
    415 #define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
    416 	VPADDQ  m0, v0, v0;   \
    417 	VPADDQ  v2, v0, v0;   \
    418 	VPADDQ  m1, v1, v1;   \
    419 	VPADDQ  v3, v1, v1;   \
    420 	VPXOR   v0, v6, v6;   \
    421 	VPXOR   v1, v7, v7;   \
    422 	VPSHUFD $-79, v6, v6; \
    423 	VPSHUFD $-79, v7, v7; \
    424 	VPADDQ  v6, v4, v4;   \
    425 	VPADDQ  v7, v5, v5;   \
    426 	VPXOR   v4, v2, v2;   \
    427 	VPXOR   v5, v3, v3;   \
    428 	VPSHUFB c40, v2, v2;  \
    429 	VPSHUFB c40, v3, v3;  \
    430 	VPADDQ  m2, v0, v0;   \
    431 	VPADDQ  v2, v0, v0;   \
    432 	VPADDQ  m3, v1, v1;   \
    433 	VPADDQ  v3, v1, v1;   \
    434 	VPXOR   v0, v6, v6;   \
    435 	VPXOR   v1, v7, v7;   \
    436 	VPSHUFB c48, v6, v6;  \
    437 	VPSHUFB c48, v7, v7;  \
    438 	VPADDQ  v6, v4, v4;   \
    439 	VPADDQ  v7, v5, v5;   \
    440 	VPXOR   v4, v2, v2;   \
    441 	VPXOR   v5, v3, v3;   \
    442 	VPADDQ  v2, v2, t0;   \
    443 	VPSRLQ  $63, v2, v2;  \
    444 	VPXOR   t0, v2, v2;   \
    445 	VPADDQ  v3, v3, t0;   \
    446 	VPSRLQ  $63, v3, v3;  \
    447 	VPXOR   t0, v3, v3
    448 
    449 // load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
    450 // i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
    451 #define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
    452 	VMOVQ_SI_X12(i0*8);     \
    453 	VMOVQ_SI_X13(i2*8);     \
    454 	VMOVQ_SI_X14(i4*8);     \
    455 	VMOVQ_SI_X15(i6*8);     \
    456 	VPINSRQ_1_SI_X12(i1*8); \
    457 	VPINSRQ_1_SI_X13(i3*8); \
    458 	VPINSRQ_1_SI_X14(i5*8); \
    459 	VPINSRQ_1_SI_X15(i7*8)
    460 
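// The remaining LOAD_MSG_AVX_* macros hard-code sigma permutations that
// contain index 0 (which needs the zero-offset encodings) or adjacent
// indices (loaded with VMOVDQU or VPSHUFD).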
    461 // load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
    462 #define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
    463 	VMOVQ_SI_X12_0;        \
    464 	VMOVQ_SI_X13(4*8);     \
    465 	VMOVQ_SI_X14(1*8);     \
    466 	VMOVQ_SI_X15(5*8);     \
    467 	VPINSRQ_1_SI_X12(2*8); \
    468 	VPINSRQ_1_SI_X13(6*8); \
    469 	VPINSRQ_1_SI_X14(3*8); \
    470 	VPINSRQ_1_SI_X15(7*8)
    471 
    472 // load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
    473 #define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
    474 	VPSHUFD $0x4E, 0*8(SI), X12; \
    475 	VMOVQ_SI_X13(11*8);          \
    476 	VMOVQ_SI_X14(12*8);          \
    477 	VMOVQ_SI_X15(7*8);           \
    478 	VPINSRQ_1_SI_X13(5*8);       \
    479 	VPINSRQ_1_SI_X14(2*8);       \
    480 	VPINSRQ_1_SI_X15(3*8)
    481 
    482 // load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
    483 #define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
    484 	VMOVDQU 11*8(SI), X12;  \
    485 	VMOVQ_SI_X13(5*8);      \
    486 	VMOVQ_SI_X14(8*8);      \
    487 	VMOVQ_SI_X15(2*8);      \
    488 	VPINSRQ_1_SI_X13(15*8); \
    489 	VPINSRQ_1_SI_X14_0;     \
    490 	VPINSRQ_1_SI_X15(13*8)
    491 
    492 // load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
    493 #define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
    494 	VMOVQ_SI_X12(2*8);      \
    495 	VMOVQ_SI_X13(4*8);      \
    496 	VMOVQ_SI_X14(6*8);      \
    497 	VMOVQ_SI_X15_0;         \
    498 	VPINSRQ_1_SI_X12(5*8);  \
    499 	VPINSRQ_1_SI_X13(15*8); \
    500 	VPINSRQ_1_SI_X14(10*8); \
    501 	VPINSRQ_1_SI_X15(8*8)
    502 
    503 // load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
    504 #define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
    505 	VMOVQ_SI_X12(9*8);      \
    506 	VMOVQ_SI_X13(2*8);      \
    507 	VMOVQ_SI_X14_0;         \
    508 	VMOVQ_SI_X15(4*8);      \
    509 	VPINSRQ_1_SI_X12(5*8);  \
    510 	VPINSRQ_1_SI_X13(10*8); \
    511 	VPINSRQ_1_SI_X14(7*8);  \
    512 	VPINSRQ_1_SI_X15(15*8)
    513 
    514 // load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
    515 #define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
    516 	VMOVQ_SI_X12(2*8);      \
    517 	VMOVQ_SI_X13_0;         \
    518 	VMOVQ_SI_X14(12*8);     \
    519 	VMOVQ_SI_X15(11*8);     \
    520 	VPINSRQ_1_SI_X12(6*8);  \
    521 	VPINSRQ_1_SI_X13(8*8);  \
    522 	VPINSRQ_1_SI_X14(10*8); \
    523 	VPINSRQ_1_SI_X15(3*8)
    524 
    525 // load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
    526 #define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
    527 	MOVQ    0*8(SI), X12;        \
    528 	VPSHUFD $0x4E, 8*8(SI), X13; \
    529 	MOVQ    7*8(SI), X14;        \
    530 	MOVQ    2*8(SI), X15;        \
    531 	VPINSRQ_1_SI_X12(6*8);       \
    532 	VPINSRQ_1_SI_X14(3*8);       \
    533 	VPINSRQ_1_SI_X15(11*8)
    534 
    535 // load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
    536 #define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
    537 	MOVQ 6*8(SI), X12;      \
    538 	MOVQ 11*8(SI), X13;     \
    539 	MOVQ 15*8(SI), X14;     \
    540 	MOVQ 3*8(SI), X15;      \
    541 	VPINSRQ_1_SI_X12(14*8); \
    542 	VPINSRQ_1_SI_X13_0;     \
    543 	VPINSRQ_1_SI_X14(9*8);  \
    544 	VPINSRQ_1_SI_X15(8*8)
    545 
    546 // load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
    547 #define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
    548 	MOVQ 5*8(SI), X12;      \
    549 	MOVQ 8*8(SI), X13;      \
    550 	MOVQ 0*8(SI), X14;      \
    551 	MOVQ 6*8(SI), X15;      \
    552 	VPINSRQ_1_SI_X12(15*8); \
    553 	VPINSRQ_1_SI_X13(2*8);  \
    554 	VPINSRQ_1_SI_X14(4*8);  \
    555 	VPINSRQ_1_SI_X15(10*8)
    556 
    557 // load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
    558 #define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
    559 	VMOVDQU 12*8(SI), X12;  \
    560 	MOVQ    1*8(SI), X13;   \
    561 	MOVQ    2*8(SI), X14;   \
    562 	VPINSRQ_1_SI_X13(10*8); \
    563 	VPINSRQ_1_SI_X14(7*8);  \
    564 	VMOVDQU 4*8(SI), X15
    565 
    566 // load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
    567 #define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
    568 	MOVQ 15*8(SI), X12;     \
    569 	MOVQ 3*8(SI), X13;      \
    570 	MOVQ 11*8(SI), X14;     \
    571 	MOVQ 12*8(SI), X15;     \
    572 	VPINSRQ_1_SI_X12(9*8);  \
    573 	VPINSRQ_1_SI_X13(13*8); \
    574 	VPINSRQ_1_SI_X14(14*8); \
    575 	VPINSRQ_1_SI_X15_0
    576 
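// hashBlocksAVX is the 128-bit counterpart of hashBlocksAVX2 above, with the
// same arguments and semantics. The 4x4 state lives in X register pairs, so
// each round is two HALF_ROUND_AVX calls bracketed by
// SHUFFLE_AVX/SHUFFLE_AVX_INV.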
    577 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
    578 TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
    579 	MOVQ h+0(FP), AX
    580 	MOVQ c+8(FP), BX
    581 	MOVQ flag+16(FP), CX
    582 	MOVQ blocks_base+24(FP), SI
    583 	MOVQ blocks_len+32(FP), DI
    584 
    585 	MOVQ SP, R10
    586 	ADDQ $15, R10
    587 	ANDQ $~15, R10
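	// R10 now points to a 16-byte-aligned scratch area: 0(R10) caches
	// IV[6..7] with the flag XORed into the low word (becoming v[14..15]),
	// and 16(R10)..271(R10) holds the message vectors spilled by the first
	// two rounds for reuse in the last two.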
    588 
    589 	VMOVDQU ·AVX_c40<>(SB), X0
    590 	VMOVDQU ·AVX_c48<>(SB), X1
    591 	VMOVDQA X0, X8
    592 	VMOVDQA X1, X9
    593 
    594 	VMOVDQU ·AVX_iv3<>(SB), X0
    595 	VMOVDQA X0, 0(R10)
    596 	XORQ    CX, 0(R10)          // 0(R10) = ·AVX_iv3 ^ (CX || 0)
    597 
    598 	VMOVDQU 0(AX), X10
    599 	VMOVDQU 16(AX), X11
    600 	VMOVDQU 32(AX), X2
    601 	VMOVDQU 48(AX), X3
    602 
    603 	MOVQ 0(BX), R8
    604 	MOVQ 8(BX), R9
    605 
    606 loop:
    607 	ADDQ $128, R8
    608 	CMPQ R8, $128
    609 	JGE  noinc
    610 	INCQ R9
    611 
    612 noinc:
    613 	VMOVQ_R8_X15
    614 	VPINSRQ_1_R9_X15
    615 
    616 	VMOVDQA X10, X0
    617 	VMOVDQA X11, X1
    618 	VMOVDQU ·AVX_iv0<>(SB), X4
    619 	VMOVDQU ·AVX_iv1<>(SB), X5
    620 	VMOVDQU ·AVX_iv2<>(SB), X6
    621 
    622 	VPXOR   X15, X6, X6
    623 	VMOVDQA 0(R10), X7
    624 
    625 	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
    626 	VMOVDQA X12, 16(R10)
    627 	VMOVDQA X13, 32(R10)
    628 	VMOVDQA X14, 48(R10)
    629 	VMOVDQA X15, 64(R10)
    630 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    631 	SHUFFLE_AVX()
    632 	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
    633 	VMOVDQA X12, 80(R10)
    634 	VMOVDQA X13, 96(R10)
    635 	VMOVDQA X14, 112(R10)
    636 	VMOVDQA X15, 128(R10)
    637 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    638 	SHUFFLE_AVX_INV()
    639 
    640 	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
    641 	VMOVDQA X12, 144(R10)
    642 	VMOVDQA X13, 160(R10)
    643 	VMOVDQA X14, 176(R10)
    644 	VMOVDQA X15, 192(R10)
    645 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    646 	SHUFFLE_AVX()
    647 	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
    648 	VMOVDQA X12, 208(R10)
    649 	VMOVDQA X13, 224(R10)
    650 	VMOVDQA X14, 240(R10)
    651 	VMOVDQA X15, 256(R10)
    652 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    653 	SHUFFLE_AVX_INV()
    654 
    655 	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
    656 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    657 	SHUFFLE_AVX()
    658 	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
    659 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    660 	SHUFFLE_AVX_INV()
    661 
    662 	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
    663 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    664 	SHUFFLE_AVX()
    665 	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
    666 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    667 	SHUFFLE_AVX_INV()
    668 
    669 	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
    670 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    671 	SHUFFLE_AVX()
    672 	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
    673 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    674 	SHUFFLE_AVX_INV()
    675 
    676 	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
    677 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    678 	SHUFFLE_AVX()
    679 	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
    680 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    681 	SHUFFLE_AVX_INV()
    682 
    683 	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
    684 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    685 	SHUFFLE_AVX()
    686 	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
    687 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    688 	SHUFFLE_AVX_INV()
    689 
    690 	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
    691 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    692 	SHUFFLE_AVX()
    693 	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
    694 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    695 	SHUFFLE_AVX_INV()
    696 
    697 	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
    698 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    699 	SHUFFLE_AVX()
    700 	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
    701 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    702 	SHUFFLE_AVX_INV()
    703 
    704 	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
    705 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    706 	SHUFFLE_AVX()
    707 	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
    708 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
    709 	SHUFFLE_AVX_INV()
    710 
    711 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
    712 	SHUFFLE_AVX()
    713 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
    714 	SHUFFLE_AVX_INV()
    715 
    716 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
    717 	SHUFFLE_AVX()
    718 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
    719 	SHUFFLE_AVX_INV()
    720 
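	// Feed forward: h[i] ^= v[i] ^ v[i+8]. h[0..3] stays in X10/X11 across
	// blocks and is written back after the loop; h[4..7] is reloaded from
	// memory, updated and stored each iteration.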
    721 	VMOVDQU 32(AX), X14
    722 	VMOVDQU 48(AX), X15
    723 	VPXOR   X0, X10, X10
    724 	VPXOR   X1, X11, X11
    725 	VPXOR   X2, X14, X14
    726 	VPXOR   X3, X15, X15
    727 	VPXOR   X4, X10, X10
    728 	VPXOR   X5, X11, X11
    729 	VPXOR   X6, X14, X2
    730 	VPXOR   X7, X15, X3
    731 	VMOVDQU X2, 32(AX)
    732 	VMOVDQU X3, 48(AX)
    733 
    734 	LEAQ 128(SI), SI
    735 	SUBQ $128, DI
    736 	JNE  loop
    737 
    738 	VMOVDQU X10, 0(AX)
    739 	VMOVDQU X11, 16(AX)
    740 
    741 	MOVQ R8, 0(BX)
    742 	MOVQ R9, 8(BX)
    743 	VZEROUPPER
    744 
    745 	RET