gtsocial-umbx


blake2b_amd64.s (8585B)


// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"

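// BLAKE2b initialization vector (the same constants as the SHA-512 IV),
// split across four 16-byte globals so it can be loaded straight into XMM registers.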
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16

DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16

DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16

DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16

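// PSHUFB masks: c40 rotates each 64-bit lane right by 24 bits, c48 by 16 bits.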
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

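// SHUFFLE rearranges X2..X7 (rows two to four of the 4x4 state) into the
// diagonal layout used by the second half of each round; SHUFFLE_INV undoes
// that rearrangement and restores the column layout.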
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

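// HALF_ROUND applies the BLAKE2b G function to all four columns (or, after
// SHUFFLE, all four diagonals) of the state held in v0..v7, mixing in the
// message words m0..m3. The 32-, 24-, 16- and 63-bit right rotations are done
// with PSHUFD, PSHUFB (c40/c48) and an add/shift/xor sequence, respectively.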
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	PADDQ  m0, v0;        \
	PADDQ  m1, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFD $0xB1, v6, v6; \
	PSHUFD $0xB1, v7, v7; \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	PSHUFB c40, v2;       \
	PSHUFB c40, v3;       \
	PADDQ  m2, v0;        \
	PADDQ  m3, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFB c48, v6;       \
	PSHUFB c48, v7;       \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	MOVOU  v2, t0;        \
	PADDQ  v2, t0;        \
	PSRLQ  $63, v2;       \
	PXOR   t0, v2;        \
	MOVOU  v3, t0;        \
	PADDQ  v3, t0;        \
	PSRLQ  $63, v3;       \
	PXOR   t0, v3

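// LOAD_MSG gathers eight 64-bit message words from the current block at src,
// selected by the word indices i0..i7, into m0..m3 (two words per register).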
#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
	MOVQ   i0*8(src), m0;     \
	PINSRQ $1, i1*8(src), m0; \
	MOVQ   i2*8(src), m1;     \
	PINSRQ $1, i3*8(src), m1; \
	MOVQ   i4*8(src), m2;     \
	PINSRQ $1, i5*8(src), m2; \
	MOVQ   i6*8(src), m3;     \
	PINSRQ $1, i7*8(src), m3

// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

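	// R10 = 16-byte aligned scratch area on the stack: 0(R10) caches
	// ·iv3 ^ (flag || 0), and 16(R10)..256(R10) cache the message vectors of
	// the first two rounds for reuse in rounds 11 and 12.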
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	MOVOU ·iv3<>(SB), X0
	MOVO  X0, 0(R10)
	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)

	MOVOU ·c40<>(SB), X13
	MOVOU ·c48<>(SB), X14

	MOVOU 0(AX), X12
	MOVOU 16(AX), X15

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

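	// Main loop: each iteration compresses one 128-byte block. R8:R9 hold the
	// 128-bit byte counter, incremented by 128 (with carry) per block.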
loop:
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
	MOVQ R8, X8
	PINSRQ $1, R9, X8

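	// Set up the 16-word working vector in X0..X7: X0/X1 and X2/X3 hold the
	// current hash state h[0..7], X4..X7 hold the IV, with the counter xored
	// into X6 and the flag word (precomputed at 0(R10)) xored into X7.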
	MOVO X12, X0
	MOVO X15, X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·iv0<>(SB), X4
	MOVOU ·iv1<>(SB), X5
	MOVOU ·iv2<>(SB), X6

	PXOR X8, X6
	MOVO 0(R10), X7

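	// Rounds 1-10: each round is a column half-round, a SHUFFLE into the
	// diagonal layout, a diagonal half-round, and a SHUFFLE_INV back. The
	// message vectors of rounds 1 and 2 are saved at 16(R10)..256(R10).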
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
	MOVO X8, 16(R10)
	MOVO X9, 32(R10)
	MOVO X10, 48(R10)
	MOVO X11, 64(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
	MOVO X8, 80(R10)
	MOVO X9, 96(R10)
	MOVO X10, 112(R10)
	MOVO X11, 128(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
	MOVO X8, 144(R10)
	MOVO X9, 160(R10)
	MOVO X10, 176(R10)
	MOVO X11, 192(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
	MOVO X8, 208(R10)
	MOVO X9, 224(R10)
	MOVO X10, 240(R10)
	MOVO X11, 256(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

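	// Rounds 11 and 12 reuse the message schedules of rounds 1 and 2, read
	// back from the stack copies saved above.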
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

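	// Feed-forward: h[i] ^= v[i] ^ v[i+8]. h[4..7] are updated in memory here;
	// h[0..3] stay in X12/X15 across blocks and are stored after the loop.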
	MOVOU 32(AX), X10
	MOVOU 48(AX), X11
	PXOR  X0, X12
	PXOR  X1, X15
	PXOR  X2, X10
	PXOR  X3, X11
	PXOR  X4, X12
	PXOR  X5, X15
	PXOR  X6, X10
	PXOR  X7, X11
	MOVOU X10, 32(AX)
	MOVOU X11, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVOU X12, 0(AX)
	MOVOU X15, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	RET