gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

blake2s_386.s (11720B)


      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:build 386 && gc && !purego
      6 // +build 386,gc,!purego
      7 
      8 #include "textflag.h"
      9 
     // iv0 is a 16-byte read-only constant holding four 32-bit words (they
     // match words 0..3 of the BLAKE2s initialization vector). Both hashBlocks
     // functions load it into X6 at the top of every block iteration.
     10 DATA iv0<>+0x00(SB)/4, $0x6a09e667
     11 DATA iv0<>+0x04(SB)/4, $0xbb67ae85
     12 DATA iv0<>+0x08(SB)/4, $0x3c6ef372
     13 DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
     14 GLOBL iv0<>(SB), (NOPTR+RODATA), $16
     15 
     // iv1 is a 16-byte read-only constant holding four 32-bit words (they
     // match words 4..7 of the BLAKE2s initialization vector). Both hashBlocks
     // functions load it into X7 for every block and then XOR the
     // {c[0], c[1], flag, 0} vector into it.
     16 DATA iv1<>+0x00(SB)/4, $0x510e527f
     17 DATA iv1<>+0x04(SB)/4, $0x9b05688c
     18 DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
     19 DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
     20 GLOBL iv1<>(SB), (NOPTR+RODATA), $16
     21 
     // rol16 is a PSHUFB byte-selection mask. Within each 32-bit lane the
     // destination bytes 0,1,2,3 take source bytes 2,3,0,1, which swaps the
     // two 16-bit halves, i.e. rotates every lane by 16 bits. hashBlocksSSSE3
     // passes it to ROUND_SSSE3 as the c16 argument.
     22 DATA rol16<>+0x00(SB)/8, $0x0504070601000302
     23 DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
     24 GLOBL rol16<>(SB), (NOPTR+RODATA), $16
     25 
     // rol8 is a PSHUFB byte-selection mask. Within each 32-bit lane the
     // destination bytes 0,1,2,3 take source bytes 1,2,3,0, which rotates
     // every lane right by 8 bits (equivalently left by 24 bits). It stands in
     // for the ROTL_SSE2(24, ...) step of ROUND_SSE2; hashBlocksSSSE3 passes it
     // to ROUND_SSSE3 as the c8 argument.
     26 DATA rol8<>+0x00(SB)/8, $0x0407060500030201
     27 DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
     28 GLOBL rol8<>(SB), (NOPTR+RODATA), $16
     29 
     // counter is the per-block increment: a 128-bit value whose low 64 bits
     // are 0x40 (64) and whose high 64 bits are zero. It is added with PADDQ to
     // the {c[0], c[1], flag, 0} vector, so the 64-bit value formed by
     // c[0]/c[1] grows by 64 for each block while the flag half is unchanged.
     30 DATA counter<>+0x00(SB)/8, $0x40
     31 DATA counter<>+0x08(SB)/8, $0x0
     32 GLOBL counter<>(SB), (NOPTR+RODATA), $16
     33 
     // ROTL_SSE2 combines, for every 32-bit lane of v, the lane shifted left
     // by n bits with the lane shifted right by 32-n bits; the result is left
     // in v.
     //   n: immediate shift count
     //   t: scratch XMM register, clobbered (receives v << n)
     //   v: XMM register updated in place
     // The two shifted halves share no bits, so PXOR merges them as an OR would.
     // NOTE: no comments may be placed inside the macro body; a // comment
     // would swallow the line-continuation backslash.
     34 #define ROTL_SSE2(n, t, v) \
     35 	MOVO  v, t;       \
     36 	PSLLL $n, t;      \
     37 	PSRLL $(32-n), v; \
     38 	PXOR  t, v
     39 
     // ROTL_SSSE3 permutes the bytes of v in place with PSHUFB using the mask
     // in c. In this file c is always a register holding rol16 or rol8, which
     // makes the shuffle a byte-granular rotation of every 32-bit lane. Unlike
     // ROTL_SSE2 it needs no scratch register.
     40 #define ROTL_SSSE3(c, v) \
     41 	PSHUFB c, v
     42 
     // ROUND_SSE2 applies one round of mixing to the four state rows v0..v3
     // (XMM registers, four 32-bit lanes each) using only SSE2 instructions.
     //   v0..v3: state rows, updated in place
     //   m0..m3: operands supplying four 32-bit message words each
     //   t:      scratch XMM register, clobbered by ROTL_SSE2
     //
     // First half: m0 and then m1 are added into v0, and after each addition
     // the rows are mixed (add, xor, rotate) with left rotations of 16 and 20
     // bits, then 24 and 25 bits.
     // The PSHUFL $0x39 / $0x4E / $0x93 steps then rotate the lanes of v1, v2
     // and v3 by one, two and three positions, so the second half (same
     // mixing, now with m2 and m3) combines lanes from different columns.
     // The last three PSHUFLs apply the inverse lane rotations, putting the
     // rows back in their original lane order for the next round.
     43 #define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
     44 	PADDL  m0, v0;        \
     45 	PADDL  v1, v0;        \
     46 	PXOR   v0, v3;        \
     47 	ROTL_SSE2(16, t, v3); \
     48 	PADDL  v3, v2;        \
     49 	PXOR   v2, v1;        \
     50 	ROTL_SSE2(20, t, v1); \
     51 	PADDL  m1, v0;        \
     52 	PADDL  v1, v0;        \
     53 	PXOR   v0, v3;        \
     54 	ROTL_SSE2(24, t, v3); \
     55 	PADDL  v3, v2;        \
     56 	PXOR   v2, v1;        \
     57 	ROTL_SSE2(25, t, v1); \
     58 	PSHUFL $0x39, v1, v1; \
     59 	PSHUFL $0x4E, v2, v2; \
     60 	PSHUFL $0x93, v3, v3; \
     61 	PADDL  m2, v0;        \
     62 	PADDL  v1, v0;        \
     63 	PXOR   v0, v3;        \
     64 	ROTL_SSE2(16, t, v3); \
     65 	PADDL  v3, v2;        \
     66 	PXOR   v2, v1;        \
     67 	ROTL_SSE2(20, t, v1); \
     68 	PADDL  m3, v0;        \
     69 	PADDL  v1, v0;        \
     70 	PXOR   v0, v3;        \
     71 	ROTL_SSE2(24, t, v3); \
     72 	PADDL  v3, v2;        \
     73 	PXOR   v2, v1;        \
     74 	ROTL_SSE2(25, t, v1); \
     75 	PSHUFL $0x39, v3, v3; \
     76 	PSHUFL $0x4E, v2, v2; \
     77 	PSHUFL $0x93, v1, v1
     78 
     // ROUND_SSSE3 is the same round as ROUND_SSE2, except that the two
     // rotations applied to v3 (16 and 24 bits left in ROUND_SSE2) are done
     // with a single PSHUFB each via ROTL_SSSE3. The 20- and 25-bit rotations
     // of v1 are not whole bytes and still use the shift-based ROTL_SSE2.
     //   v0..v3: state rows (XMM registers), updated in place
     //   m0..m3: operands supplying four 32-bit message words each
     //   t:      scratch XMM register, clobbered by ROTL_SSE2
     //   c16:    PSHUFB mask for the 16-bit rotation (rol16 in this file)
     //   c8:     PSHUFB mask for the 8-bit-right / 24-bit-left rotation (rol8)
     // c16 and c8 are only read; they must stay valid across the whole round.
     79 #define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
     80 	PADDL  m0, v0;        \
     81 	PADDL  v1, v0;        \
     82 	PXOR   v0, v3;        \
     83 	ROTL_SSSE3(c16, v3);  \
     84 	PADDL  v3, v2;        \
     85 	PXOR   v2, v1;        \
     86 	ROTL_SSE2(20, t, v1); \
     87 	PADDL  m1, v0;        \
     88 	PADDL  v1, v0;        \
     89 	PXOR   v0, v3;        \
     90 	ROTL_SSSE3(c8, v3);   \
     91 	PADDL  v3, v2;        \
     92 	PXOR   v2, v1;        \
     93 	ROTL_SSE2(25, t, v1); \
     94 	PSHUFL $0x39, v1, v1; \
     95 	PSHUFL $0x4E, v2, v2; \
     96 	PSHUFL $0x93, v3, v3; \
     97 	PADDL  m2, v0;        \
     98 	PADDL  v1, v0;        \
     99 	PXOR   v0, v3;        \
    100 	ROTL_SSSE3(c16, v3);  \
    101 	PADDL  v3, v2;        \
    102 	PXOR   v2, v1;        \
    103 	ROTL_SSE2(20, t, v1); \
    104 	PADDL  m3, v0;        \
    105 	PADDL  v1, v0;        \
    106 	PXOR   v0, v3;        \
    107 	ROTL_SSSE3(c8, v3);   \
    108 	PADDL  v3, v2;        \
    109 	PXOR   v2, v1;        \
    110 	ROTL_SSE2(25, t, v1); \
    111 	PSHUFL $0x39, v3, v3; \
    112 	PSHUFL $0x4E, v2, v2; \
    113 	PSHUFL $0x93, v1, v1
    114 
    // PRECOMPUTE copies the sixteen 32-bit words of one 64-byte message block
    // into ten 64-byte slabs, one slab per round, with the words of each slab
    // already arranged in the order that round consumes them. The ROUND_*
    // macros can then read their m0..m3 operands as four consecutive 16-byte
    // rows of the slab.
    //   dst: base register of the scratch area
    //   off: constant byte offset added to every store (16 in this file, which
    //        skips the counter/flag vector kept at 0(dst))
    //   src: register pointing at the 64-byte message block
    //   t:   32-bit scratch register, clobbered
    // Each group below loads word i with "MOVL i*4(src), t" and stores it ten
    // times, at offsets k*4+off+r*64 for r = 0..9: r selects the round slab and
    // k is the position of word i inside that slab. In total 640 bytes are
    // written at off(dst)..off+639(dst).
    // NOTE(review): the k values look like the BLAKE2s message schedule
    // rearranged for the row layout above - confirm against the specification
    // before editing any of them.
    115 #define PRECOMPUTE(dst, off, src, t) \
    116 	MOVL 0*4(src), t;          \
    117 	MOVL t, 0*4+off+0(dst);    \
    118 	MOVL t, 9*4+off+64(dst);   \
    119 	MOVL t, 5*4+off+128(dst);  \
    120 	MOVL t, 14*4+off+192(dst); \
    121 	MOVL t, 4*4+off+256(dst);  \
    122 	MOVL t, 2*4+off+320(dst);  \
    123 	MOVL t, 8*4+off+384(dst);  \
    124 	MOVL t, 12*4+off+448(dst); \
    125 	MOVL t, 3*4+off+512(dst);  \
    126 	MOVL t, 15*4+off+576(dst); \
    127 	MOVL 1*4(src), t;          \
    128 	MOVL t, 4*4+off+0(dst);    \
    129 	MOVL t, 8*4+off+64(dst);   \
    130 	MOVL t, 14*4+off+128(dst); \
    131 	MOVL t, 5*4+off+192(dst);  \
    132 	MOVL t, 12*4+off+256(dst); \
    133 	MOVL t, 11*4+off+320(dst); \
    134 	MOVL t, 1*4+off+384(dst);  \
    135 	MOVL t, 6*4+off+448(dst);  \
    136 	MOVL t, 10*4+off+512(dst); \
    137 	MOVL t, 3*4+off+576(dst);  \
    138 	MOVL 2*4(src), t;          \
    139 	MOVL t, 1*4+off+0(dst);    \
    140 	MOVL t, 13*4+off+64(dst);  \
    141 	MOVL t, 6*4+off+128(dst);  \
    142 	MOVL t, 8*4+off+192(dst);  \
    143 	MOVL t, 2*4+off+256(dst);  \
    144 	MOVL t, 0*4+off+320(dst);  \
    145 	MOVL t, 14*4+off+384(dst); \
    146 	MOVL t, 11*4+off+448(dst); \
    147 	MOVL t, 12*4+off+512(dst); \
    148 	MOVL t, 4*4+off+576(dst);  \
    149 	MOVL 3*4(src), t;          \
    150 	MOVL t, 5*4+off+0(dst);    \
    151 	MOVL t, 15*4+off+64(dst);  \
    152 	MOVL t, 9*4+off+128(dst);  \
    153 	MOVL t, 1*4+off+192(dst);  \
    154 	MOVL t, 11*4+off+256(dst); \
    155 	MOVL t, 7*4+off+320(dst);  \
    156 	MOVL t, 13*4+off+384(dst); \
    157 	MOVL t, 3*4+off+448(dst);  \
    158 	MOVL t, 6*4+off+512(dst);  \
    159 	MOVL t, 10*4+off+576(dst); \
    160 	MOVL 4*4(src), t;          \
    161 	MOVL t, 2*4+off+0(dst);    \
    162 	MOVL t, 1*4+off+64(dst);   \
    163 	MOVL t, 15*4+off+128(dst); \
    164 	MOVL t, 10*4+off+192(dst); \
    165 	MOVL t, 6*4+off+256(dst);  \
    166 	MOVL t, 8*4+off+320(dst);  \
    167 	MOVL t, 3*4+off+384(dst);  \
    168 	MOVL t, 13*4+off+448(dst); \
    169 	MOVL t, 14*4+off+512(dst); \
    170 	MOVL t, 5*4+off+576(dst);  \
    171 	MOVL 5*4(src), t;          \
    172 	MOVL t, 6*4+off+0(dst);    \
    173 	MOVL t, 11*4+off+64(dst);  \
    174 	MOVL t, 2*4+off+128(dst);  \
    175 	MOVL t, 9*4+off+192(dst);  \
    176 	MOVL t, 1*4+off+256(dst);  \
    177 	MOVL t, 13*4+off+320(dst); \
    178 	MOVL t, 4*4+off+384(dst);  \
    179 	MOVL t, 8*4+off+448(dst);  \
    180 	MOVL t, 15*4+off+512(dst); \
    181 	MOVL t, 7*4+off+576(dst);  \
    182 	MOVL 6*4(src), t;          \
    183 	MOVL t, 3*4+off+0(dst);    \
    184 	MOVL t, 7*4+off+64(dst);   \
    185 	MOVL t, 13*4+off+128(dst); \
    186 	MOVL t, 12*4+off+192(dst); \
    187 	MOVL t, 10*4+off+256(dst); \
    188 	MOVL t, 1*4+off+320(dst);  \
    189 	MOVL t, 9*4+off+384(dst);  \
    190 	MOVL t, 14*4+off+448(dst); \
    191 	MOVL t, 0*4+off+512(dst);  \
    192 	MOVL t, 6*4+off+576(dst);  \
    193 	MOVL 7*4(src), t;          \
    194 	MOVL t, 7*4+off+0(dst);    \
    195 	MOVL t, 14*4+off+64(dst);  \
    196 	MOVL t, 10*4+off+128(dst); \
    197 	MOVL t, 0*4+off+192(dst);  \
    198 	MOVL t, 5*4+off+256(dst);  \
    199 	MOVL t, 9*4+off+320(dst);  \
    200 	MOVL t, 12*4+off+384(dst); \
    201 	MOVL t, 1*4+off+448(dst);  \
    202 	MOVL t, 13*4+off+512(dst); \
    203 	MOVL t, 2*4+off+576(dst);  \
    204 	MOVL 8*4(src), t;          \
    205 	MOVL t, 8*4+off+0(dst);    \
    206 	MOVL t, 5*4+off+64(dst);   \
    207 	MOVL t, 4*4+off+128(dst);  \
    208 	MOVL t, 15*4+off+192(dst); \
    209 	MOVL t, 14*4+off+256(dst); \
    210 	MOVL t, 3*4+off+320(dst);  \
    211 	MOVL t, 11*4+off+384(dst); \
    212 	MOVL t, 10*4+off+448(dst); \
    213 	MOVL t, 7*4+off+512(dst);  \
    214 	MOVL t, 1*4+off+576(dst);  \
    215 	MOVL 9*4(src), t;          \
    216 	MOVL t, 12*4+off+0(dst);   \
    217 	MOVL t, 2*4+off+64(dst);   \
    218 	MOVL t, 11*4+off+128(dst); \
    219 	MOVL t, 4*4+off+192(dst);  \
    220 	MOVL t, 0*4+off+256(dst);  \
    221 	MOVL t, 15*4+off+320(dst); \
    222 	MOVL t, 10*4+off+384(dst); \
    223 	MOVL t, 7*4+off+448(dst);  \
    224 	MOVL t, 5*4+off+512(dst);  \
    225 	MOVL t, 9*4+off+576(dst);  \
    226 	MOVL 10*4(src), t;         \
    227 	MOVL t, 9*4+off+0(dst);    \
    228 	MOVL t, 4*4+off+64(dst);   \
    229 	MOVL t, 8*4+off+128(dst);  \
    230 	MOVL t, 13*4+off+192(dst); \
    231 	MOVL t, 3*4+off+256(dst);  \
    232 	MOVL t, 5*4+off+320(dst);  \
    233 	MOVL t, 7*4+off+384(dst);  \
    234 	MOVL t, 15*4+off+448(dst); \
    235 	MOVL t, 11*4+off+512(dst); \
    236 	MOVL t, 0*4+off+576(dst);  \
    237 	MOVL 11*4(src), t;         \
    238 	MOVL t, 13*4+off+0(dst);   \
    239 	MOVL t, 10*4+off+64(dst);  \
    240 	MOVL t, 0*4+off+128(dst);  \
    241 	MOVL t, 3*4+off+192(dst);  \
    242 	MOVL t, 9*4+off+256(dst);  \
    243 	MOVL t, 6*4+off+320(dst);  \
    244 	MOVL t, 15*4+off+384(dst); \
    245 	MOVL t, 4*4+off+448(dst);  \
    246 	MOVL t, 2*4+off+512(dst);  \
    247 	MOVL t, 12*4+off+576(dst); \
    248 	MOVL 12*4(src), t;         \
    249 	MOVL t, 10*4+off+0(dst);   \
    250 	MOVL t, 12*4+off+64(dst);  \
    251 	MOVL t, 1*4+off+128(dst);  \
    252 	MOVL t, 6*4+off+192(dst);  \
    253 	MOVL t, 13*4+off+256(dst); \
    254 	MOVL t, 4*4+off+320(dst);  \
    255 	MOVL t, 0*4+off+384(dst);  \
    256 	MOVL t, 2*4+off+448(dst);  \
    257 	MOVL t, 8*4+off+512(dst);  \
    258 	MOVL t, 14*4+off+576(dst); \
    259 	MOVL 13*4(src), t;         \
    260 	MOVL t, 14*4+off+0(dst);   \
    261 	MOVL t, 3*4+off+64(dst);   \
    262 	MOVL t, 7*4+off+128(dst);  \
    263 	MOVL t, 2*4+off+192(dst);  \
    264 	MOVL t, 15*4+off+256(dst); \
    265 	MOVL t, 12*4+off+320(dst); \
    266 	MOVL t, 6*4+off+384(dst);  \
    267 	MOVL t, 0*4+off+448(dst);  \
    268 	MOVL t, 9*4+off+512(dst);  \
    269 	MOVL t, 11*4+off+576(dst); \
    270 	MOVL 14*4(src), t;         \
    271 	MOVL t, 11*4+off+0(dst);   \
    272 	MOVL t, 0*4+off+64(dst);   \
    273 	MOVL t, 12*4+off+128(dst); \
    274 	MOVL t, 7*4+off+192(dst);  \
    275 	MOVL t, 8*4+off+256(dst);  \
    276 	MOVL t, 14*4+off+320(dst); \
    277 	MOVL t, 2*4+off+384(dst);  \
    278 	MOVL t, 5*4+off+448(dst);  \
    279 	MOVL t, 1*4+off+512(dst);  \
    280 	MOVL t, 13*4+off+576(dst); \
    281 	MOVL 15*4(src), t;         \
    282 	MOVL t, 15*4+off+0(dst);   \
    283 	MOVL t, 6*4+off+64(dst);   \
    284 	MOVL t, 3*4+off+128(dst);  \
    285 	MOVL t, 11*4+off+192(dst); \
    286 	MOVL t, 7*4+off+256(dst);  \
    287 	MOVL t, 10*4+off+320(dst); \
    288 	MOVL t, 5*4+off+384(dst);  \
    289 	MOVL t, 9*4+off+448(dst);  \
    290 	MOVL t, 4*4+off+512(dst);  \
    291 	MOVL t, 8*4+off+576(dst)
    292 
    293 // func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
        //
        // hashBlocksSSE2 compresses blocks into the hash state h, 64 bytes per
        // loop iteration, using only SSE2 instructions. h and c are updated in
        // place; flag is XORed into the working state on every block.
        //
        // The loop ends only when the remaining length reaches exactly zero after
        // subtracting 64, so len(blocks) must be a non-zero multiple of 64.
        // NOTE(review): presumably guaranteed by the Go caller - confirm there.
        //
        // Register use:
        //   AX = h, BX = c, SI = current block, DX = bytes remaining,
        //   DI = 16-byte aligned scratch base, CX = 32-bit scratch,
        //   X0/X1 = h[0:4]/h[4:8], X2 = counter increment, X3 = scratch,
        //   X4..X7 = working rows v0..v3.
        //
        // Scratch layout relative to DI:
        //   0..15   {c[0], c[1], flag, 0}
        //   16..655 ten 64-byte message slabs written by PRECOMPUTE
    294 TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
    295 	MOVL h+0(FP), AX
    296 	MOVL c+4(FP), BX
    297 	MOVL flag+8(FP), CX
    298 	MOVL blocks_base+12(FP), SI
    299 	MOVL blocks_len+16(FP), DX
    300 
        	// DI = SP rounded up to a 16-byte boundary so the aligned MOVO form
        	// can be used on the scratch area; the extra 16 frame bytes pay for this.
    301 	MOVL SP, DI
    302 	ADDL $15, DI
    303 	ANDL $~15, DI
    304 
        	// Build the vector {c[0], c[1], flag, 0} at 0(DI).
    305 	MOVL CX, 8(DI)
    306 	MOVL 0(BX), CX
    307 	MOVL CX, 0(DI)
    308 	MOVL 4(BX), CX
    309 	MOVL CX, 4(DI)
    310 	XORL CX, CX
    311 	MOVL CX, 12(DI)
    312 
        	// Load the hash state and the per-block counter increment (64).
    313 	MOVOU 0(AX), X0
    314 	MOVOU 16(AX), X1
    315 	MOVOU counter<>(SB), X2
    316 
    317 loop:
        	// Working rows: v0, v1 = current h; v2, v3 = iv0, iv1.
    318 	MOVO  X0, X4
    319 	MOVO  X1, X5
    320 	MOVOU iv0<>(SB), X6
    321 	MOVOU iv1<>(SB), X7
    322 
        	// Advance the 64-bit counter by 64 (the flag half gets +0), XOR the
        	// {counter, flag, 0} vector into v3 and store the new counter back.
    323 	MOVO  0(DI), X3
    324 	PADDQ X2, X3
    325 	PXOR  X3, X7
    326 	MOVO  X3, 0(DI)
    327 
        	// Lay out the message words for all ten rounds at 16(DI), then run
        	// the rounds, each reading its own 64-byte slab. X3 is free to serve
        	// as the round scratch register because the counter was stored above.
    328 	PRECOMPUTE(DI, 16, SI, CX)
    329 	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
    330 	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
    331 	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
    332 	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
    333 	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
    334 	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
    335 	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
    336 	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
    337 	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
    338 	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
    339 
        	// Feed-forward: h[0:4] ^= v0 ^ v2, h[4:8] ^= v1 ^ v3.
    340 	PXOR X4, X0
    341 	PXOR X5, X1
    342 	PXOR X6, X0
    343 	PXOR X7, X1
    344 
        	// Next block; LEAL leaves the flags alone, and JNE tests the result
        	// of the SUBL that follows it.
    345 	LEAL 64(SI), SI
    346 	SUBL $64, DX
    347 	JNE  loop
    348 
        	// Write the updated counter back to *c.
    349 	MOVL 0(DI), CX
    350 	MOVL CX, 0(BX)
    351 	MOVL 4(DI), CX
    352 	MOVL CX, 4(BX)
    353 
        	// Write the updated hash state back to *h.
    354 	MOVOU X0, 0(AX)
    355 	MOVOU X1, 16(AX)
    356 
    357 	RET
    358 
    359 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
        //
        // hashBlocksSSSE3 compresses blocks into the hash state h, 64 bytes per
        // loop iteration. It differs from hashBlocksSSE2 only in using
        // ROUND_SSSE3, whose byte-aligned rotations are done with PSHUFB. h and c
        // are updated in place; flag is XORed into the working state on every
        // block.
        //
        // The loop ends only when the remaining length reaches exactly zero after
        // subtracting 64, so len(blocks) must be a non-zero multiple of 64.
        // NOTE(review): presumably guaranteed by the Go caller - confirm there.
        //
        // Register use:
        //   AX = h, BX = c, SI = current block, DX = bytes remaining,
        //   DI = 16-byte aligned scratch base, CX = 32-bit scratch,
        //   X2 = counter increment, X3 = scratch, X4..X7 = working rows v0..v3.
        //   X0/X1 hold h[0:4]/h[4:8] outside the rounds and the rol16/rol8
        //   shuffle masks during the rounds, so h is spilled to the stack
        //   meanwhile.
        //
        // Scratch layout relative to DI:
        //   0..15    {c[0], c[1], flag, 0}
        //   16..655  ten 64-byte message slabs written by PRECOMPUTE
        //   656..687 spilled h (X0 at 656, X1 at 672)
    360 TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
    361 	MOVL h+0(FP), AX
    362 	MOVL c+4(FP), BX
    363 	MOVL flag+8(FP), CX
    364 	MOVL blocks_base+12(FP), SI
    365 	MOVL blocks_len+16(FP), DX
    366 
        	// DI = SP rounded up to a 16-byte boundary so the aligned MOVO form
        	// can be used on the scratch area; the extra 16 frame bytes pay for this.
    367 	MOVL SP, DI
    368 	ADDL $15, DI
    369 	ANDL $~15, DI
    370 
        	// Build the vector {c[0], c[1], flag, 0} at 0(DI).
    371 	MOVL CX, 8(DI)
    372 	MOVL 0(BX), CX
    373 	MOVL CX, 0(DI)
    374 	MOVL 4(BX), CX
    375 	MOVL CX, 4(DI)
    376 	XORL CX, CX
    377 	MOVL CX, 12(DI)
    378 
        	// Load the hash state and the per-block counter increment (64).
    379 	MOVOU 0(AX), X0
    380 	MOVOU 16(AX), X1
    381 	MOVOU counter<>(SB), X2
    382 
    383 loop:
        	// Spill h, because X0/X1 are about to be reused for the shuffle masks.
    384 	MOVO  X0, 656(DI)
    385 	MOVO  X1, 672(DI)
        	// Working rows: v0, v1 = current h; v2, v3 = iv0, iv1.
    386 	MOVO  X0, X4
    387 	MOVO  X1, X5
    388 	MOVOU iv0<>(SB), X6
    389 	MOVOU iv1<>(SB), X7
    390 
        	// Advance the 64-bit counter by 64 (the flag half gets +0), XOR the
        	// {counter, flag, 0} vector into v3 and store the new counter back.
    391 	MOVO  0(DI), X3
    392 	PADDQ X2, X3
    393 	PXOR  X3, X7
    394 	MOVO  X3, 0(DI)
    395 
        	// X0/X1 now carry the PSHUFB masks passed to ROUND_SSSE3 as c16/c8.
    396 	MOVOU rol16<>(SB), X0
    397 	MOVOU rol8<>(SB), X1
    398 
        	// Lay out the message words for all ten rounds at 16(DI), then run
        	// the rounds, each reading its own 64-byte slab.
    399 	PRECOMPUTE(DI, 16, SI, CX)
    400 	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
    401 	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
    402 	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
    403 	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
    404 	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
    405 	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
    406 	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
    407 	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
    408 	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
    409 	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
    410 
        	// Reload the spilled h, then feed-forward:
        	// h[0:4] ^= v0 ^ v2, h[4:8] ^= v1 ^ v3.
    411 	MOVO 656(DI), X0
    412 	MOVO 672(DI), X1
    413 	PXOR X4, X0
    414 	PXOR X5, X1
    415 	PXOR X6, X0
    416 	PXOR X7, X1
    417 
        	// Next block; LEAL leaves the flags alone, and JNE tests the result
        	// of the SUBL that follows it.
    418 	LEAL 64(SI), SI
    419 	SUBL $64, DX
    420 	JNE  loop
    421 
        	// Write the updated counter back to *c.
    422 	MOVL 0(DI), CX
    423 	MOVL CX, 0(BX)
    424 	MOVL 4(DI), CX
    425 	MOVL CX, 4(BX)
    426 
        	// Write the updated hash state back to *h.
    427 	MOVOU X0, 0(AX)
    428 	MOVOU X1, 16(AX)
    429 
    430 	RET
    429 
    430 	RET