gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

blamka_amd64.s (6419B)


      1 // Copyright 2017 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:build amd64 && gc && !purego
      6 // +build amd64,gc,!purego
      7 
      8 #include "textflag.h"
      9 
     10 DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
     11 DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
     12 GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
     13 
     14 DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
     15 DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
     16 GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
     17 
     18 #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
     19 	MOVO       v4, t1; \
     20 	MOVO       v5, v4; \
     21 	MOVO       t1, v5; \
     22 	MOVO       v6, t1; \
     23 	PUNPCKLQDQ v6, t2; \
     24 	PUNPCKHQDQ v7, v6; \
     25 	PUNPCKHQDQ t2, v6; \
     26 	PUNPCKLQDQ v7, t2; \
     27 	MOVO       t1, v7; \
     28 	MOVO       v2, t1; \
     29 	PUNPCKHQDQ t2, v7; \
     30 	PUNPCKLQDQ v3, t2; \
     31 	PUNPCKHQDQ t2, v2; \
     32 	PUNPCKLQDQ t1, t2; \
     33 	PUNPCKHQDQ t2, v3
     34 
     35 #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
     36 	MOVO       v4, t1; \
     37 	MOVO       v5, v4; \
     38 	MOVO       t1, v5; \
     39 	MOVO       v2, t1; \
     40 	PUNPCKLQDQ v2, t2; \
     41 	PUNPCKHQDQ v3, v2; \
     42 	PUNPCKHQDQ t2, v2; \
     43 	PUNPCKLQDQ v3, t2; \
     44 	MOVO       t1, v3; \
     45 	MOVO       v6, t1; \
     46 	PUNPCKHQDQ t2, v3; \
     47 	PUNPCKLQDQ v7, t2; \
     48 	PUNPCKHQDQ t2, v6; \
     49 	PUNPCKLQDQ t1, t2; \
     50 	PUNPCKHQDQ t2, v7
     51 
     52 #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
     53 	MOVO    v0, t0;        \
     54 	PMULULQ v2, t0;        \
     55 	PADDQ   v2, v0;        \
     56 	PADDQ   t0, v0;        \
     57 	PADDQ   t0, v0;        \
     58 	PXOR    v0, v6;        \
     59 	PSHUFD  $0xB1, v6, v6; \
     60 	MOVO    v4, t0;        \
     61 	PMULULQ v6, t0;        \
     62 	PADDQ   v6, v4;        \
     63 	PADDQ   t0, v4;        \
     64 	PADDQ   t0, v4;        \
     65 	PXOR    v4, v2;        \
     66 	PSHUFB  c40, v2;       \
     67 	MOVO    v0, t0;        \
     68 	PMULULQ v2, t0;        \
     69 	PADDQ   v2, v0;        \
     70 	PADDQ   t0, v0;        \
     71 	PADDQ   t0, v0;        \
     72 	PXOR    v0, v6;        \
     73 	PSHUFB  c48, v6;       \
     74 	MOVO    v4, t0;        \
     75 	PMULULQ v6, t0;        \
     76 	PADDQ   v6, v4;        \
     77 	PADDQ   t0, v4;        \
     78 	PADDQ   t0, v4;        \
     79 	PXOR    v4, v2;        \
     80 	MOVO    v2, t0;        \
     81 	PADDQ   v2, t0;        \
     82 	PSRLQ   $63, v2;       \
     83 	PXOR    t0, v2;        \
     84 	MOVO    v1, t0;        \
     85 	PMULULQ v3, t0;        \
     86 	PADDQ   v3, v1;        \
     87 	PADDQ   t0, v1;        \
     88 	PADDQ   t0, v1;        \
     89 	PXOR    v1, v7;        \
     90 	PSHUFD  $0xB1, v7, v7; \
     91 	MOVO    v5, t0;        \
     92 	PMULULQ v7, t0;        \
     93 	PADDQ   v7, v5;        \
     94 	PADDQ   t0, v5;        \
     95 	PADDQ   t0, v5;        \
     96 	PXOR    v5, v3;        \
     97 	PSHUFB  c40, v3;       \
     98 	MOVO    v1, t0;        \
     99 	PMULULQ v3, t0;        \
    100 	PADDQ   v3, v1;        \
    101 	PADDQ   t0, v1;        \
    102 	PADDQ   t0, v1;        \
    103 	PXOR    v1, v7;        \
    104 	PSHUFB  c48, v7;       \
    105 	MOVO    v5, t0;        \
    106 	PMULULQ v7, t0;        \
    107 	PADDQ   v7, v5;        \
    108 	PADDQ   t0, v5;        \
    109 	PADDQ   t0, v5;        \
    110 	PXOR    v5, v3;        \
    111 	MOVO    v3, t0;        \
    112 	PADDQ   v3, t0;        \
    113 	PSRLQ   $63, v3;       \
    114 	PXOR    t0, v3
    115 
    116 #define LOAD_MSG_0(block, off) \
    117 	MOVOU 8*(off+0)(block), X0;  \
    118 	MOVOU 8*(off+2)(block), X1;  \
    119 	MOVOU 8*(off+4)(block), X2;  \
    120 	MOVOU 8*(off+6)(block), X3;  \
    121 	MOVOU 8*(off+8)(block), X4;  \
    122 	MOVOU 8*(off+10)(block), X5; \
    123 	MOVOU 8*(off+12)(block), X6; \
    124 	MOVOU 8*(off+14)(block), X7
    125 
    126 #define STORE_MSG_0(block, off) \
    127 	MOVOU X0, 8*(off+0)(block);  \
    128 	MOVOU X1, 8*(off+2)(block);  \
    129 	MOVOU X2, 8*(off+4)(block);  \
    130 	MOVOU X3, 8*(off+6)(block);  \
    131 	MOVOU X4, 8*(off+8)(block);  \
    132 	MOVOU X5, 8*(off+10)(block); \
    133 	MOVOU X6, 8*(off+12)(block); \
    134 	MOVOU X7, 8*(off+14)(block)
    135 
    136 #define LOAD_MSG_1(block, off) \
    137 	MOVOU 8*off+0*8(block), X0;  \
    138 	MOVOU 8*off+16*8(block), X1; \
    139 	MOVOU 8*off+32*8(block), X2; \
    140 	MOVOU 8*off+48*8(block), X3; \
    141 	MOVOU 8*off+64*8(block), X4; \
    142 	MOVOU 8*off+80*8(block), X5; \
    143 	MOVOU 8*off+96*8(block), X6; \
    144 	MOVOU 8*off+112*8(block), X7
    145 
    146 #define STORE_MSG_1(block, off) \
    147 	MOVOU X0, 8*off+0*8(block);  \
    148 	MOVOU X1, 8*off+16*8(block); \
    149 	MOVOU X2, 8*off+32*8(block); \
    150 	MOVOU X3, 8*off+48*8(block); \
    151 	MOVOU X4, 8*off+64*8(block); \
    152 	MOVOU X5, 8*off+80*8(block); \
    153 	MOVOU X6, 8*off+96*8(block); \
    154 	MOVOU X7, 8*off+112*8(block)
    155 
    156 #define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
    157 	LOAD_MSG_0(block, off);                                   \
    158 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
    159 	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
    160 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
    161 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
    162 	STORE_MSG_0(block, off)
    163 
    164 #define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
    165 	LOAD_MSG_1(block, off);                                   \
    166 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
    167 	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
    168 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
    169 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
    170 	STORE_MSG_1(block, off)
    171 
    172 // func blamkaSSE4(b *block)
    173 TEXT ·blamkaSSE4(SB), 4, $0-8
    174 	MOVQ b+0(FP), AX
    175 
    176 	MOVOU ·c40<>(SB), X10
    177 	MOVOU ·c48<>(SB), X11
    178 
    179 	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
    180 	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
    181 	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
    182 	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
    183 	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
    184 	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
    185 	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
    186 	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
    187 
    188 	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
    189 	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
    190 	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
    191 	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
    192 	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
    193 	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
    194 	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
    195 	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
    196 	RET
    197 
    198 // func mixBlocksSSE2(out, a, b, c *block)
    199 TEXT ·mixBlocksSSE2(SB), 4, $0-32
    200 	MOVQ out+0(FP), DX
    201 	MOVQ a+8(FP), AX
    202 	MOVQ b+16(FP), BX
    203 	MOVQ a+24(FP), CX
    204 	MOVQ $128, BP
    205 
    206 loop:
    207 	MOVOU 0(AX), X0
    208 	MOVOU 0(BX), X1
    209 	MOVOU 0(CX), X2
    210 	PXOR  X1, X0
    211 	PXOR  X2, X0
    212 	MOVOU X0, 0(DX)
    213 	ADDQ  $16, AX
    214 	ADDQ  $16, BX
    215 	ADDQ  $16, CX
    216 	ADDQ  $16, DX
    217 	SUBQ  $2, BP
    218 	JA    loop
    219 	RET
    220 
    221 // func xorBlocksSSE2(out, a, b, c *block)
    222 TEXT ·xorBlocksSSE2(SB), 4, $0-32
    223 	MOVQ out+0(FP), DX
    224 	MOVQ a+8(FP), AX
    225 	MOVQ b+16(FP), BX
    226 	MOVQ a+24(FP), CX
    227 	MOVQ $128, BP
    228 
    229 loop:
    230 	MOVOU 0(AX), X0
    231 	MOVOU 0(BX), X1
    232 	MOVOU 0(CX), X2
    233 	MOVOU 0(DX), X3
    234 	PXOR  X1, X0
    235 	PXOR  X2, X0
    236 	PXOR  X3, X0
    237 	MOVOU X0, 0(DX)
    238 	ADDQ  $16, AX
    239 	ADDQ  $16, BX
    240 	ADDQ  $16, CX
    241 	ADDQ  $16, DX
    242 	SUBQ  $2, BP
    243 	JA    loop
    244 	RET