// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"

// PSHUFB shuffle masks: byte permutations implementing 64-bit lane
// rotate-right by 24 (c40) and by 16 (c48).
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

// Diagonalize the 4x4 state of 64-bit words held in two-lane XMM
// registers, so the next HALF_ROUND operates on the diagonals.
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

// Inverse of SHUFFLE: restore column order after the diagonal half-round.
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

// One BLAKE2b-style half round using the BlaMka G function:
// a = a + b + 2*trunc32(a)*trunc32(b), done pairwise on two columns
// per register pair. PMULULQ (SSE PMULUDQ) supplies the 32x32->64
// multiply; the rotations are PSHUFD (ror 32), PSHUFB c40 (ror 24),
// PSHUFB c48 (ror 16), and PADDQ/PSRLQ/PXOR (ror 63).
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFD  $0xB1, v6, v6; \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	PSHUFB  c40, v2;       \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFB  c48, v6;       \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	MOVO    v2, t0;        \
	PADDQ   v2, t0;        \
	PSRLQ   $63, v2;       \
	PXOR    t0, v2;        \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFD  $0xB1, v7, v7; \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	PSHUFB  c40, v3;       \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFB  c48, v7;       \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	MOVO    v3, t0;        \
	PADDQ   v3, t0;        \
	PSRLQ   $63, v3;       \
	PXOR    t0, v3

// Load/store 16 consecutive uint64s (row-major round over block rows).
#define LOAD_MSG_0(block, off) \
	MOVOU 8*(off+0)(block), X0;  \
	MOVOU 8*(off+2)(block), X1;  \
	MOVOU 8*(off+4)(block), X2;  \
	MOVOU 8*(off+6)(block), X3;  \
	MOVOU 8*(off+8)(block), X4;  \
	MOVOU 8*(off+10)(block), X5; \
	MOVOU 8*(off+12)(block), X6; \
	MOVOU 8*(off+14)(block), X7

#define STORE_MSG_0(block, off) \
	MOVOU X0, 8*(off+0)(block);  \
	MOVOU X1, 8*(off+2)(block);  \
	MOVOU X2, 8*(off+4)(block);  \
	MOVOU X3, 8*(off+6)(block);  \
	MOVOU X4, 8*(off+8)(block);  \
	MOVOU X5, 8*(off+10)(block); \
	MOVOU X6, 8*(off+12)(block); \
	MOVOU X7, 8*(off+14)(block)

// Load/store 16 uint64s strided 16 words apart (column-major round).
#define LOAD_MSG_1(block, off) \
	MOVOU 8*off+0*8(block), X0;   \
	MOVOU 8*off+16*8(block), X1;  \
	MOVOU 8*off+32*8(block), X2;  \
	MOVOU 8*off+48*8(block), X3;  \
	MOVOU 8*off+64*8(block), X4;  \
	MOVOU 8*off+80*8(block), X5;  \
	MOVOU 8*off+96*8(block), X6;  \
	MOVOU 8*off+112*8(block), X7

#define STORE_MSG_1(block, off) \
	MOVOU X0, 8*off+0*8(block);   \
	MOVOU X1, 8*off+16*8(block);  \
	MOVOU X2, 8*off+32*8(block);  \
	MOVOU X3, 8*off+48*8(block);  \
	MOVOU X4, 8*off+64*8(block);  \
	MOVOU X5, 8*off+80*8(block);  \
	MOVOU X6, 8*off+96*8(block);  \
	MOVOU X7, 8*off+112*8(block)

// Full BlaMka round over one 16-word row slice.
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
	LOAD_MSG_0(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_0(block, off)

// Full BlaMka round over one 16-word column slice.
#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
	LOAD_MSG_1(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_1(block, off)

// func blamkaSSE4(b *block)
// Applies the Argon2 permutation P to the 1 KiB block in place:
// 8 row rounds followed by 8 column rounds.
TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
	MOVQ b+0(FP), AX

	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET

// func mixBlocksSSE2(out, a, b, c *block)
// out = a XOR b XOR c, 16 bytes per iteration over the 128-word block.
TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX   // was a+24(FP): offset 24 is parameter c (vet asmdecl)
	MOVQ $128, BP       // 128 uint64 words remaining; 2 consumed per 16-byte step

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, BP
	JA    loop
	RET

// func xorBlocksSSE2(out, a, b, c *block)
// out ^= a XOR b XOR c (accumulates into out, unlike mixBlocksSSE2).
TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX   // was a+24(FP): offset 24 is parameter c (vet asmdecl)
	MOVQ $128, BP       // word countdown, as in mixBlocksSSE2

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, BP
	JA    loop
	RET