// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build 386 && gc && !purego
// +build 386,gc,!purego

// 386 assembly implementations of the BLAKE2s compression function
// (RFC 7693): one plain-SSE2 version and one SSSE3 version that uses
// PSHUFB byte shuffles for the byte-aligned rotations (16 and 24 bits).

#include "textflag.h"

// iv0, iv1: the eight 32-bit BLAKE2s IV words, split into two 16-byte
// vectors; they initialize the bottom half of the working vector v[8..15].
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16

DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16

// PSHUFB mask: rotate each 32-bit lane left by 16 bits.
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16

// PSHUFB mask: rotate each 32-bit lane left by 24 bits (right by 8).
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16

// counter: 128-bit constant 64, added (PADDQ) to the low 64-bit message
// counter once per 64-byte block.
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16

// ROTL_SSE2(n, t, v): rotate each 32-bit lane of v left by n bits
// using shift+shift+xor; t is a scratch register (clobbered).
#define ROTL_SSE2(n, t, v) \
	MOVO  v, t; \
	PSLLL $n, t; \
	PSRLL $(32-n), v; \
	PXOR  t, v

// ROTL_SSSE3(c, v): rotate each 32-bit lane of v by a whole number of
// bytes via PSHUFB; c must be the rol16<> or rol8<> mask.
#define ROTL_SSSE3(c, v) \
	PSHUFB c, v

// ROUND_SSE2(v0..v3, m0..m3, t): one full BLAKE2s round on the working
// vector held as four XMM registers (v0 = v[0..3], v1 = v[4..7],
// v2 = v[8..11], v3 = v[12..15]). m0..m3 are 16-byte memory operands
// holding the message words already permuted for this round (see
// PRECOMPUTE); t is scratch. The left-rotations 16/20/24/25 are the
// BLAKE2s right-rotations 16/12/8/7. The PSHUFL triplets diagonalize
// the state between the column and diagonal half-rounds and undo it after.
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
	PADDL  m0, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// ROUND_SSSE3: identical to ROUND_SSE2 except the byte-aligned rotations
// (by 16 and 24) are done with PSHUFB; c16/c8 are registers preloaded
// with the rol16<> and rol8<> masks.
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
	PADDL  m0, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSSE3(c16, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSSE3(c8, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSSE3(c16, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0; \
	PADDL  v1, v0; \
	PXOR   v0, v3; \
	ROTL_SSSE3(c8, v3); \
	PADDL  v3, v2; \
	PXOR   v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// PRECOMPUTE(dst, off, src, t): scatter the sixteen 32-bit message words
// of the current block at src into ten pre-permuted 64-byte copies at
// dst+off, dst+off+64, ..., dst+off+576 — one per round, laid out per
// the BLAKE2s sigma schedule — so each ROUND macro can consume its
// message words as four contiguous 16-byte memory operands. t is a
// scratch 32-bit register (clobbered).
#define PRECOMPUTE(dst, off, src, t) \
	MOVL 0*4(src), t; \
	MOVL t, 0*4+off+0(dst); \
	MOVL t, 9*4+off+64(dst); \
	MOVL t, 5*4+off+128(dst); \
	MOVL t, 14*4+off+192(dst); \
	MOVL t, 4*4+off+256(dst); \
	MOVL t, 2*4+off+320(dst); \
	MOVL t, 8*4+off+384(dst); \
	MOVL t, 12*4+off+448(dst); \
	MOVL t, 3*4+off+512(dst); \
	MOVL t, 15*4+off+576(dst); \
	MOVL 1*4(src), t; \
	MOVL t, 4*4+off+0(dst); \
	MOVL t, 8*4+off+64(dst); \
	MOVL t, 14*4+off+128(dst); \
	MOVL t, 5*4+off+192(dst); \
	MOVL t, 12*4+off+256(dst); \
	MOVL t, 11*4+off+320(dst); \
	MOVL t, 1*4+off+384(dst); \
	MOVL t, 6*4+off+448(dst); \
	MOVL t, 10*4+off+512(dst); \
	MOVL t, 3*4+off+576(dst); \
	MOVL 2*4(src), t; \
	MOVL t, 1*4+off+0(dst); \
	MOVL t, 13*4+off+64(dst); \
	MOVL t, 6*4+off+128(dst); \
	MOVL t, 8*4+off+192(dst); \
	MOVL t, 2*4+off+256(dst); \
	MOVL t, 0*4+off+320(dst); \
	MOVL t, 14*4+off+384(dst); \
	MOVL t, 11*4+off+448(dst); \
	MOVL t, 12*4+off+512(dst); \
	MOVL t, 4*4+off+576(dst); \
	MOVL 3*4(src), t; \
	MOVL t, 5*4+off+0(dst); \
	MOVL t, 15*4+off+64(dst); \
	MOVL t, 9*4+off+128(dst); \
	MOVL t, 1*4+off+192(dst); \
	MOVL t, 11*4+off+256(dst); \
	MOVL t, 7*4+off+320(dst); \
	MOVL t, 13*4+off+384(dst); \
	MOVL t, 3*4+off+448(dst); \
	MOVL t, 6*4+off+512(dst); \
	MOVL t, 10*4+off+576(dst); \
	MOVL 4*4(src), t; \
	MOVL t, 2*4+off+0(dst); \
	MOVL t, 1*4+off+64(dst); \
	MOVL t, 15*4+off+128(dst); \
	MOVL t, 10*4+off+192(dst); \
	MOVL t, 6*4+off+256(dst); \
	MOVL t, 8*4+off+320(dst); \
	MOVL t, 3*4+off+384(dst); \
	MOVL t, 13*4+off+448(dst); \
	MOVL t, 14*4+off+512(dst); \
	MOVL t, 5*4+off+576(dst); \
	MOVL 5*4(src), t; \
	MOVL t, 6*4+off+0(dst); \
	MOVL t, 11*4+off+64(dst); \
	MOVL t, 2*4+off+128(dst); \
	MOVL t, 9*4+off+192(dst); \
	MOVL t, 1*4+off+256(dst); \
	MOVL t, 13*4+off+320(dst); \
	MOVL t, 4*4+off+384(dst); \
	MOVL t, 8*4+off+448(dst); \
	MOVL t, 15*4+off+512(dst); \
	MOVL t, 7*4+off+576(dst); \
	MOVL 6*4(src), t; \
	MOVL t, 3*4+off+0(dst); \
	MOVL t, 7*4+off+64(dst); \
	MOVL t, 13*4+off+128(dst); \
	MOVL t, 12*4+off+192(dst); \
	MOVL t, 10*4+off+256(dst); \
	MOVL t, 1*4+off+320(dst); \
	MOVL t, 9*4+off+384(dst); \
	MOVL t, 14*4+off+448(dst); \
	MOVL t, 0*4+off+512(dst); \
	MOVL t, 6*4+off+576(dst); \
	MOVL 7*4(src), t; \
	MOVL t, 7*4+off+0(dst); \
	MOVL t, 14*4+off+64(dst); \
	MOVL t, 10*4+off+128(dst); \
	MOVL t, 0*4+off+192(dst); \
	MOVL t, 5*4+off+256(dst); \
	MOVL t, 9*4+off+320(dst); \
	MOVL t, 12*4+off+384(dst); \
	MOVL t, 1*4+off+448(dst); \
	MOVL t, 13*4+off+512(dst); \
	MOVL t, 2*4+off+576(dst); \
	MOVL 8*4(src), t; \
	MOVL t, 8*4+off+0(dst); \
	MOVL t, 5*4+off+64(dst); \
	MOVL t, 4*4+off+128(dst); \
	MOVL t, 15*4+off+192(dst); \
	MOVL t, 14*4+off+256(dst); \
	MOVL t, 3*4+off+320(dst); \
	MOVL t, 11*4+off+384(dst); \
	MOVL t, 10*4+off+448(dst); \
	MOVL t, 7*4+off+512(dst); \
	MOVL t, 1*4+off+576(dst); \
	MOVL 9*4(src), t; \
	MOVL t, 12*4+off+0(dst); \
	MOVL t, 2*4+off+64(dst); \
	MOVL t, 11*4+off+128(dst); \
	MOVL t, 4*4+off+192(dst); \
	MOVL t, 0*4+off+256(dst); \
	MOVL t, 15*4+off+320(dst); \
	MOVL t, 10*4+off+384(dst); \
	MOVL t, 7*4+off+448(dst); \
	MOVL t, 5*4+off+512(dst); \
	MOVL t, 9*4+off+576(dst); \
	MOVL 10*4(src), t; \
	MOVL t, 9*4+off+0(dst); \
	MOVL t, 4*4+off+64(dst); \
	MOVL t, 8*4+off+128(dst); \
	MOVL t, 13*4+off+192(dst); \
	MOVL t, 3*4+off+256(dst); \
	MOVL t, 5*4+off+320(dst); \
	MOVL t, 7*4+off+384(dst); \
	MOVL t, 15*4+off+448(dst); \
	MOVL t, 11*4+off+512(dst); \
	MOVL t, 0*4+off+576(dst); \
	MOVL 11*4(src), t; \
	MOVL t, 13*4+off+0(dst); \
	MOVL t, 10*4+off+64(dst); \
	MOVL t, 0*4+off+128(dst); \
	MOVL t, 3*4+off+192(dst); \
	MOVL t, 9*4+off+256(dst); \
	MOVL t, 6*4+off+320(dst); \
	MOVL t, 15*4+off+384(dst); \
	MOVL t, 4*4+off+448(dst); \
	MOVL t, 2*4+off+512(dst); \
	MOVL t, 12*4+off+576(dst); \
	MOVL 12*4(src), t; \
	MOVL t, 10*4+off+0(dst); \
	MOVL t, 12*4+off+64(dst); \
	MOVL t, 1*4+off+128(dst); \
	MOVL t, 6*4+off+192(dst); \
	MOVL t, 13*4+off+256(dst); \
	MOVL t, 4*4+off+320(dst); \
	MOVL t, 0*4+off+384(dst); \
	MOVL t, 2*4+off+448(dst); \
	MOVL t, 8*4+off+512(dst); \
	MOVL t, 14*4+off+576(dst); \
	MOVL 13*4(src), t; \
	MOVL t, 14*4+off+0(dst); \
	MOVL t, 3*4+off+64(dst); \
	MOVL t, 7*4+off+128(dst); \
	MOVL t, 2*4+off+192(dst); \
	MOVL t, 15*4+off+256(dst); \
	MOVL t, 12*4+off+320(dst); \
	MOVL t, 6*4+off+384(dst); \
	MOVL t, 0*4+off+448(dst); \
	MOVL t, 9*4+off+512(dst); \
	MOVL t, 11*4+off+576(dst); \
	MOVL 14*4(src), t; \
	MOVL t, 11*4+off+0(dst); \
	MOVL t, 0*4+off+64(dst); \
	MOVL t, 12*4+off+128(dst); \
	MOVL t, 7*4+off+192(dst); \
	MOVL t, 8*4+off+256(dst); \
	MOVL t, 14*4+off+320(dst); \
	MOVL t, 2*4+off+384(dst); \
	MOVL t, 5*4+off+448(dst); \
	MOVL t, 1*4+off+512(dst); \
	MOVL t, 13*4+off+576(dst); \
	MOVL 15*4(src), t; \
	MOVL t, 15*4+off+0(dst); \
	MOVL t, 6*4+off+64(dst); \
	MOVL t, 3*4+off+128(dst); \
	MOVL t, 11*4+off+192(dst); \
	MOVL t, 7*4+off+256(dst); \
	MOVL t, 10*4+off+320(dst); \
	MOVL t, 5*4+off+384(dst); \
	MOVL t, 9*4+off+448(dst); \
	MOVL t, 4*4+off+512(dst); \
	MOVL t, 8*4+off+576(dst)

// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Compresses len(blocks)/64 message blocks into the hash state h,
// updating the 64-bit byte counter c. Register roles:
//   AX = h, BX = c, SI = current block, DX = bytes remaining,
//   DI = 16-byte-aligned pointer into the stack frame, CX = scratch.
// Frame layout (relative to DI): 0..15 the {counter, flag, 0} vector,
// 16..655 the ten precomputed message permutations.
TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// Round SP up to a 16-byte boundary so MOVO accesses are aligned.
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI

	// Build the 16-byte vector {c[0], c[1], flag, 0} at 0(DI); it is
	// XORed into iv1 each iteration to form v[12..15].
	MOVL CX, 8(DI)
	MOVL 0(BX), CX
	MOVL CX, 0(DI)
	MOVL 4(BX), CX
	MOVL CX, 4(DI)
	XORL CX, CX
	MOVL CX, 12(DI)

	// X0/X1 = hash state h[0..3]/h[4..7]; X2 = constant 64 (block size).
	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
	// Initialize the working vector: v[0..7] = h, v[8..11] = IV[0..3],
	// v[12..15] = IV[4..7] ^ {counter, flag, 0}.
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// Advance the 64-bit byte counter by 64 before compressing, and
	// store it back so the next iteration (and the caller) see it.
	MOVO  0(DI), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(DI)

	// Ten BLAKE2s rounds over the pre-permuted message copies.
	PRECOMPUTE(DI, 16, SI, CX)
	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)

	// Finalize: h ^= v[0..7] ^ v[8..15].
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	// Next 64-byte block; loop while bytes remain.
	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write the updated counter and hash state back to the caller.
	MOVL 0(DI), CX
	MOVL CX, 0(BX)
	MOVL 4(DI), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	RET

// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Same as hashBlocksSSE2, but the rotations by 16 and 24 bits use PSHUFB.
// X0/X1 are needed to hold the rol16/rol8 shuffle masks during the rounds,
// so the hash state is spilled to 656(DI)/672(DI) (hence the larger frame).
TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// Round SP up to a 16-byte boundary so MOVO accesses are aligned.
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI

	// Build the 16-byte vector {c[0], c[1], flag, 0} at 0(DI).
	MOVL CX, 8(DI)
	MOVL 0(BX), CX
	MOVL CX, 0(DI)
	MOVL 4(BX), CX
	MOVL CX, 4(DI)
	XORL CX, CX
	MOVL CX, 12(DI)

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
	// Spill the hash state so X0/X1 can hold the shuffle masks.
	MOVO  X0, 656(DI)
	MOVO  X1, 672(DI)
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// Counter += 64, then v[12..15] = IV[4..7] ^ {counter, flag, 0}.
	MOVO  0(DI), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(DI)

	// Load the PSHUFB rotation masks for the rounds.
	MOVOU rol16<>(SB), X0
	MOVOU rol8<>(SB), X1

	PRECOMPUTE(DI, 16, SI, CX)
	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)

	// Reload the spilled state and finalize: h ^= v[0..7] ^ v[8..15].
	MOVO 656(DI), X0
	MOVO 672(DI), X1
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	// Next 64-byte block; loop while bytes remain.
	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write the updated counter and hash state back to the caller.
	MOVL 0(DI), CX
	MOVL CX, 0(BX)
	MOVL 4(DI), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	RET