chacha_ppc64le.s (9337B)
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembly
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this implementation and the original are
// due to the calling conventions and initialization of constants.

//go:build gc && !purego
// +build gc,!purego

#include "textflag.h"

#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TMP R15

#define CONSTBASE R16
#define BLOCKS R17

DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BC 16, LT, loop_vsx

	VADDUWM V12, V26, V12

	WORD $0x13600F8C // VMRGEW V0, V1, V27
	WORD $0x13821F8C // VMRGEW V2, V3, V28

	WORD $0x10000E8C // VMRGOW V0, V1, V0
	WORD $0x10421E8C // VMRGOW V2, V3, V2

	WORD $0x13A42F8C // VMRGEW V4, V5, V29
	WORD $0x13C63F8C // VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C // VMRGOW V4, V5, V4
	WORD $0x10C63E8C // VMRGOW V6, V7, V6

	WORD $0x13684F8C // VMRGEW V8, V9, V27
	WORD $0x138A5F8C // VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C // VMRGOW V8, V9, V8
	WORD $0x114A5E8C // VMRGOW V10, V11, V10

	WORD $0x13AC6F8C // VMRGEW V12, V13, V29
	WORD $0x13CE7F8C // VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C // VMRGOW V12, V13, V12
	WORD $0x11CE7E8C // VMRGOW V14, V15, V14

	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	MOVD (CNT), R14
	ADD BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx

	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx
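
// The Go prototype above the TEXT directive is the only interface this file
// exports. A minimal sketch of how the routine might be declared and called
// from the Go side is shown below; only the chaCha20_ctr32_vsx symbol and its
// signature come from this file, while the package name and the wrapper
// xorKeyStreamVSX are illustrative assumptions.
//
//	package chacha20
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
//	// xorKeyStreamVSX is a hypothetical caller. It assumes len(dst) >= len(src).
//	// The assembly advances *counter by len(src)/64, i.e. by the number of
//	// full 64-byte blocks it processed (see done_vsx above).
//	func xorKeyStreamVSX(dst, src []byte, key *[8]uint32, counter *uint32) {
//		if len(src) == 0 {
//			return
//		}
//		chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), key, counter)
//	}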