gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

chacha_ppc64le.s (9337B)


      1 // Copyright 2019 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Based on CRYPTOGAMS code with the following comment:
      6 // # ====================================================================
      7 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
      8 // # project. The module is, however, dual licensed under OpenSSL and
      9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
     10 // # details see http://www.openssl.org/~appro/cryptogams/.
     11 // # ====================================================================
     12 
     13 // Code for the perl script that generates the ppc64 assembler
     14 // can be found in the cryptogams repository at the link below. It is based on
     15 // the original from openssl.
     16 
     17 // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
     18 
     19 // The differences in this and the original implementation are
     20 // due to the calling conventions and initialization of constants.
     21 
     22 //go:build gc && !purego
     23 // +build gc,!purego
     24 
     25 #include "textflag.h"
     26 
     27 #define OUT  R3
     28 #define INP  R4
     29 #define LEN  R5
     30 #define KEY  R6
     31 #define CNT  R7
     32 #define TMP  R15
     33 
     34 #define CONSTBASE  R16
     35 #define BLOCKS R17
     36 
     37 DATA consts<>+0x00(SB)/8, $0x3320646e61707865
     38 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
     39 DATA consts<>+0x10(SB)/8, $0x0000000000000001
     40 DATA consts<>+0x18(SB)/8, $0x0000000000000000
     41 DATA consts<>+0x20(SB)/8, $0x0000000000000004
     42 DATA consts<>+0x28(SB)/8, $0x0000000000000000
     43 DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
     44 DATA consts<>+0x38(SB)/8, $0x0203000106070405
     45 DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
     46 DATA consts<>+0x48(SB)/8, $0x0102030005060704
     47 DATA consts<>+0x50(SB)/8, $0x6170786561707865
     48 DATA consts<>+0x58(SB)/8, $0x6170786561707865
     49 DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
     50 DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
     51 DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
     52 DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
     53 DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
     54 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
     55 DATA consts<>+0x90(SB)/8, $0x0000000100000000
     56 DATA consts<>+0x98(SB)/8, $0x0000000300000002
     57 GLOBL consts<>(SB), RODATA, $0xa0
     58 
     59 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
     60 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
     61 	MOVD out+0(FP), OUT
     62 	MOVD inp+8(FP), INP
     63 	MOVD len+16(FP), LEN
     64 	MOVD key+24(FP), KEY
     65 	MOVD counter+32(FP), CNT
     66 
     67 	// Addressing for constants
     68 	MOVD $consts<>+0x00(SB), CONSTBASE
     69 	MOVD $16, R8
     70 	MOVD $32, R9
     71 	MOVD $48, R10
     72 	MOVD $64, R11
     73 	SRD $6, LEN, BLOCKS
     74 	// V16
     75 	LXVW4X (CONSTBASE)(R0), VS48
     76 	ADD $80,CONSTBASE
     77 
     78 	// Load key into V17,V18
     79 	LXVW4X (KEY)(R0), VS49
     80 	LXVW4X (KEY)(R8), VS50
     81 
     82 	// Load CNT, NONCE into V19
     83 	LXVW4X (CNT)(R0), VS51
     84 
     85 	// Clear V27
     86 	VXOR V27, V27, V27
     87 
     88 	// V28
     89 	LXVW4X (CONSTBASE)(R11), VS60
     90 
     91 	// splat slot from V19 -> V26
     92 	VSPLTW $0, V19, V26
     93 
     94 	VSLDOI $4, V19, V27, V19
     95 	VSLDOI $12, V27, V19, V19
     96 
     97 	VADDUWM V26, V28, V26
     98 
     99 	MOVD $10, R14
    100 	MOVD R14, CTR
    101 
    102 loop_outer_vsx:
    103 	// V0, V1, V2, V3
    104 	LXVW4X (R0)(CONSTBASE), VS32
    105 	LXVW4X (R8)(CONSTBASE), VS33
    106 	LXVW4X (R9)(CONSTBASE), VS34
    107 	LXVW4X (R10)(CONSTBASE), VS35
    108 
    109 	// splat values from V17, V18 into V4-V11
    110 	VSPLTW $0, V17, V4
    111 	VSPLTW $1, V17, V5
    112 	VSPLTW $2, V17, V6
    113 	VSPLTW $3, V17, V7
    114 	VSPLTW $0, V18, V8
    115 	VSPLTW $1, V18, V9
    116 	VSPLTW $2, V18, V10
    117 	VSPLTW $3, V18, V11
    118 
    119 	// VOR
    120 	VOR V26, V26, V12
    121 
    122 	// splat values from V19 -> V13, V14, V15
    123 	VSPLTW $1, V19, V13
    124 	VSPLTW $2, V19, V14
    125 	VSPLTW $3, V19, V15
    126 
    127 	// splat   const values
    128 	VSPLTISW $-16, V27
    129 	VSPLTISW $12, V28
    130 	VSPLTISW $8, V29
    131 	VSPLTISW $7, V30
    132 
    133 loop_vsx:
    134 	VADDUWM V0, V4, V0
    135 	VADDUWM V1, V5, V1
    136 	VADDUWM V2, V6, V2
    137 	VADDUWM V3, V7, V3
    138 
    139 	VXOR V12, V0, V12
    140 	VXOR V13, V1, V13
    141 	VXOR V14, V2, V14
    142 	VXOR V15, V3, V15
    143 
    144 	VRLW V12, V27, V12
    145 	VRLW V13, V27, V13
    146 	VRLW V14, V27, V14
    147 	VRLW V15, V27, V15
    148 
    149 	VADDUWM V8, V12, V8
    150 	VADDUWM V9, V13, V9
    151 	VADDUWM V10, V14, V10
    152 	VADDUWM V11, V15, V11
    153 
    154 	VXOR V4, V8, V4
    155 	VXOR V5, V9, V5
    156 	VXOR V6, V10, V6
    157 	VXOR V7, V11, V7
    158 
    159 	VRLW V4, V28, V4
    160 	VRLW V5, V28, V5
    161 	VRLW V6, V28, V6
    162 	VRLW V7, V28, V7
    163 
    164 	VADDUWM V0, V4, V0
    165 	VADDUWM V1, V5, V1
    166 	VADDUWM V2, V6, V2
    167 	VADDUWM V3, V7, V3
    168 
    169 	VXOR V12, V0, V12
    170 	VXOR V13, V1, V13
    171 	VXOR V14, V2, V14
    172 	VXOR V15, V3, V15
    173 
    174 	VRLW V12, V29, V12
    175 	VRLW V13, V29, V13
    176 	VRLW V14, V29, V14
    177 	VRLW V15, V29, V15
    178 
    179 	VADDUWM V8, V12, V8
    180 	VADDUWM V9, V13, V9
    181 	VADDUWM V10, V14, V10
    182 	VADDUWM V11, V15, V11
    183 
    184 	VXOR V4, V8, V4
    185 	VXOR V5, V9, V5
    186 	VXOR V6, V10, V6
    187 	VXOR V7, V11, V7
    188 
    189 	VRLW V4, V30, V4
    190 	VRLW V5, V30, V5
    191 	VRLW V6, V30, V6
    192 	VRLW V7, V30, V7
    193 
    194 	VADDUWM V0, V5, V0
    195 	VADDUWM V1, V6, V1
    196 	VADDUWM V2, V7, V2
    197 	VADDUWM V3, V4, V3
    198 
    199 	VXOR V15, V0, V15
    200 	VXOR V12, V1, V12
    201 	VXOR V13, V2, V13
    202 	VXOR V14, V3, V14
    203 
    204 	VRLW V15, V27, V15
    205 	VRLW V12, V27, V12
    206 	VRLW V13, V27, V13
    207 	VRLW V14, V27, V14
    208 
    209 	VADDUWM V10, V15, V10
    210 	VADDUWM V11, V12, V11
    211 	VADDUWM V8, V13, V8
    212 	VADDUWM V9, V14, V9
    213 
    214 	VXOR V5, V10, V5
    215 	VXOR V6, V11, V6
    216 	VXOR V7, V8, V7
    217 	VXOR V4, V9, V4
    218 
    219 	VRLW V5, V28, V5
    220 	VRLW V6, V28, V6
    221 	VRLW V7, V28, V7
    222 	VRLW V4, V28, V4
    223 
    224 	VADDUWM V0, V5, V0
    225 	VADDUWM V1, V6, V1
    226 	VADDUWM V2, V7, V2
    227 	VADDUWM V3, V4, V3
    228 
    229 	VXOR V15, V0, V15
    230 	VXOR V12, V1, V12
    231 	VXOR V13, V2, V13
    232 	VXOR V14, V3, V14
    233 
    234 	VRLW V15, V29, V15
    235 	VRLW V12, V29, V12
    236 	VRLW V13, V29, V13
    237 	VRLW V14, V29, V14
    238 
    239 	VADDUWM V10, V15, V10
    240 	VADDUWM V11, V12, V11
    241 	VADDUWM V8, V13, V8
    242 	VADDUWM V9, V14, V9
    243 
    244 	VXOR V5, V10, V5
    245 	VXOR V6, V11, V6
    246 	VXOR V7, V8, V7
    247 	VXOR V4, V9, V4
    248 
    249 	VRLW V5, V30, V5
    250 	VRLW V6, V30, V6
    251 	VRLW V7, V30, V7
    252 	VRLW V4, V30, V4
    253 	BC   16, LT, loop_vsx
    254 
    255 	VADDUWM V12, V26, V12
    256 
    257 	WORD $0x13600F8C		// VMRGEW V0, V1, V27
    258 	WORD $0x13821F8C		// VMRGEW V2, V3, V28
    259 
    260 	WORD $0x10000E8C		// VMRGOW V0, V1, V0
    261 	WORD $0x10421E8C		// VMRGOW V2, V3, V2
    262 
    263 	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
    264 	WORD $0x13C63F8C		// VMRGEW V6, V7, V30
    265 
    266 	XXPERMDI VS32, VS34, $0, VS33
    267 	XXPERMDI VS32, VS34, $3, VS35
    268 	XXPERMDI VS59, VS60, $0, VS32
    269 	XXPERMDI VS59, VS60, $3, VS34
    270 
    271 	WORD $0x10842E8C		// VMRGOW V4, V5, V4
    272 	WORD $0x10C63E8C		// VMRGOW V6, V7, V6
    273 
    274 	WORD $0x13684F8C		// VMRGEW V8, V9, V27
    275 	WORD $0x138A5F8C		// VMRGEW V10, V11, V28
    276 
    277 	XXPERMDI VS36, VS38, $0, VS37
    278 	XXPERMDI VS36, VS38, $3, VS39
    279 	XXPERMDI VS61, VS62, $0, VS36
    280 	XXPERMDI VS61, VS62, $3, VS38
    281 
    282 	WORD $0x11084E8C		// VMRGOW V8, V9, V8
    283 	WORD $0x114A5E8C		// VMRGOW V10, V11, V10
    284 
    285 	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
    286 	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30
    287 
    288 	XXPERMDI VS40, VS42, $0, VS41
    289 	XXPERMDI VS40, VS42, $3, VS43
    290 	XXPERMDI VS59, VS60, $0, VS40
    291 	XXPERMDI VS59, VS60, $3, VS42
    292 
    293 	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
    294 	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14
    295 
    296 	VSPLTISW $4, V27
    297 	VADDUWM V26, V27, V26
    298 
    299 	XXPERMDI VS44, VS46, $0, VS45
    300 	XXPERMDI VS44, VS46, $3, VS47
    301 	XXPERMDI VS61, VS62, $0, VS44
    302 	XXPERMDI VS61, VS62, $3, VS46
    303 
    304 	VADDUWM V0, V16, V0
    305 	VADDUWM V4, V17, V4
    306 	VADDUWM V8, V18, V8
    307 	VADDUWM V12, V19, V12
    308 
    309 	CMPU LEN, $64
    310 	BLT tail_vsx
    311 
    312 	// Bottom of loop
    313 	LXVW4X (INP)(R0), VS59
    314 	LXVW4X (INP)(R8), VS60
    315 	LXVW4X (INP)(R9), VS61
    316 	LXVW4X (INP)(R10), VS62
    317 
    318 	VXOR V27, V0, V27
    319 	VXOR V28, V4, V28
    320 	VXOR V29, V8, V29
    321 	VXOR V30, V12, V30
    322 
    323 	STXVW4X VS59, (OUT)(R0)
    324 	STXVW4X VS60, (OUT)(R8)
    325 	ADD     $64, INP
    326 	STXVW4X VS61, (OUT)(R9)
    327 	ADD     $-64, LEN
    328 	STXVW4X VS62, (OUT)(R10)
    329 	ADD     $64, OUT
    330 	BEQ     done_vsx
    331 
    332 	VADDUWM V1, V16, V0
    333 	VADDUWM V5, V17, V4
    334 	VADDUWM V9, V18, V8
    335 	VADDUWM V13, V19, V12
    336 
    337 	CMPU  LEN, $64
    338 	BLT   tail_vsx
    339 
    340 	LXVW4X (INP)(R0), VS59
    341 	LXVW4X (INP)(R8), VS60
    342 	LXVW4X (INP)(R9), VS61
    343 	LXVW4X (INP)(R10), VS62
    344 	VXOR   V27, V0, V27
    345 
    346 	VXOR V28, V4, V28
    347 	VXOR V29, V8, V29
    348 	VXOR V30, V12, V30
    349 
    350 	STXVW4X VS59, (OUT)(R0)
    351 	STXVW4X VS60, (OUT)(R8)
    352 	ADD     $64, INP
    353 	STXVW4X VS61, (OUT)(R9)
    354 	ADD     $-64, LEN
    355 	STXVW4X VS62, (OUT)(V10)
    356 	ADD     $64, OUT
    357 	BEQ     done_vsx
    358 
    359 	VADDUWM V2, V16, V0
    360 	VADDUWM V6, V17, V4
    361 	VADDUWM V10, V18, V8
    362 	VADDUWM V14, V19, V12
    363 
    364 	CMPU LEN, $64
    365 	BLT  tail_vsx
    366 
    367 	LXVW4X (INP)(R0), VS59
    368 	LXVW4X (INP)(R8), VS60
    369 	LXVW4X (INP)(R9), VS61
    370 	LXVW4X (INP)(R10), VS62
    371 
    372 	VXOR V27, V0, V27
    373 	VXOR V28, V4, V28
    374 	VXOR V29, V8, V29
    375 	VXOR V30, V12, V30
    376 
    377 	STXVW4X VS59, (OUT)(R0)
    378 	STXVW4X VS60, (OUT)(R8)
    379 	ADD     $64, INP
    380 	STXVW4X VS61, (OUT)(R9)
    381 	ADD     $-64, LEN
    382 	STXVW4X VS62, (OUT)(R10)
    383 	ADD     $64, OUT
    384 	BEQ     done_vsx
    385 
    386 	VADDUWM V3, V16, V0
    387 	VADDUWM V7, V17, V4
    388 	VADDUWM V11, V18, V8
    389 	VADDUWM V15, V19, V12
    390 
    391 	CMPU  LEN, $64
    392 	BLT   tail_vsx
    393 
    394 	LXVW4X (INP)(R0), VS59
    395 	LXVW4X (INP)(R8), VS60
    396 	LXVW4X (INP)(R9), VS61
    397 	LXVW4X (INP)(R10), VS62
    398 
    399 	VXOR V27, V0, V27
    400 	VXOR V28, V4, V28
    401 	VXOR V29, V8, V29
    402 	VXOR V30, V12, V30
    403 
    404 	STXVW4X VS59, (OUT)(R0)
    405 	STXVW4X VS60, (OUT)(R8)
    406 	ADD     $64, INP
    407 	STXVW4X VS61, (OUT)(R9)
    408 	ADD     $-64, LEN
    409 	STXVW4X VS62, (OUT)(R10)
    410 	ADD     $64, OUT
    411 
    412 	MOVD $10, R14
    413 	MOVD R14, CTR
    414 	BNE  loop_outer_vsx
    415 
    416 done_vsx:
    417 	// Increment counter by number of 64 byte blocks
    418 	MOVD (CNT), R14
    419 	ADD  BLOCKS, R14
    420 	MOVD R14, (CNT)
    421 	RET
    422 
    423 tail_vsx:
    424 	ADD  $32, R1, R11
    425 	MOVD LEN, CTR
    426 
    427 	// Save values on stack to copy from
    428 	STXVW4X VS32, (R11)(R0)
    429 	STXVW4X VS36, (R11)(R8)
    430 	STXVW4X VS40, (R11)(R9)
    431 	STXVW4X VS44, (R11)(R10)
    432 	ADD $-1, R11, R12
    433 	ADD $-1, INP
    434 	ADD $-1, OUT
    435 
    436 looptail_vsx:
    437 	// Copying the result to OUT
    438 	// in bytes.
    439 	MOVBZU 1(R12), KEY
    440 	MOVBZU 1(INP), TMP
    441 	XOR    KEY, TMP, KEY
    442 	MOVBU  KEY, 1(OUT)
    443 	BC     16, LT, looptail_vsx
    444 
    445 	// Clear the stack values
    446 	STXVW4X VS48, (R11)(R0)
    447 	STXVW4X VS48, (R11)(R8)
    448 	STXVW4X VS48, (R11)(R9)
    449 	STXVW4X VS48, (R11)(R10)
    450 	BR      done_vsx