gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

chacha_arm64.s (8185B)


      1 // Copyright 2018 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:build go1.11 && gc && !purego
      6 // +build go1.11,gc,!purego
      7 
      8 #include "textflag.h"
      9 
     10 #define NUM_ROUNDS 10
     11 
     12 // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
     13 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
        // NOTE(review): generates ChaCha keystream four 64-byte blocks at a
        // time (lane i of each vector register belongs to block counter+i)
        // and XORs it into dst. Only the 256-byte-aligned prefix of src is
        // processed here (see the AND $~255 below); the sub-256-byte tail is
        // presumably handled by the Go caller — confirm.
     14 	MOVD	dst+0(FP), R1 // R1 = dst pointer
     15 	MOVD	src+24(FP), R2 // R2 = src cursor (advanced by the loop)
     16 	MOVD	src_len+32(FP), R3 // R3 = byte count
     17 	MOVD	key+48(FP), R4 // R4 = &key[0] (8 words)
     18 	MOVD	nonce+56(FP), R6 // R6 = &nonce[0] (3 words)
     19 	MOVD	counter+64(FP), R7 // R7 = &counter (written back before RET)
     20 
     21 	MOVD	$·constants(SB), R10 // "expand 32-byte k" sigma words
     22 	MOVD	$·incRotMatrix(SB), R11 // lane increments {0,1,2,3} + rot8 shuffle
     23 
     24 	MOVW	(R7), R20 // R20 = scalar copy of the 32-bit block counter
     25 
        // R12 = src + (len &^ 255): end of the region this function handles.
        // R13 is left holding len & 255 (tail size) but is unused below.
     26 	AND	$~255, R3, R13
     27 	ADD	R2, R13, R12 // R12 for block end
     28 	AND	$255, R3, R13
     29 loop:
     30 	MOVD	$NUM_ROUNDS, R21 // double-round countdown
     31 	VLD1	(R11), [V30.S4, V31.S4] // V30 = {0,1,2,3}, V31 = rot-left-8 table
     32 
     33 	// load constants
        // The WORD directives below are raw encodings for the replicating
        // loads named in the comments (VLD4R/VLD1R/VLD3R); keep each
        // comment and its encoding in sync if either changes.
     34 	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
     35 	WORD	$0x4D60E940
     36 
     37 	// load keys
     38 	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
     39 	WORD	$0x4DFFE884
     40 	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
     41 	WORD	$0x4DFFE888
     42 	SUB	$32, R4 // undo the two 16-byte post-increments above
     43 
     44 	// load counter + nonce
     45 	// VLD1R (R7), [V12.S4]
     46 	WORD	$0x4D40C8EC
     47 
     48 	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
     49 	WORD	$0x4D40E8CD
     50 
     51 	// update counter
     52 	VADD	V30.S4, V12.S4, V12.S4 // lane i now works on block counter+i
     53 
        // Each pass below is one ChaCha double round (column round, then
        // diagonal round). Rotations are synthesized: <<<16 via VREV32 on
        // 16-bit lanes, <<<8 via VTBL with the V31 byte table, and
        // <<<12 / <<<7 via a VSHL+VSRI pair through scratch V16..V19.
     54 chacha:
     55 	// V0..V3 += V4..V7
     56 	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
     57 	VADD	V0.S4, V4.S4, V0.S4
     58 	VADD	V1.S4, V5.S4, V1.S4
     59 	VADD	V2.S4, V6.S4, V2.S4
     60 	VADD	V3.S4, V7.S4, V3.S4
     61 	VEOR	V12.B16, V0.B16, V12.B16
     62 	VEOR	V13.B16, V1.B16, V13.B16
     63 	VEOR	V14.B16, V2.B16, V14.B16
     64 	VEOR	V15.B16, V3.B16, V15.B16
     65 	VREV32	V12.H8, V12.H8
     66 	VREV32	V13.H8, V13.H8
     67 	VREV32	V14.H8, V14.H8
     68 	VREV32	V15.H8, V15.H8
     69 	// V8..V11 += V12..V15
     70 	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
     71 	VADD	V8.S4, V12.S4, V8.S4
     72 	VADD	V9.S4, V13.S4, V9.S4
     73 	VADD	V10.S4, V14.S4, V10.S4
     74 	VADD	V11.S4, V15.S4, V11.S4
     75 	VEOR	V8.B16, V4.B16, V16.B16
     76 	VEOR	V9.B16, V5.B16, V17.B16
     77 	VEOR	V10.B16, V6.B16, V18.B16
     78 	VEOR	V11.B16, V7.B16, V19.B16
     79 	VSHL	$12, V16.S4, V4.S4
     80 	VSHL	$12, V17.S4, V5.S4
     81 	VSHL	$12, V18.S4, V6.S4
     82 	VSHL	$12, V19.S4, V7.S4
     83 	VSRI	$20, V16.S4, V4.S4
     84 	VSRI	$20, V17.S4, V5.S4
     85 	VSRI	$20, V18.S4, V6.S4
     86 	VSRI	$20, V19.S4, V7.S4
     87 
     88 	// V0..V3 += V4..V7
     89 	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
     90 	VADD	V0.S4, V4.S4, V0.S4
     91 	VADD	V1.S4, V5.S4, V1.S4
     92 	VADD	V2.S4, V6.S4, V2.S4
     93 	VADD	V3.S4, V7.S4, V3.S4
     94 	VEOR	V12.B16, V0.B16, V12.B16
     95 	VEOR	V13.B16, V1.B16, V13.B16
     96 	VEOR	V14.B16, V2.B16, V14.B16
     97 	VEOR	V15.B16, V3.B16, V15.B16
     98 	VTBL	V31.B16, [V12.B16], V12.B16
     99 	VTBL	V31.B16, [V13.B16], V13.B16
    100 	VTBL	V31.B16, [V14.B16], V14.B16
    101 	VTBL	V31.B16, [V15.B16], V15.B16
    102 
    103 	// V8..V11 += V12..V15
    104 	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
    105 	VADD	V12.S4, V8.S4, V8.S4
    106 	VADD	V13.S4, V9.S4, V9.S4
    107 	VADD	V14.S4, V10.S4, V10.S4
    108 	VADD	V15.S4, V11.S4, V11.S4
    109 	VEOR	V8.B16, V4.B16, V16.B16
    110 	VEOR	V9.B16, V5.B16, V17.B16
    111 	VEOR	V10.B16, V6.B16, V18.B16
    112 	VEOR	V11.B16, V7.B16, V19.B16
    113 	VSHL	$7, V16.S4, V4.S4
    114 	VSHL	$7, V17.S4, V5.S4
    115 	VSHL	$7, V18.S4, V6.S4
    116 	VSHL	$7, V19.S4, V7.S4
    117 	VSRI	$25, V16.S4, V4.S4
    118 	VSRI	$25, V17.S4, V5.S4
    119 	VSRI	$25, V18.S4, V6.S4
    120 	VSRI	$25, V19.S4, V7.S4
    121 
        // Diagonal round: same quarter-round steps with rotated register
        // pairings (V0..V3 against V5,V6,V7,V4 and V15,V12,V13,V14).
    122 	// V0..V3 += V5..V7, V4
    123 	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
    124 	VADD	V0.S4, V5.S4, V0.S4
    125 	VADD	V1.S4, V6.S4, V1.S4
    126 	VADD	V2.S4, V7.S4, V2.S4
    127 	VADD	V3.S4, V4.S4, V3.S4
    128 	VEOR	V15.B16, V0.B16, V15.B16
    129 	VEOR	V12.B16, V1.B16, V12.B16
    130 	VEOR	V13.B16, V2.B16, V13.B16
    131 	VEOR	V14.B16, V3.B16, V14.B16
    132 	VREV32	V12.H8, V12.H8
    133 	VREV32	V13.H8, V13.H8
    134 	VREV32	V14.H8, V14.H8
    135 	VREV32	V15.H8, V15.H8
    136 
    137 	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
    138 	// ...
    139 	VADD	V15.S4, V10.S4, V10.S4
    140 	VADD	V12.S4, V11.S4, V11.S4
    141 	VADD	V13.S4, V8.S4, V8.S4
    142 	VADD	V14.S4, V9.S4, V9.S4
    143 	VEOR	V10.B16, V5.B16, V16.B16
    144 	VEOR	V11.B16, V6.B16, V17.B16
    145 	VEOR	V8.B16, V7.B16, V18.B16
    146 	VEOR	V9.B16, V4.B16, V19.B16
    147 	VSHL	$12, V16.S4, V5.S4
    148 	VSHL	$12, V17.S4, V6.S4
    149 	VSHL	$12, V18.S4, V7.S4
    150 	VSHL	$12, V19.S4, V4.S4
    151 	VSRI	$20, V16.S4, V5.S4
    152 	VSRI	$20, V17.S4, V6.S4
    153 	VSRI	$20, V18.S4, V7.S4
    154 	VSRI	$20, V19.S4, V4.S4
    155 
    156 	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
    157 	// ...
    158 	VADD	V5.S4, V0.S4, V0.S4
    159 	VADD	V6.S4, V1.S4, V1.S4
    160 	VADD	V7.S4, V2.S4, V2.S4
    161 	VADD	V4.S4, V3.S4, V3.S4
    162 	VEOR	V0.B16, V15.B16, V15.B16
    163 	VEOR	V1.B16, V12.B16, V12.B16
    164 	VEOR	V2.B16, V13.B16, V13.B16
    165 	VEOR	V3.B16, V14.B16, V14.B16
    166 	VTBL	V31.B16, [V12.B16], V12.B16
    167 	VTBL	V31.B16, [V13.B16], V13.B16
    168 	VTBL	V31.B16, [V14.B16], V14.B16
    169 	VTBL	V31.B16, [V15.B16], V15.B16
    170 
    171 	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
    172 	// ...
    173 	VADD	V15.S4, V10.S4, V10.S4
    174 	VADD	V12.S4, V11.S4, V11.S4
    175 	VADD	V13.S4, V8.S4, V8.S4
    176 	VADD	V14.S4, V9.S4, V9.S4
    177 	VEOR	V10.B16, V5.B16, V16.B16
    178 	VEOR	V11.B16, V6.B16, V17.B16
    179 	VEOR	V8.B16, V7.B16, V18.B16
    180 	VEOR	V9.B16, V4.B16, V19.B16
    181 	VSHL	$7, V16.S4, V5.S4
    182 	VSHL	$7, V17.S4, V6.S4
    183 	VSHL	$7, V18.S4, V7.S4
    184 	VSHL	$7, V19.S4, V4.S4
    185 	VSRI	$25, V16.S4, V5.S4
    186 	VSRI	$25, V17.S4, V6.S4
    187 	VSRI	$25, V18.S4, V7.S4
    188 	VSRI	$25, V19.S4, V4.S4
    189 
    190 	SUB	$1, R21
    191 	CBNZ	R21, chacha // NUM_ROUNDS double rounds = 20 ChaCha rounds
    192 
        // Feed-forward: add the initial state back into V0..V15,
        // reloading constants/key/counter/nonce into V16..V31.
    193 	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
    194 	WORD	$0x4D60E950
    195 
    196 	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
    197 	WORD	$0x4DFFE894
    198 	VADD	V30.S4, V12.S4, V12.S4 // lane offsets {0,1,2,3}; base counter added via V28 below
    199 	VADD	V16.S4, V0.S4, V0.S4
    200 	VADD	V17.S4, V1.S4, V1.S4
    201 	VADD	V18.S4, V2.S4, V2.S4
    202 	VADD	V19.S4, V3.S4, V3.S4
    203 	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
    204 	WORD	$0x4DFFE898
    205 	// restore R4
    206 	SUB	$32, R4
    207 
    208 	// load counter + nonce
    209 	// VLD1R (R7), [V28.S4]
    210 	WORD	$0x4D40C8FC
    211 	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
    212 	WORD	$0x4D40E8DD
    213 
    214 	VADD	V20.S4, V4.S4, V4.S4
    215 	VADD	V21.S4, V5.S4, V5.S4
    216 	VADD	V22.S4, V6.S4, V6.S4
    217 	VADD	V23.S4, V7.S4, V7.S4
    218 	VADD	V24.S4, V8.S4, V8.S4
    219 	VADD	V25.S4, V9.S4, V9.S4
    220 	VADD	V26.S4, V10.S4, V10.S4
    221 	VADD	V27.S4, V11.S4, V11.S4
    222 	VADD	V28.S4, V12.S4, V12.S4
    223 	VADD	V29.S4, V13.S4, V13.S4
    224 	VADD	V30.S4, V14.S4, V14.S4
    225 	VADD	V31.S4, V15.S4, V15.S4
    226 
        // Transpose via VZIP: registers currently hold one state word
        // across all four blocks; interleave 32-bit then 64-bit lanes so
        // V0..V15 become four sequential 64-byte keystream blocks. Src
        // loads are interleaved to overlap with the shuffles.
        // (V30/V31 are clobbered here; they are reloaded at `loop`.)
    227 	VZIP1	V1.S4, V0.S4, V16.S4
    228 	VZIP2	V1.S4, V0.S4, V17.S4
    229 	VZIP1	V3.S4, V2.S4, V18.S4
    230 	VZIP2	V3.S4, V2.S4, V19.S4
    231 	VZIP1	V5.S4, V4.S4, V20.S4
    232 	VZIP2	V5.S4, V4.S4, V21.S4
    233 	VZIP1	V7.S4, V6.S4, V22.S4
    234 	VZIP2	V7.S4, V6.S4, V23.S4
    235 	VZIP1	V9.S4, V8.S4, V24.S4
    236 	VZIP2	V9.S4, V8.S4, V25.S4
    237 	VZIP1	V11.S4, V10.S4, V26.S4
    238 	VZIP2	V11.S4, V10.S4, V27.S4
    239 	VZIP1	V13.S4, V12.S4, V28.S4
    240 	VZIP2	V13.S4, V12.S4, V29.S4
    241 	VZIP1	V15.S4, V14.S4, V30.S4
    242 	VZIP2	V15.S4, V14.S4, V31.S4
    243 	VZIP1	V18.D2, V16.D2, V0.D2
    244 	VZIP2	V18.D2, V16.D2, V4.D2
    245 	VZIP1	V19.D2, V17.D2, V8.D2
    246 	VZIP2	V19.D2, V17.D2, V12.D2
    247 	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
    248 
    249 	VZIP1	V22.D2, V20.D2, V1.D2
    250 	VZIP2	V22.D2, V20.D2, V5.D2
    251 	VZIP1	V23.D2, V21.D2, V9.D2
    252 	VZIP2	V23.D2, V21.D2, V13.D2
    253 	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
    254 	VZIP1	V26.D2, V24.D2, V2.D2
    255 	VZIP2	V26.D2, V24.D2, V6.D2
    256 	VZIP1	V27.D2, V25.D2, V10.D2
    257 	VZIP2	V27.D2, V25.D2, V14.D2
    258 	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
    259 	VZIP1	V30.D2, V28.D2, V3.D2
    260 	VZIP2	V30.D2, V28.D2, V7.D2
    261 	VZIP1	V31.D2, V29.D2, V11.D2
    262 	VZIP2	V31.D2, V29.D2, V15.D2
    263 	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
        // dst[i] = src[i] XOR keystream, one 64-byte block per store.
    264 	VEOR	V0.B16, V16.B16, V16.B16
    265 	VEOR	V1.B16, V17.B16, V17.B16
    266 	VEOR	V2.B16, V18.B16, V18.B16
    267 	VEOR	V3.B16, V19.B16, V19.B16
    268 	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
    269 	VEOR	V4.B16, V20.B16, V20.B16
    270 	VEOR	V5.B16, V21.B16, V21.B16
    271 	VEOR	V6.B16, V22.B16, V22.B16
    272 	VEOR	V7.B16, V23.B16, V23.B16
    273 	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
    274 	VEOR	V8.B16, V24.B16, V24.B16
    275 	VEOR	V9.B16, V25.B16, V25.B16
    276 	VEOR	V10.B16, V26.B16, V26.B16
    277 	VEOR	V11.B16, V27.B16, V27.B16
    278 	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
    279 	VEOR	V12.B16, V28.B16, V28.B16
    280 	VEOR	V13.B16, V29.B16, V29.B16
    281 	VEOR	V14.B16, V30.B16, V30.B16
    282 	VEOR	V15.B16, V31.B16, V31.B16
    283 	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
    284 
    285 	ADD	$4, R20 // four blocks consumed this iteration
    286 	MOVW	R20, (R7) // update counter
    287 
    288 	CMP	R2, R12 // flags from R12 - R2
    289 	BGT	loop // continue while block end is past the src cursor
    290 
    291 	RET
    292 
    293 
        // ChaCha sigma constant: the little-endian bytes of these four
        // words spell "expand 32-byte k". Declared $32 with only 16 bytes
        // initialized; the remaining 16 bytes are zero.
    294 DATA	·constants+0x00(SB)/4, $0x61707865 // "expa"
    295 DATA	·constants+0x04(SB)/4, $0x3320646e // "nd 3"
    296 DATA	·constants+0x08(SB)/4, $0x79622d32 // "2-by"
    297 DATA	·constants+0x0c(SB)/4, $0x6b206574 // "te k"
    298 GLOBL	·constants(SB), NOPTR|RODATA, $32
    299 
        // Two 16-byte tables loaded together into V30/V31 at `loop`:
        //   bytes 0..15 (V30): per-lane counter increments {0,1,2,3},
        //     added to the replicated counter so lane i is block ctr+i;
        //   bytes 16..31 (V31): VTBL byte-shuffle indices (3,0,1,2 per
        //     32-bit lane) implementing rotate-left-by-8 of each word.
    300 DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
    301 DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
    302 DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
    303 DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
    304 DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
    305 DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
    306 DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
    307 DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
    308 GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32