chacha_arm64.s (8185B)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.11 && gc && !purego
// +build go1.11,gc,!purego

#include "textflag.h"

// Number of iterations of the "chacha" loop below. Each iteration applies
// the add/xor/rotate sequence (rotations 16, 12, 8, 7) twice: once on the
// columns of the state and once on its diagonals.
#define NUM_ROUNDS 10

// xorKeyStreamVX XORs src with a ChaCha keystream and writes the result to
// dst, producing four 64-byte blocks (256 bytes) per pass of the outer loop.
//
// State layout: V0..V15 each hold one of the 16 state words, replicated
// across the four 32-bit lanes; lane i belongs to block i. The only word
// that differs between lanes is the block counter (V12), which receives
// +0, +1, +2, +3 from the first row of ·incRotMatrix.
//
// Register roles:
//	R1  dst pointer, advanced by 256 per pass
//	R2  src pointer, advanced by 256 per pass
//	R3  src length in bytes
//	R4  key pointer (temporarily advanced by the encoded post-index loads)
//	R6  nonce pointer
//	R7  counter pointer; *counter is rewritten after every pass
//	R10 address of ·constants
//	R11 address of ·incRotMatrix
//	R12 src + (len &^ 255): loop end address
//	R13 len & 255 (written below, never read again in this routine)
//	R20 running 32-bit block counter
//	R21 remaining iterations of the inner round loop
//
// NOTE(review): the length of dst is never consulted, and the loop body
// runs once before the first bound check, so at least 256 bytes are always
// read and written. Callers are presumably expected to pass a non-zero
// multiple of 256 bytes -- confirm against the Go caller.
//
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	// R20 = *counter; incremented by 4 and stored back after each pass.
	MOVW	(R7), R20

	AND	$~255, R3, R13
	ADD	R2, R13, R12 // R12 = src + (len rounded down to a multiple of 256): block end
	AND	$255, R3, R13 // R13 = len % 256; not used afterwards in this routine
loop:
	MOVD	$NUM_ROUNDS, R21
	// V30 = {0, 1, 2, 3} (per-lane counter increments), V31 = byte-shuffle
	// table for the rotate-by-8 steps. Both registers are overwritten at
	// the end of every pass, hence the reload here.
	VLD1	(R11), [V30.S4, V31.S4]

	// load constants, each word replicated across all four lanes
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// The two encoded loads below use post-index addressing and advance
	// R4 by 16 bytes each; the SUB that follows puts R4 back.
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter: lane i of V12 becomes counter + i
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// Column round.
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// Swapping the 16-bit halves of each 32-bit lane is a rotate by 16.
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8
	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	// Rotate left by 12: shift left by 12, then insert the value shifted
	// right by 20 into the vacated low bits.
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// Rotate left by 8 as a byte permutation, using the table in V31.
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// Diagonal round: same steps as above, but the operand registers are
	// paired along the diagonals instead of the columns; no data is moved
	// between registers to achieve this.
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// V11 += V12; V6 <<<= ((V11 XOR V6), 12)
	// V8  += V13; V7 <<<= ((V8  XOR V7), 12)
	// V9  += V14; V4 <<<= ((V9  XOR V4), 12)
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// V1 += V6; V12 <<<= ((V1 XOR V12), 8)
	// V2 += V7; V13 <<<= ((V2 XOR V13), 8)
	// V3 += V4; V14 <<<= ((V3 XOR V14), 8)
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// V11 += V12; V6 <<<= ((V11 XOR V6), 7)
	// V8  += V13; V7 <<<= ((V8  XOR V7), 7)
	// V9  += V14; V4 <<<= ((V9  XOR V4), 7)
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// Feed-forward: reload the initial state into V16..V31 and add it to
	// the permuted state in V0..V15.
	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	// V30 still holds {0, 1, 2, 3} here; together with the broadcast
	// counter added from V28 below, this adds back the per-lane initial
	// counter word. It must happen before V30 is overwritten by the
	// nonce load.
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4 (the two post-index loads above advanced it by 32 bytes)
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// Transpose from "one state word per register, one block per lane" to
	// "16 consecutive keystream bytes per register". After the 64-bit zips
	// V0..V3 hold block 0, V4..V7 block 1, V8..V11 block 2 and V12..V15
	// block 3. The source loads are interleaved with the zips and reuse
	// V16..V31 as soon as the 64-bit zips have consumed them.
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] // src bytes 0..63

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] // src bytes 64..127
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] // src bytes 128..191
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] // src bytes 192..255
	// dst = src XOR keystream, stored 64 bytes at a time.
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four blocks were consumed. The counter is stored back to memory
	// because the next pass re-reads it from (R7).
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// Go around again while the src pointer is still below the end address.
	CMP	R2, R12
	BGT	loop

	RET


// The four constant state words; their little-endian bytes spell
// "expand 32-byte k".
// NOTE(review): the symbol is declared with size 32 but only the first
// 16 bytes are initialized by the DATA directives here.
DATA	·constants+0x00(SB)/4, $0x61707865
DATA	·constants+0x04(SB)/4, $0x3320646e
DATA	·constants+0x08(SB)/4, $0x79622d32
DATA	·constants+0x0c(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32

// Two 16-byte rows, loaded together into V30/V31 at the top of each pass:
//   0x00..0x0f  per-lane block counter increments {0, 1, 2, 3}
//   0x10..0x1f  VTBL byte indices that rotate each 32-bit lane left by 8
DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32