chacha_s390x.s (5492B)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego
// +build gc,!purego

#include "go_asm.h"
#include "textflag.h"

// This is an implementation of the ChaCha20 encryption algorithm as
// specified in RFC 7539. It uses vector instructions to compute
// 4 keystream blocks in parallel (256 bytes) which are then XORed
// with the bytes in the input slice.
//
// Layout used while computing rounds: vector register Xi holds state
// word i of all four blocks, one block per 32-bit element. After the
// rounds the registers are transposed (see SHUFFLE) so that each
// register holds four consecutive state words of a single block.

GLOBL ·constants<>(SB), RODATA|NOPTR, $32
// BSWAP: swap bytes in each 4-byte element
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
// J0: [j0, j1, j2, j3]
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574

// Vector register roles.
#define BSWAP V5  // VPERM mask reversing the bytes of each 32-bit element
#define J0    V6  // state words 0-3 (the constants loaded from ·constants<>+0x10)
#define KEY0  V7  // key words 0-3
#define KEY1  V8  // key words 4-7
#define NONCE V9  // [0, nonce[0], nonce[1], nonce[2]]
#define CTR   V10 // block counters of the four blocks being computed
#define M0    V11 // M0-M3: scratch for SHUFFLE and message bytes in XORV
#define M1    V12
#define M2    V13
#define M3    V14
#define INC   V15 // counter increment: [4, 4, 4, 4] once setup is complete
#define X0    V16 // X0-X15: working state, word i of each of the four blocks
#define X1    V17
#define X2    V18
#define X3    V19
#define X4    V20
#define X5    V21
#define X6    V22
#define X7    V23
#define X8    V24
#define X9    V25
#define X10   V26
#define X11   V27
#define X12   V28
#define X13   V29
#define X14   V30
#define X15   V31

#define NUM_ROUNDS 20

// ROUND4 applies the same add/xor/rotate sequence to four independent
// register groups (a*, b*, c*, d*), with the four groups interleaved one
// instruction at a time. For each group g the sequence is:
//
//	g0 += g1; g2 ^= g0; g2 <<<= 16
//	g3 += g2; g1 ^= g3; g1 <<<= 12
//	g0 += g1; g2 ^= g0; g2 <<<= 8
//	g3 += g2; g1 ^= g3; g1 <<<= 7
//
// All operations act on each 32-bit element independently.
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $16, a2, a2 \
	VERLLF $16, b2, b2 \
	VERLLF $16, c2, c2 \
	VERLLF $16, d2, d2 \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $12, a1, a1 \
	VERLLF $12, b1, b1 \
	VERLLF $12, c1, c1 \
	VERLLF $12, d1, d1 \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $8, a2, a2  \
	VERLLF $8, b2, b2  \
	VERLLF $8, c2, c2  \
	VERLLF $8, d2, d2  \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $7, a1, a1  \
	VERLLF $7, b1, b1  \
	VERLLF $7, c1, c1  \
	VERLLF $7, d1, d1

// PERMUTE rearranges the bytes of v0-v3 in place according to mask.
#define PERMUTE(mask, v0, v1, v2, v3) \
	VPERM v0, v0, mask, v0 \
	VPERM v1, v1, mask, v1 \
	VPERM v2, v2, mask, v2 \
	VPERM v3, v3, mask, v3

// ADDV adds x, element-wise on 32-bit elements, to each of v0-v3 in place.
#define ADDV(x, v0, v1, v2, v3) \
	VAF x, v0, v0 \
	VAF x, v1, v1 \
	VAF x, v2, v2 \
	VAF x, v3, v3

// XORV loads 64 bytes from off(src) into M0-M3, byte-swaps each 32-bit
// element of v0-v3 in place (BSWAP mask), XORs v0-v3 into M0-M3 and stores
// the resulting 64 bytes to off(dst). Clobbers M0-M3 and v0-v3.
#define XORV(off, dst, src, v0, v1, v2, v3) \
	VLM  off(src), M0, M3          \
	PERMUTE(BSWAP, v0, v1, v2, v3) \
	VX   v0, M0, M0                \
	VX   v1, M1, M1                \
	VX   v2, M2, M2                \
	VX   v3, M3, M3                \
	VSTM M0, M3, off(dst)

// SHUFFLE transposes the 4x4 matrix of 32-bit elements held in a, b, c, d
// in place, using t, u, v, w as scratch registers.
#define SHUFFLE(a, b, c, d, t, u, v, w) \
	VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
	VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
	VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
	VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
	VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
	VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// XORs len(src) bytes of keystream with src and writes the result to dst,
// 256 bytes (four blocks) per iteration of the outer loop. On return
// *counter has been advanced by 4 for every 256 bytes processed.
//
// Preconditions (not checked here):
//   - len(src) is a non-zero multiple of 256: the outer loop subtracts 256
//     from the remaining length and terminates only when it is exactly 0.
//   - dst has room for len(src) bytes; len(dst) is never read.
//
// General registers: R0 scratch, R1 constants pointer then round counter,
// R2 dst pointer, R3 src pointer, R4 bytes remaining, R5 key, R6 nonce,
// R7 counter. Vector registers V5-V31 are overwritten.
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup
	//
	// VLL takes the index of the highest byte to load and zeroes the
	// remaining bytes of the target register. The nonce is 12 bytes, so the
	// index is 11; a value of 15 or more would load a full 16 bytes and read
	// past the end of *nonce. The loaded bytes are then shifted right by
	// 4 bytes (VSRLB with 32 in byte 7 of M0), giving
	// NONCE = [0, nonce[0], nonce[1], nonce[2]].
	MOVD  $11, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE
	VZERO M0
	VLEIB $7, $32, M0
	VSRLB M0, NONCE, NONCE

	// initialize counter values
	// CTR = [c, c+1, c+2, c+3] where c = *counter; INC ends up as [4, 4, 4, 4].
	VLREPF (R7), CTR
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC
	VAF    INC, CTR, CTR
	VREPIF $4, INC

chacha:
	// Build the initial state for four blocks: every word except the block
	// counter (X12) is the same in all four elements.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	// Each pass of the loop below performs two rounds.
	MOVD $(NUM_ROUNDS/2), R1

loop:
	// first round: word groups (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15)
	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
	// second round: word groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13,  X8, X3, X4, X14,  X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors
	//
	// Each SHUFFLE transposes one group of four state words so that a
	// register holds four consecutive words of a single block; the matching
	// ADDV then adds the corresponding words of the initial state. The
	// counter word differs per block, so CTR is added to X12 before the
	// transpose; element 0 of NONCE is zero, so the later ADDV(NONCE, ...)
	// leaves that word unchanged.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext
	// One XORV per block: 64 bytes each, at offsets 0, 64, 128 and 192.
	XORV(0*64, R2, R3, X0, X4,  X8, X12)
	XORV(1*64, R2, R3, X1, X5,  X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	// Continue until the remaining length is exactly zero.
	CMPBNE R4, $0, chacha

	// Write the next unused block counter (element 0 of CTR) back to *counter.
	VSTEF $0, CTR, (R7)
	RET