sha256blockAvx512_amd64.asm (33500B)
// 16x Parallel implementation of SHA256 for AVX512

//
// Minio Cloud Storage, (C) 2017 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// This code is based on the Intel Multi-Buffer Crypto for IPSec library
// and more specifically the following implementation:
// https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm
//
// For Golang it has been converted into Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble the AVX512 instructions
//

// Copyright (c) 2017, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright notice,
//       this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Intel Corporation nor the names of its contributors
//       may be used to endorse or promote products derived from this software
//       without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHA256_DIGEST_ROW_SIZE 64

// arg1
#define STATE       rdi
#define STATE_P9    DI
// arg2
#define INP_SIZE    rsi
#define INP_SIZE_P9 SI

#define IDX    rcx
#define TBL    rdx
#define TBL_P9 DX

#define INPUT    rax
#define INPUT_P9 AX

#define inp0       r9
#define SCRATCH_P9 R12
#define SCRATCH    r12
#define maskp      r13
#define MASKP_P9   R13
#define mask       r14
#define MASK_P9    R14

#define A    zmm0
#define B    zmm1
#define C    zmm2
#define D    zmm3
#define E    zmm4
#define F    zmm5
#define G    zmm6
#define H    zmm7
#define T1   zmm8
#define TMP0 zmm9
#define TMP1 zmm10
#define TMP2 zmm11
#define TMP3 zmm12
#define TMP4 zmm13
#define TMP5 zmm14
#define TMP6 zmm15

#define W0  zmm16
#define W1  zmm17
#define W2  zmm18
#define W3  zmm19
#define W4  zmm20
#define W5  zmm21
#define W6  zmm22
#define W7  zmm23
#define W8  zmm24
#define W9  zmm25
#define W10 zmm26
#define W11 zmm27
#define W12 zmm28
#define W13 zmm29
#define W14 zmm30
#define W15 zmm31


#define TRANSPOSE16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _r10, _r11, _r12, _r13, _r14, _r15, _t0, _t1) \
    \
    \ // input  r0  = {a15 a14 a13 a12 a11 a10  a9  a8  a7  a6  a5  a4  a3  a2  a1  a0}
    \ //        r1  = {b15 b14 b13 b12 b11 b10  b9  b8  b7  b6  b5  b4  b3  b2  b1  b0}
    \ //        r2  = {c15 c14 c13 c12 c11 c10  c9  c8  c7  c6  c5  c4  c3  c2  c1  c0}
    \ //        r3  = {d15 d14 d13 d12 d11 d10  d9  d8  d7  d6  d5  d4  d3  d2  d1  d0}
    \ //        r4  = {e15 e14 e13 e12 e11 e10  e9  e8  e7  e6  e5  e4  e3  e2  e1  e0}
    \ //        r5  = {f15 f14 f13 f12 f11 f10  f9  f8  f7  f6  f5  f4  f3  f2  f1  f0}
    \ //        r6  = {g15 g14 g13 g12 g11 g10  g9  g8  g7  g6  g5  g4  g3  g2  g1  g0}
    \ //        r7  = {h15 h14 h13 h12 h11 h10  h9  h8  h7  h6  h5  h4  h3  h2  h1  h0}
    \ //        r8  = {i15 i14 i13 i12 i11 i10  i9  i8  i7  i6  i5  i4  i3  i2  i1  i0}
    \ //        r9  = {j15 j14 j13 j12 j11 j10  j9  j8  j7  j6  j5  j4  j3  j2  j1  j0}
    \ //        r10 = {k15 k14 k13 k12 k11 k10  k9  k8  k7  k6  k5  k4  k3  k2  k1  k0}
    \ //        r11 = {l15 l14 l13 l12 l11 l10  l9  l8  l7  l6  l5  l4  l3  l2  l1  l0}
    \ //        r12 = {m15 m14 m13 m12 m11 m10  m9  m8  m7  m6  m5  m4  m3  m2  m1  m0}
    \ //        r13 = {n15 n14 n13 n12 n11 n10  n9  n8  n7  n6  n5  n4  n3  n2  n1  n0}
    \ //        r14 = {o15 o14 o13 o12 o11 o10  o9  o8  o7  o6  o5  o4  o3  o2  o1  o0}
    \ //        r15 = {p15 p14 p13 p12 p11 p10  p9  p8  p7  p6  p5  p4  p3  p2  p1  p0}
    \
    \ // output r0  = { p0  o0  n0  m0  l0  k0  j0  i0  h0  g0  f0  e0  d0  c0  b0  a0}
    \ //        r1  = { p1  o1  n1  m1  l1  k1  j1  i1  h1  g1  f1  e1  d1  c1  b1  a1}
    \ //        r2  = { p2  o2  n2  m2  l2  k2  j2  i2  h2  g2  f2  e2  d2  c2  b2  a2}
    \ //        r3  = { p3  o3  n3  m3  l3  k3  j3  i3  h3  g3  f3  e3  d3  c3  b3  a3}
    \ //        r4  = { p4  o4  n4  m4  l4  k4  j4  i4  h4  g4  f4  e4  d4  c4  b4  a4}
    \ //        r5  = { p5  o5  n5  m5  l5  k5  j5  i5  h5  g5  f5  e5  d5  c5  b5  a5}
    \ //        r6  = { p6  o6  n6  m6  l6  k6  j6  i6  h6  g6  f6  e6  d6  c6  b6  a6}
    \ //        r7  = { p7  o7  n7  m7  l7  k7  j7  i7  h7  g7  f7  e7  d7  c7  b7  a7}
    \ //        r8  = { p8  o8  n8  m8  l8  k8  j8  i8  h8  g8  f8  e8  d8  c8  b8  a8}
    \ //        r9  = { p9  o9  n9  m9  l9  k9  j9  i9  h9  g9  f9  e9  d9  c9  b9  a9}
    \ //        r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
    \ //        r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
    \ //        r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
    \ //        r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
    \ //        r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
    \ //        r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
    \
    \ // process top half
    vshufps _t0, _r0, _r1, 0x44    \ // t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
    vshufps _r0, _r0, _r1, 0xEE    \ // r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
    vshufps _t1, _r2, _r3, 0x44    \ // t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
    vshufps _r2, _r2, _r3, 0xEE    \ // r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
    \
    vshufps _r3, _t0, _t1, 0xDD    \ // r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
    vshufps _r1, _r0, _r2, 0x88    \ // r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
    vshufps _r0, _r0, _r2, 0xDD    \ // r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
    vshufps _t0, _t0, _t1, 0x88    \ // t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
    \
    \ // use r2 in place of t0
    vshufps _r2, _r4, _r5, 0x44    \ // r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
    vshufps _r4, _r4, _r5, 0xEE    \ // r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
    vshufps _t1, _r6, _r7, 0x44    \ // t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
    vshufps _r6, _r6, _r7, 0xEE    \ // r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
    \
    vshufps _r7, _r2, _t1, 0xDD    \ // r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
    vshufps _r5, _r4, _r6, 0x88    \ // r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
    vshufps _r4, _r4, _r6, 0xDD    \ // r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
    vshufps _r2, _r2, _t1, 0x88    \ // r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
    \
    \ // use r6 in place of t0
    vshufps _r6, _r8, _r9, 0x44    \ // r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
    vshufps _r8, _r8, _r9, 0xEE    \ // r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
    vshufps _t1, _r10, _r11, 0x44  \ // t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
    vshufps _r10, _r10, _r11, 0xEE \ // r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
    \
    vshufps _r11, _r6, _t1, 0xDD   \ // r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
    vshufps _r9, _r8, _r10, 0x88   \ // r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
    vshufps _r8, _r8, _r10, 0xDD   \ // r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
    vshufps _r6, _r6, _t1, 0x88    \ // r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
    \
    \ // use r10 in place of t0
    vshufps _r10, _r12, _r13, 0x44 \ // r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
    vshufps _r12, _r12, _r13, 0xEE \ // r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
    vshufps _t1, _r14, _r15, 0x44  \ // t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
    vshufps _r14, _r14, _r15, 0xEE \ // r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
    \
    vshufps _r15, _r10, _t1, 0xDD  \ // r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
    vshufps _r13, _r12, _r14, 0x88 \ // r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
    vshufps _r12, _r12, _r14, 0xDD \ // r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
    vshufps _r10, _r10, _t1, 0x88  \ // r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
    \
    \ // At this point, the registers that contain interesting data are:
    \ // t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
    \ // Can use t1 and r14 as scratch registers
    LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX \
    LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 \
    \
    vmovdqu32 _r14, [rbx]          \
    vpermi2q  _r14, _t0, _r2       \ // r14 = {h8 g8 f8 e8 d8 c8 b8 a8   h0 g0 f0 e0 d0 c0 b0 a0}
    vmovdqu32 _t1, [r8]            \
    vpermi2q  _t1, _t0, _r2        \ // t1  = {h12 g12 f12 e12 d12 c12 b12 a12   h4 g4 f4 e4 d4 c4 b4 a4}
    \
    vmovdqu32 _r2, [rbx]           \
    vpermi2q  _r2, _r3, _r7        \ // r2  = {h9 g9 f9 e9 d9 c9 b9 a9   h1 g1 f1 e1 d1 c1 b1 a1}
    vmovdqu32 _t0, [r8]            \
    vpermi2q  _t0, _r3, _r7        \ // t0  = {h13 g13 f13 e13 d13 c13 b13 a13   h5 g5 f5 e5 d5 c5 b5 a5}
    \
    vmovdqu32 _r3, [rbx]           \
    vpermi2q  _r3, _r1, _r5        \ // r3  = {h10 g10 f10 e10 d10 c10 b10 a10   h2 g2 f2 e2 d2 c2 b2 a2}
    vmovdqu32 _r7, [r8]            \
    vpermi2q  _r7, _r1, _r5        \ // r7  = {h14 g14 f14 e14 d14 c14 b14 a14   h6 g6 f6 e6 d6 c6 b6 a6}
    \
    vmovdqu32 _r1, [rbx]           \
    vpermi2q  _r1, _r0, _r4        \ // r1  = {h11 g11 f11 e11 d11 c11 b11 a11   h3 g3 f3 e3 d3 c3 b3 a3}
    vmovdqu32 _r5, [r8]            \
    vpermi2q  _r5, _r0, _r4        \ // r5  = {h15 g15 f15 e15 d15 c15 b15 a15   h7 g7 f7 e7 d7 c7 b7 a7}
    \
    vmovdqu32 _r0, [rbx]           \
    vpermi2q  _r0, _r6, _r10       \ // r0  = {p8 o8 n8 m8 l8 k8 j8 i8   p0 o0 n0 m0 l0 k0 j0 i0}
    vmovdqu32 _r4, [r8]            \
    vpermi2q  _r4, _r6, _r10       \ // r4  = {p12 o12 n12 m12 l12 k12 j12 i12   p4 o4 n4 m4 l4 k4 j4 i4}
    \
    vmovdqu32 _r6, [rbx]           \
    vpermi2q  _r6, _r11, _r15      \ // r6  = {p9 o9 n9 m9 l9 k9 j9 i9   p1 o1 n1 m1 l1 k1 j1 i1}
    vmovdqu32 _r10, [r8]           \
    vpermi2q  _r10, _r11, _r15     \ // r10 = {p13 o13 n13 m13 l13 k13 j13 i13   p5 o5 n5 m5 l5 k5 j5 i5}
    \
    vmovdqu32 _r11, [rbx]          \
    vpermi2q  _r11, _r9, _r13      \ // r11 = {p10 o10 n10 m10 l10 k10 j10 i10   p2 o2 n2 m2 l2 k2 j2 i2}
    vmovdqu32 _r15, [r8]           \
    vpermi2q  _r15, _r9, _r13      \ // r15 = {p14 o14 n14 m14 l14 k14 j14 i14   p6 o6 n6 m6 l6 k6 j6 i6}
    \
    vmovdqu32 _r9, [rbx]           \
    vpermi2q  _r9, _r8, _r12       \ // r9  = {p11 o11 n11 m11 l11 k11 j11 i11   p3 o3 n3 m3 l3 k3 j3 i3}
    vmovdqu32 _r13, [r8]           \
    vpermi2q  _r13, _r8, _r12      \ // r13 = {p15 o15 n15 m15 l15 k15 j15 i15   p7 o7 n7 m7 l7 k7 j7 i7}
    \
    \ // At this point r8 and r12 can be used as scratch registers
    vshuff64x2 _r8, _r14, _r0, 0xEE  \ // r8  = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
    vshuff64x2 _r0, _r14, _r0, 0x44  \ // r0  = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
    \
    vshuff64x2 _r12, _t1, _r4, 0xEE  \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
    vshuff64x2 _r4, _t1, _r4, 0x44   \ // r4  = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
    \
    vshuff64x2 _r14, _r7, _r15, 0xEE \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
    vshuff64x2 _t1, _r7, _r15, 0x44  \ // t1  = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
    \
    vshuff64x2 _r15, _r5, _r13, 0xEE \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
    vshuff64x2 _r7, _r5, _r13, 0x44  \ // r7  = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
    \
    vshuff64x2 _r13, _t0, _r10, 0xEE \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
    vshuff64x2 _r5, _t0, _r10, 0x44  \ // r5  = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
    \
    vshuff64x2 _r10, _r3, _r11, 0xEE \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
    vshuff64x2 _t0, _r3, _r11, 0x44  \ // t0  = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
    \
    vshuff64x2 _r11, _r1, _r9, 0xEE  \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
    vshuff64x2 _r3, _r1, _r9, 0x44   \ // r3  = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
    \
    vshuff64x2 _r9, _r2, _r6, 0xEE   \ // r9  = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
    vshuff64x2 _r1, _r2, _r6, 0x44   \ // r1  = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
    \
    vmovdqu32 _r2, _t0               \ // r2  = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
    vmovdqu32 _r6, _t1               \ // r6  = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}


// CH(A, B, C) = (A&B) ^ (~A&C)
// MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
// SIGMA0 = ROR_2  ^ ROR_13 ^ ROR_22
// SIGMA1 = ROR_6  ^ ROR_11 ^ ROR_25
// sigma0 = ROR_7  ^ ROR_18 ^ SHR_3
// sigma1 = ROR_17 ^ ROR_19 ^ SHR_10

// Main processing loop per round
#define PROCESS_LOOP(_WT, _ROUND, _A, _B, _C, _D, _E, _F, _G, _H) \
    \ // T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
    \ // T2 = SIGMA0(A) + MAJ(A, B, C)
    \ // H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
    \
    \ // H becomes T2, then add T1 for A
    \ // D becomes D + T1 for E
    \
    vpaddd     T1, _H, TMP3            \ // T1 = H + Kt
    vmovdqu32  TMP0, _E                \
    vprord     TMP1, _E, 6             \ // ROR_6(E)
    vprord     TMP2, _E, 11            \ // ROR_11(E)
    vprord     TMP3, _E, 25            \ // ROR_25(E)
    vpternlogd TMP0, _F, _G, 0xCA      \ // TMP0 = CH(E,F,G)
    vpaddd     T1, T1, _WT             \ // T1 = T1 + Wt
    vpternlogd TMP1, TMP2, TMP3, 0x96  \ // TMP1 = SIGMA1(E)
    vpaddd     T1, T1, TMP0            \ // T1 = T1 + CH(E,F,G)
    vpaddd     T1, T1, TMP1            \ // T1 = T1 + SIGMA1(E)
    vpaddd     _D, _D, T1              \ // D = D + T1
    \
    vprord     _H, _A, 2               \ // ROR_2(A)
    vprord     TMP2, _A, 13            \ // ROR_13(A)
    vprord     TMP3, _A, 22            \ // ROR_22(A)
    vmovdqu32  TMP0, _A                \
    vpternlogd TMP0, _B, _C, 0xE8      \ // TMP0 = MAJ(A,B,C)
    vpternlogd _H, TMP2, TMP3, 0x96    \ // H(T2) = SIGMA0(A)
    vpaddd     _H, _H, TMP0            \ // H(T2) = SIGMA0(A) + MAJ(A,B,C)
    vpaddd     _H, _H, T1              \ // H(A) = H(T2) + T1
    \
    vmovdqu32  TMP3, [TBL + ((_ROUND+1)*64)] \ // Next Kt


#define MSG_SCHED_ROUND_16_63(_WT, _WTp1, _WTp9, _WTp14) \
    vprord     TMP4, _WTp14, 17        \ // ROR_17(Wt-2)
    vprord     TMP5, _WTp14, 19        \ // ROR_19(Wt-2)
    vpsrld     TMP6, _WTp14, 10        \ // SHR_10(Wt-2)
    vpternlogd TMP4, TMP5, TMP6, 0x96  \ // TMP4 = sigma1(Wt-2)
    \
    vpaddd     _WT, _WT, TMP4          \ // Wt = Wt-16 + sigma1(Wt-2)
    vpaddd     _WT, _WT, _WTp9         \ // Wt = Wt-16 + sigma1(Wt-2) + Wt-7
    \
    vprord     TMP4, _WTp1, 7          \ // ROR_7(Wt-15)
    vprord     TMP5, _WTp1, 18         \ // ROR_18(Wt-15)
    vpsrld     TMP6, _WTp1, 3          \ // SHR_3(Wt-15)
    vpternlogd TMP4, TMP5, TMP6, 0x96  \ // TMP4 = sigma0(Wt-15)
    \
    vpaddd     _WT, _WT, TMP4          \ // Wt = Wt-16 + sigma1(Wt-2) +
    \                                    //      Wt-7 + sigma0(Wt-15)


// Note this is reading in a block of data for one lane
// When all 16 are read, the data must be transposed to build msg schedule
#define MSG_SCHED_ROUND_00_15(_WT, OFFSET, LABEL) \
    TESTQ   $(1<<OFFSET), MASK_P9      \
    JE      LABEL                      \
    MOVQ    OFFSET*24(INPUT_P9), R9    \
    vmovups _WT, [inp0+IDX]            \
LABEL:                                 \

#define MASKED_LOAD(_WT, OFFSET, LABEL) \
    TESTQ   $(1<<OFFSET), MASK_P9      \
    JE      LABEL                      \
    MOVQ    OFFSET*24(INPUT_P9), R9    \
    vmovups _WT, [inp0+IDX]            \
LABEL:                                 \

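// Reference only (not used by the assembly): a scalar Go sketch of the round
// and message-schedule math that PROCESS_LOOP and MSG_SCHED_ROUND_16_63 apply
// to 16 independent lanes at once. The rotate counts and vpternlogd truth
// tables above implement exactly these per-lane equations; the helper name
// below is hypothetical and the sketch assumes "math/bits".
//
//	func round(a, b, c, d, e, f, g, h, kt, wt uint32) (newE, newA uint32) {
//		ch := (e & f) ^ (^e & g)
//		maj := (a & b) ^ (a & c) ^ (b & c)
//		sigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
//		sigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
//		t1 := h + sigma1 + ch + kt + wt
//		t2 := sigma0 + maj
//		return d + t1, t1 + t2 // the remaining working variables just shift down
//	}
//
// Message schedule for rounds 16..63:
//	Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + sigma0(Wt-15)
// with sigma0(x) = ROR_7(x) ^ ROR_18(x) ^ SHR_3(x) and
//      sigma1(x) = ROR_17(x) ^ ROR_19(x) ^ SHR_10(x).
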
TEXT ·sha256_x16_avx512(SB), 7, $0
    MOVQ  digests+0(FP), STATE_P9
    MOVQ  scratch+8(FP), SCRATCH_P9
    MOVQ  mask_len+32(FP), INP_SIZE_P9 // number of blocks to process
    MOVQ  mask+24(FP), MASKP_P9
    MOVQ  (MASKP_P9), MASK_P9
    kmovq k1, mask
    LEAQ  inputs+48(FP), INPUT_P9

    // Initialize digests
    vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

    MOVQ table+16(FP), TBL_P9

    xor IDX, IDX

    // Read in first block of input data
    MASKED_LOAD( W0,  0, skipInput0)
    MASKED_LOAD( W1,  1, skipInput1)
    MASKED_LOAD( W2,  2, skipInput2)
    MASKED_LOAD( W3,  3, skipInput3)
    MASKED_LOAD( W4,  4, skipInput4)
    MASKED_LOAD( W5,  5, skipInput5)
    MASKED_LOAD( W6,  6, skipInput6)
    MASKED_LOAD( W7,  7, skipInput7)
    MASKED_LOAD( W8,  8, skipInput8)
    MASKED_LOAD( W9,  9, skipInput9)
    MASKED_LOAD(W10, 10, skipInput10)
    MASKED_LOAD(W11, 11, skipInput11)
    MASKED_LOAD(W12, 12, skipInput12)
    MASKED_LOAD(W13, 13, skipInput13)
    MASKED_LOAD(W14, 14, skipInput14)
    MASKED_LOAD(W15, 15, skipInput15)

lloop:
    LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), TBL_P9
    vmovdqu32 TMP2, [TBL]

    // Get first K from table
    MOVQ table+16(FP), TBL_P9
    vmovdqu32 TMP3, [TBL]

    // Save digests for later addition
    vmovdqu32 [SCRATCH + 64*0], A
    vmovdqu32 [SCRATCH + 64*1], B
    vmovdqu32 [SCRATCH + 64*2], C
    vmovdqu32 [SCRATCH + 64*3], D
    vmovdqu32 [SCRATCH + 64*4], E
    vmovdqu32 [SCRATCH + 64*5], F
    vmovdqu32 [SCRATCH + 64*6], G
    vmovdqu32 [SCRATCH + 64*7], H

    add IDX, 64

    // Transpose input data
    TRANSPOSE16(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1)

    vpshufb W0, W0, TMP2
    vpshufb W1, W1, TMP2
    vpshufb W2, W2, TMP2
    vpshufb W3, W3, TMP2
    vpshufb W4, W4, TMP2
    vpshufb W5, W5, TMP2
    vpshufb W6, W6, TMP2
    vpshufb W7, W7, TMP2
    vpshufb W8, W8, TMP2
    vpshufb W9, W9, TMP2
    vpshufb W10, W10, TMP2
    vpshufb W11, W11, TMP2
    vpshufb W12, W12, TMP2
    vpshufb W13, W13, TMP2
    vpshufb W14, W14, TMP2
    vpshufb W15, W15, TMP2

    // MSG Schedule for W0-W15 is now complete in registers
    // Process first 48 rounds
    // Calculate next Wt+16 after processing is complete and Wt is unneeded

    PROCESS_LOOP( W0,  0, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1,  1, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2,  2, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3,  3, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4,  4, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5,  5, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6,  6, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7,  7, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8,  8, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9,  9, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 10, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 11, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 12, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 13, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 14, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 15, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)
    PROCESS_LOOP( W0, 16, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1, 17, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2, 18, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3, 19, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4, 20, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5, 21, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6, 22, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7, 23, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8, 24, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9, 25, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 26, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 27, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 28, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 29, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 30, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 31, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)
    PROCESS_LOOP( W0, 32, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1, 33, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2, 34, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3, 35, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4, 36, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5, 37, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6, 38, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7, 39, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8, 40, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9, 41, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 42, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 43, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 44, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 45, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 46, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 47, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)

    // Check if this is the last block
    sub INP_SIZE, 1
    JE  lastLoop

    // Load next mask for inputs
    ADDQ $8, MASKP_P9
    MOVQ (MASKP_P9), MASK_P9

    // Process last 16 rounds
    // Read in next block msg data for use in first 16 words of msg sched

    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W0,  0, skipNext0)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W1,  1, skipNext1)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15( W2,  2, skipNext2)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15( W3,  3, skipNext3)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15( W4,  4, skipNext4)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15( W5,  5, skipNext5)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15( W6,  6, skipNext6)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15( W7,  7, skipNext7)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W8,  8, skipNext8)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W9,  9, skipNext9)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15(W10, 10, skipNext10)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15(W11, 11, skipNext11)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15(W12, 12, skipNext12)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15(W13, 13, skipNext13)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15(W14, 14, skipNext14)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15(W15, 15, skipNext15)

    // Add old digest
    vmovdqu32 TMP2, A
    vmovdqu32 A, [SCRATCH + 64*0]
    vpaddd    A{k1}, A, TMP2
    vmovdqu32 TMP2, B
    vmovdqu32 B, [SCRATCH + 64*1]
    vpaddd    B{k1}, B, TMP2
    vmovdqu32 TMP2, C
    vmovdqu32 C, [SCRATCH + 64*2]
    vpaddd    C{k1}, C, TMP2
    vmovdqu32 TMP2, D
    vmovdqu32 D, [SCRATCH + 64*3]
    vpaddd    D{k1}, D, TMP2
    vmovdqu32 TMP2, E
    vmovdqu32 E, [SCRATCH + 64*4]
    vpaddd    E{k1}, E, TMP2
    vmovdqu32 TMP2, F
    vmovdqu32 F, [SCRATCH + 64*5]
    vpaddd    F{k1}, F, TMP2
    vmovdqu32 TMP2, G
    vmovdqu32 G, [SCRATCH + 64*6]
    vpaddd    G{k1}, G, TMP2
    vmovdqu32 TMP2, H
    vmovdqu32 H, [SCRATCH + 64*7]
    vpaddd    H{k1}, H, TMP2

    kmovq k1, mask
    JMP   lloop

lastLoop:
    // Process last 16 rounds
    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)

    // Add old digest
    vmovdqu32 TMP2, A
    vmovdqu32 A, [SCRATCH + 64*0]
    vpaddd    A{k1}, A, TMP2
    vmovdqu32 TMP2, B
    vmovdqu32 B, [SCRATCH + 64*1]
    vpaddd    B{k1}, B, TMP2
    vmovdqu32 TMP2, C
    vmovdqu32 C, [SCRATCH + 64*2]
    vpaddd    C{k1}, C, TMP2
    vmovdqu32 TMP2, D
    vmovdqu32 D, [SCRATCH + 64*3]
    vpaddd    D{k1}, D, TMP2
    vmovdqu32 TMP2, E
    vmovdqu32 E, [SCRATCH + 64*4]
    vpaddd    E{k1}, E, TMP2
    vmovdqu32 TMP2, F
    vmovdqu32 F, [SCRATCH + 64*5]
    vpaddd    F{k1}, F, TMP2
    vmovdqu32 TMP2, G
    vmovdqu32 G, [SCRATCH + 64*6]
    vpaddd    G{k1}, G, TMP2
    vmovdqu32 TMP2, H
    vmovdqu32 H, [SCRATCH + 64*7]
    vpaddd    H{k1}, H, TMP2

    // Write out digest
    vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
    vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
    vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
    vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
    vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
    vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
    vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
    vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H

    VZEROUPPER
    RET

//
// Tables
//

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64

DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D
GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64

DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F
GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64
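
// Layout note (inferred from the code above, not part of the original sources):
// table+16(FP) is expected to point at a round-constant table laid out as one
// 64-byte row per round, each row holding K[t] broadcast to all 16 lanes, since
// PROCESS_LOOP fetches row _ROUND+1 with a full zmm load. A hedged Go sketch of
// building such a table (hypothetical names, illustration only):
//
//	var ktable [64 * 16]uint32 // 64 rounds x 16 lanes, 64 bytes per round
//	for t, k := range sha256K { // sha256K: the 64 standard SHA-256 constants
//		for lane := 0; lane < 16; lane++ {
//			ktable[t*16+lane] = k
//		}
//	}
//
// Because PROCESS_LOOP prefetches the constant for round _ROUND+1, the buffer
// passed in would need one extra 64-byte row of readable slack after round 63.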