sha256block_amd64.s (8178B)
//+build !noasm,!appengine,gc

// SHA intrinsic version of SHA256

// Kristofer Peterson, (C) 2018.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "textflag.h"

// K<> is the table of the 64 SHA-256 round constants, stored as consecutive
// 32-bit words (256 bytes, read-only, pointer-free). The round code below
// reads it 16 bytes at a time, i.e. four constants per load, through BX.
// NOTE(review): the loads use MOVO, which presumably requires the symbol to be
// 16-byte aligned by the linker - confirm.
DATA K<>+0x00(SB)/4, $0x428a2f98
DATA K<>+0x04(SB)/4, $0x71374491
DATA K<>+0x08(SB)/4, $0xb5c0fbcf
DATA K<>+0x0c(SB)/4, $0xe9b5dba5
DATA K<>+0x10(SB)/4, $0x3956c25b
DATA K<>+0x14(SB)/4, $0x59f111f1
DATA K<>+0x18(SB)/4, $0x923f82a4
DATA K<>+0x1c(SB)/4, $0xab1c5ed5
DATA K<>+0x20(SB)/4, $0xd807aa98
DATA K<>+0x24(SB)/4, $0x12835b01
DATA K<>+0x28(SB)/4, $0x243185be
DATA K<>+0x2c(SB)/4, $0x550c7dc3
DATA K<>+0x30(SB)/4, $0x72be5d74
DATA K<>+0x34(SB)/4, $0x80deb1fe
DATA K<>+0x38(SB)/4, $0x9bdc06a7
DATA K<>+0x3c(SB)/4, $0xc19bf174
DATA K<>+0x40(SB)/4, $0xe49b69c1
DATA K<>+0x44(SB)/4, $0xefbe4786
DATA K<>+0x48(SB)/4, $0x0fc19dc6
DATA K<>+0x4c(SB)/4, $0x240ca1cc
DATA K<>+0x50(SB)/4, $0x2de92c6f
DATA K<>+0x54(SB)/4, $0x4a7484aa
DATA K<>+0x58(SB)/4, $0x5cb0a9dc
DATA K<>+0x5c(SB)/4, $0x76f988da
DATA K<>+0x60(SB)/4, $0x983e5152
DATA K<>+0x64(SB)/4, $0xa831c66d
DATA K<>+0x68(SB)/4, $0xb00327c8
DATA K<>+0x6c(SB)/4, $0xbf597fc7
DATA K<>+0x70(SB)/4, $0xc6e00bf3
DATA K<>+0x74(SB)/4, $0xd5a79147
DATA K<>+0x78(SB)/4, $0x06ca6351
DATA K<>+0x7c(SB)/4, $0x14292967
DATA K<>+0x80(SB)/4, $0x27b70a85
DATA K<>+0x84(SB)/4, $0x2e1b2138
DATA K<>+0x88(SB)/4, $0x4d2c6dfc
DATA K<>+0x8c(SB)/4, $0x53380d13
DATA K<>+0x90(SB)/4, $0x650a7354
DATA K<>+0x94(SB)/4, $0x766a0abb
DATA K<>+0x98(SB)/4, $0x81c2c92e
DATA K<>+0x9c(SB)/4, $0x92722c85
DATA K<>+0xa0(SB)/4, $0xa2bfe8a1
DATA K<>+0xa4(SB)/4, $0xa81a664b
DATA K<>+0xa8(SB)/4, $0xc24b8b70
DATA K<>+0xac(SB)/4, $0xc76c51a3
DATA K<>+0xb0(SB)/4, $0xd192e819
DATA K<>+0xb4(SB)/4, $0xd6990624
DATA K<>+0xb8(SB)/4, $0xf40e3585
DATA K<>+0xbc(SB)/4, $0x106aa070
DATA K<>+0xc0(SB)/4, $0x19a4c116
DATA K<>+0xc4(SB)/4, $0x1e376c08
DATA K<>+0xc8(SB)/4, $0x2748774c
DATA K<>+0xcc(SB)/4, $0x34b0bcb5
DATA K<>+0xd0(SB)/4, $0x391c0cb3
DATA K<>+0xd4(SB)/4, $0x4ed8aa4a
DATA K<>+0xd8(SB)/4, $0x5b9cca4f
DATA K<>+0xdc(SB)/4, $0x682e6ff3
DATA K<>+0xe0(SB)/4, $0x748f82ee
DATA K<>+0xe4(SB)/4, $0x78a5636f
DATA K<>+0xe8(SB)/4, $0x84c87814
DATA K<>+0xec(SB)/4, $0x8cc70208
DATA K<>+0xf0(SB)/4, $0x90befffa
DATA K<>+0xf4(SB)/4, $0xa4506ceb
DATA K<>+0xf8(SB)/4, $0xbef9a3f7
DATA K<>+0xfc(SB)/4, $0xc67178f2
GLOBL K<>(SB), RODATA|NOPTR, $256

// SHUF_MASK<> is the PSHUFB control applied to every 16-byte chunk of message
// data. Its bytes in memory are 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c,
// so it reverses the byte order inside each 32-bit word while leaving the word
// positions unchanged (the message words are stored big-endian).
DATA SHUF_MASK<>+0x00(SB)/8, $0x0405060700010203
DATA SHUF_MASK<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL SHUF_MASK<>(SB), RODATA|NOPTR, $16

// Register Usage
// BX  base address of constant table (constant)
// DX  hash_state (constant)
// SI  hash_data.data
// DI  hash_data.data + hash_data.length - 64 (constant)
// X0  scratch
// X1  scratch
// X2  working hash state // ABEF
// X3  working hash state // CDGH
// X4  first 16 bytes of block
// X5  second 16 bytes of block
// X6  third 16 bytes of block
// X7  fourth 16 bytes of block
// X12 saved hash state // ABEF
// X13 saved hash state // CDGH
// X15 data shuffle mask (constant)

// blockIntelSha folds every complete 64-byte block of the message into the
// SHA-256 state, using the SHA256RNDS2 / SHA256MSG1 / SHA256MSG2 instructions
// (emitted below as raw LONG/WORD encodings).
//
// Arguments (32 bytes of arguments, no local stack frame, NOSPLIT):
//   h+0(FP)             address of the eight 32-bit state words; read on
//                       entry and overwritten with the updated state on exit
//   message_base+8(FP)  address of the first message byte
//   message_len+16(FP)  message length in bytes
//
// The loop runs while SI <= message_base + message_len - 64 (unsigned
// compare) and advances SI by 64 per iteration, so a trailing partial block is
// not processed; with fewer than 64 bytes the loop body is skipped and the
// state is written back unchanged.
//
// NOTE(review): the Go declaration is not part of this file; the 32-byte
// argument area suggests a state pointer plus a byte slice - confirm.
// NOTE(review): no CPU feature check is visible here; the caller presumably
// verifies that the SHA extensions are available before calling - confirm.
TEXT ·blockIntelSha(SB), NOSPLIT, $0-32
	MOVQ      h+0(FP), DX
	MOVQ      message_base+8(FP), SI
	MOVQ      message_len+16(FP), DI
	// DI = address of the last position at which a full 64-byte block can start.
	LEAQ      -64(SI)(DI*1), DI
	// Load the state (words 0-3 into X2, words 4-7 into X1) and rearrange it
	// from memory order into the ABEF (X2) / CDGH (X3) layout listed in the
	// register usage table: interleave the two halves, then permute the words.
	MOVOU     (DX), X2
	MOVOU     16(DX), X1
	MOVO      X2, X3
	PUNPCKLLQ X1, X2
	PUNPCKHLQ X1, X3
	PSHUFD    $0x27, X2, X2
	PSHUFD    $0x27, X3, X3
	MOVO      SHUF_MASK<>(SB), X15
	LEAQ      K<>(SB), BX

	// Enter at the loop condition so that a short message runs zero iterations.
	JMP TEST

LOOP:
	// Keep a copy of the incoming state; it is added back after round 63.
	MOVO X2, X12
	MOVO X3, X13

	// load block and shuffle
	MOVOU  (SI), X4
	MOVOU  16(SI), X5
	MOVOU  32(SI), X6
	MOVOU  48(SI), X7
	PSHUFB X15, X4
	PSHUFB X15, X5
	PSHUFB X15, X6
	PSHUFB X15, X7

// Four-round macros. Each expects X0 to already hold four round constants,
// adds the current four message words to it, runs SHA256RNDS2 twice (swapping
// the 64-bit halves of X0 with PSHUFD $0x4e in between), and interleaves the
// message-schedule update (PALIGNR + PADDL + SHA256MSG2, then SHA256MSG1).
// In ROUNDxyz the digits name the message registers involved: Xx is the
// SHA256MSG1 destination, Xy supplies the current words added to X0, and Xz
// is the PADDL / SHA256MSG2 destination. X1 is used as scratch.
// NOTE(review): SHA256RNDS2 presumably takes its K+W input implicitly from
// XMM0, which is why X0 is always the accumulator - see the Intel reference.
#define ROUND456 \
	PADDL  X5, X0                    \
	LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
	MOVO   X5, X1                    \
	LONG   $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1, XMM4, 4
	PADDL  X1, X6                    \
	LONG   $0xf5cd380f               \ // SHA256MSG2 XMM6, XMM5
	PSHUFD $0x4e, X0, X0             \
	LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
	LONG   $0xe5cc380f               // SHA256MSG1 XMM4, XMM5

#define ROUND567 \
	PADDL  X6, X0                    \
	LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
	MOVO   X6, X1                    \
	LONG   $0x0f3a0f66; WORD $0x04cd \ // PALIGNR XMM1, XMM5, 4
	PADDL  X1, X7                    \
	LONG   $0xfecd380f               \ // SHA256MSG2 XMM7, XMM6
	PSHUFD $0x4e, X0, X0             \
	LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
	LONG   $0xeecc380f               // SHA256MSG1 XMM5, XMM6

#define ROUND674 \
	PADDL  X7, X0                    \
	LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
	MOVO   X7, X1                    \
	LONG   $0x0f3a0f66; WORD $0x04ce \ // PALIGNR XMM1, XMM6, 4
	PADDL  X1, X4                    \
	LONG   $0xe7cd380f               \ // SHA256MSG2 XMM4, XMM7
	PSHUFD $0x4e, X0, X0             \
	LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
	LONG   $0xf7cc380f               // SHA256MSG1 XMM6, XMM7

#define ROUND745 \
	PADDL  X4, X0                    \
	LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
	MOVO   X4, X1                    \
	LONG   $0x0f3a0f66; WORD $0x04cf \ // PALIGNR XMM1, XMM7, 4
	PADDL  X1, X5                    \
	LONG   $0xeccd380f               \ // SHA256MSG2 XMM5, XMM4
	PSHUFD $0x4e, X0, X0             \
	LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
	LONG   $0xfccc380f               // SHA256MSG1 XMM7, XMM4

	// Rounds 0-11 are written out by hand: they use the block words as loaded
	// and only start the schedule (SHA256MSG1 from round 4 on, nothing in 0-3).

	// rounds 0-3
	MOVO   (BX), X0
	PADDL  X4, X0
	LONG   $0xdacb380f   // SHA256RNDS2 XMM3, XMM2
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f   // SHA256RNDS2 XMM2, XMM3

	// rounds 4-7
	MOVO   1*16(BX), X0
	PADDL  X5, X0
	LONG   $0xdacb380f   // SHA256RNDS2 XMM3, XMM2
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f   // SHA256RNDS2 XMM2, XMM3
	LONG   $0xe5cc380f   // SHA256MSG1 XMM4, XMM5

	// rounds 8-11
	MOVO   2*16(BX), X0
	PADDL  X6, X0
	LONG   $0xdacb380f   // SHA256RNDS2 XMM3, XMM2
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f   // SHA256RNDS2 XMM2, XMM3
	LONG   $0xeecc380f   // SHA256MSG1 XMM5, XMM6

	// Rounds 12-51: load the next four constants, then run the macro whose
	// register rotation matches the position in the schedule.
	MOVO 3*16(BX), X0; ROUND674  // rounds 12-15
	MOVO 4*16(BX), X0; ROUND745  // rounds 16-19
	MOVO 5*16(BX), X0; ROUND456  // rounds 20-23
	MOVO 6*16(BX), X0; ROUND567  // rounds 24-27
	MOVO 7*16(BX), X0; ROUND674  // rounds 28-31
	MOVO 8*16(BX), X0; ROUND745  // rounds 32-35
	MOVO 9*16(BX), X0; ROUND456  // rounds 36-39
	MOVO 10*16(BX), X0; ROUND567 // rounds 40-43
	MOVO 11*16(BX), X0; ROUND674 // rounds 44-47
	MOVO 12*16(BX), X0; ROUND745 // rounds 48-51

	// Rounds 52-63 are written out by hand again: same sequence as ROUND456 /
	// ROUND567 but without the trailing SHA256MSG1, and the last group has no
	// schedule work at all (presumably because no further words are needed).

	// rounds 52-55
	MOVO   13*16(BX), X0
	PADDL  X5, X0
	LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
	MOVO   X5, X1
	LONG   $0x0f3a0f66; WORD $0x04cc // PALIGNR XMM1, XMM4, 4
	PADDL  X1, X6
	LONG   $0xf5cd380f               // SHA256MSG2 XMM6, XMM5
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

	// rounds 56-59
	MOVO   14*16(BX), X0
	PADDL  X6, X0
	LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
	MOVO   X6, X1
	LONG   $0x0f3a0f66; WORD $0x04cd // PALIGNR XMM1, XMM5, 4
	PADDL  X1, X7
	LONG   $0xfecd380f               // SHA256MSG2 XMM7, XMM6
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

	// rounds 60-63
	MOVO   15*16(BX), X0
	PADDL  X7, X0
	LONG   $0xdacb380f   // SHA256RNDS2 XMM3, XMM2
	PSHUFD $0x4e, X0, X0
	LONG   $0xd3cb380f   // SHA256RNDS2 XMM2, XMM3

	// Add the state saved at the top of the iteration to the round output.
	PADDL X12, X2
	PADDL X13, X3

	// Advance to the next 64-byte block.
	ADDQ $64, SI

TEST:
	// Continue while a full block still starts at or before DI (unsigned).
	CMPQ SI, DI
	JBE  LOOP

	// Convert the ABEF / CDGH working layout back to memory order:
	// swap halves, blend the matching halves of the other register in, then
	// reverse the four words so X0 holds state words 0-3 and X1 words 4-7.
	PSHUFD $0x4e, X3, X0
	LONG   $0x0e3a0f66; WORD $0xf0c2 // PBLENDW XMM0, XMM2, 0xf0
	PSHUFD $0x4e, X2, X1
	LONG   $0x0e3a0f66; WORD $0x0fcb // PBLENDW XMM1, XMM3, 0x0f
	PSHUFD $0x1b, X0, X0
	PSHUFD $0x1b, X1, X1

	// Store the updated state back through the h pointer.
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)

	RET