block_amd64.go (6469B)
//go:build !noasm && !appengine && gc
// +build !noasm,!appengine,gc

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

package md5simd

import (
	"fmt"
	"math"
	"unsafe"

	"github.com/klauspost/cpuid/v2"
)

var hasAVX512 bool

func init() {
	// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ, which is AVX512F.
	hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}

//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)

//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)

// 8-way 4x uint32 digests in 4 ymm registers
// (ymm0, ymm1, ymm2, ymm3)
type digest8 struct {
	v0, v1, v2, v3 [8]uint32
}

// Stack cache for 8 lanes of 64 bytes (md5.BlockSize) each.
// Must be 32-byte aligned, so allocate 512+32 bytes and
// align upwards at runtime.
type cache8 [512 + 32]byte

// MD5 magic numbers for one lane of hashing; inflated
// 8x and 16x below at init time.
var md5consts = [64]uint32{
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}

// inflate the consts 8-way for 8x md5 (256-bit ymm registers)
var avx256md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 8*len(c))
	for i := range c {
		for j := 0; j < 8; j++ {
			inf[(i*8)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])

// 16-way 4x uint32 digests in 4 zmm registers
type digest16 struct {
	v0, v1, v2, v3 [16]uint32
}

// inflate the consts 16-way for 16x md5 (512-bit zmm registers)
var avx512md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 16*len(c))
	for i := range c {
		for j := 0; j < 16; j++ {
			inf[(i*16)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])
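
// The cache8 comment above says the buffer must be 32-byte aligned, with the
// extra 32 bytes allowing the pointer to be rounded up at runtime (block8
// receives the raw &cache[0], so the rounding presumably happens on the
// assembly side). A minimal sketch of that upward alignment in Go, using a
// hypothetical helper name not present in this package:
func alignCache8(c *cache8) *byte {
	p := uintptr(unsafe.Pointer(&c[0]))
	p = (p + 31) &^ 31 // round up to the next 32-byte boundary
	return (*byte)(unsafe.Pointer(p))
}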

// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
	if hasAVX512 {
		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
		return
	}

	// Preparing data using copy is slower since copies aren't inlined.

	// Calculate on this goroutine
	if half {
		for i := range s.i8[0][:] {
			s.i8[0][i] = input[i]
		}
		for i := range s.d8a.v0[:] {
			s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		}
		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
		for i := range s.d8a.v0[:] {
			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
		}
		return
	}

	for i := range s.i8[0][:] {
		s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
	}

	for i := range s.d8a.v0[:] {
		j := (i + 8) & 15
		s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
	}

	// Benchmarks appear to be slightly faster when spinning up 2 goroutines
	// instead of using the current goroutine for one of the blocks.
	s.wg.Add(2)
	go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
	go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
	s.wg.Wait()
	for i := range s.d8a.v0[:] {
		d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
	}
	for i := range s.d8b.v0[:] {
		j := (i + 8) & 15
		d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
	}
}

// Interface function to AVX512 assembly code
func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
	ptrs := [16]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("sanity check failed for lane %d: input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create a copy of the initial states to receive intermediate updates

	rounds := generateMaskAndRounds16(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]

		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for the next round
			if m.mask&(1<<j) != 0 {         // update digest if the lane is still active
				s.v0[j], s.v1[j], s.v2[j], s.v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}
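
// generateMaskAndRounds16 (defined elsewhere in this package) groups the 16
// lanes into rounds: each maskRounds entry holds a bitmask of lanes that still
// have data and the number of 64-byte blocks to hash before the shortest
// active lane is exhausted. A simplified sketch of that grouping, assuming
// maskRounds has uint64 fields mask and rounds and that every input length is
// a multiple of 64 (illustrative only, not the implementation used above):
func sketchMaskAndRounds16(input [16][]byte) []maskRounds {
	var blocks [16]uint64
	for i := range input {
		blocks[i] = uint64(len(input[i])) / 64 // whole 64-byte blocks per lane
	}
	var out []maskRounds
	var done uint64 // blocks already consumed from every active lane
	for {
		var mask uint64
		min := uint64(math.MaxUint64)
		for i, b := range blocks {
			if b > done {
				mask |= 1 << i
				if b-done < min {
					min = b - done
				}
			}
		}
		if mask == 0 {
			return out // all lanes exhausted
		}
		out = append(out, maskRounds{mask: mask, rounds: min})
		done += min
	}
}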

// Interface function to AVX2 assembly code
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
	ptrs := [8]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("sanity check failed for lane %d: input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create a copy of the initial states to receive intermediate updates

	rounds := generateMaskAndRounds8(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]
		var cache cache8 // stack storage for block8 tmp state
		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for the next round
			if m.mask&(1<<j) != 0 {         // update digest if the lane is still active
				s.v0[j], s.v1[j], s.v2[j], s.v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}
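
// Both assembly interfaces above address each lane as a 32-bit offset from a
// single shared base buffer, so every non-empty lane must point into base. A
// minimal usage sketch under that assumption (hypothetical sizing; in the real
// code s.allBufs plays the role of base):
func exampleBlockMd5Avx2() {
	base := make([]byte, 8*internalBlockSize)
	var input [8][]byte
	for i := range input {
		// carve one 64-byte block per lane out of the shared buffer
		input[i] = base[i*internalBlockSize : i*internalBlockSize+64]
	}
	var d digest8 // the caller seeds this with the per-lane MD5 initial state
	var mr [8]maskRounds
	blockMd5_avx2(&d, input, base, &mr)
}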