gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

block_amd64.go (6469B)


      1 //+build !noasm,!appengine,gc
      2 
      3 // Copyright (c) 2020 MinIO Inc. All rights reserved.
      4 // Use of this source code is governed by a license that can be
      5 // found in the LICENSE file.
      6 
      7 package md5simd
      8 
      9 import (
     10 	"fmt"
     11 	"math"
     12 	"unsafe"
     13 
     14 	"github.com/klauspost/cpuid/v2"
     15 )
     16 
     17 var hasAVX512 bool
     18 
     19 func init() {
     20 	// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
     21 	hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
     22 }
     23 
// block8 is declared without a Go body; blockMd5_avx2 (the "interface function
// to AVX2 assembly code") is its only caller in this file.
//
// As called from blockMd5_avx2:
//   - state points at the first word of a digest8 working copy,
//   - base is the address all lane offsets are relative to,
//   - bufs points at the first of 8 int32 per-lane offsets from base,
//   - cache points at the first byte of a cache8 scratch array,
//   - n is 64 times the round count taken from the current maskRounds entry
//     (presumably a byte count, 64 bytes per MD5 block; confirm against the
//     assembly).
//
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
     26 
// block16 is declared without a Go body; blockMd5_avx512 (the "interface
// function to AVX512 assembly code") is its only caller in this file.
//
// As called from blockMd5_avx512:
//   - state points at the first word of a digest16 working copy,
//   - base is the address all lane offsets are relative to,
//   - ptrs points at the first of 16 int32 per-lane offsets from base,
//   - mask is the lane bitmask of the current maskRounds entry,
//   - n is 64 times the round count taken from that entry (presumably a byte
//     count, 64 bytes per MD5 block; confirm against the assembly).
//
//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)
     29 
     30 // 8-way 4x uint32 digests in 4 ymm registers
     31 // (ymm0, ymm1, ymm2, ymm3)
     32 type digest8 struct {
     33 	v0, v1, v2, v3 [8]uint32
     34 }
     35 
     36 // Stack cache for 8x64 byte md5.BlockSize bytes.
     37 // Must be 32-byte aligned, so allocate 512+32 and
     38 // align upwards at runtime.
     39 type cache8 [512 + 32]byte
     40 
     41 // MD5 magic numbers for one lane of hashing; inflated
     42 // 8x below at init time.
     43 var md5consts = [64]uint32{
     44 	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
     45 	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
     46 	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
     47 	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
     48 	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
     49 	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
     50 	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
     51 	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
     52 	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
     53 	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
     54 	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
     55 	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
     56 	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
     57 	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
     58 	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
     59 	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
     60 }
     61 
     62 // inflate the consts 8-way for 8x md5 (256 bit ymm registers)
     63 var avx256md5consts = func(c []uint32) []uint32 {
     64 	inf := make([]uint32, 8*len(c))
     65 	for i := range c {
     66 		for j := 0; j < 8; j++ {
     67 			inf[(i*8)+j] = c[i]
     68 		}
     69 	}
     70 	return inf
     71 }(md5consts[:])
     72 
     73 // 16-way 4x uint32 digests in 4 zmm registers
     74 type digest16 struct {
     75 	v0, v1, v2, v3 [16]uint32
     76 }
     77 
     78 // inflate the consts 16-way for 16x md5 (512 bit zmm registers)
     79 var avx512md5consts = func(c []uint32) []uint32 {
     80 	inf := make([]uint32, 16*len(c))
     81 	for i := range c {
     82 		for j := 0; j < 16; j++ {
     83 			inf[(i*16)+j] = c[i]
     84 		}
     85 	}
     86 	return inf
     87 }(md5consts[:])
     88 
     89 // Interface function to assembly code
     90 func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
     91 	if hasAVX512 {
     92 		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
     93 		return
     94 	}
     95 
     96 	// Preparing data using copy is slower since copies aren't inlined.
     97 
     98 	// Calculate on this goroutine
     99 	if half {
    100 		for i := range s.i8[0][:] {
    101 			s.i8[0][i] = input[i]
    102 		}
    103 		for i := range s.d8a.v0[:] {
    104 			s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
    105 		}
    106 		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
    107 		for i := range s.d8a.v0[:] {
    108 			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
    109 		}
    110 		return
    111 	}
    112 
    113 	for i := range s.i8[0][:] {
    114 		s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
    115 	}
    116 
    117 	for i := range s.d8a.v0[:] {
    118 		j := (i + 8) & 15
    119 		s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
    120 		s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
    121 	}
    122 
    123 	// Benchmarks appears to be slightly faster when spinning up 2 goroutines instead
    124 	// of using the current for one of the blocks.
    125 	s.wg.Add(2)
    126 	go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
    127 	go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
    128 	s.wg.Wait()
    129 	for i := range s.d8a.v0[:] {
    130 		d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
    131 	}
    132 	for i := range s.d8b.v0[:] {
    133 		j := (i + 8) & 15
    134 		d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
    135 	}
    136 }
    137 
    138 // Interface function to AVX512 assembly code
    139 func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
    140 	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
    141 	ptrs := [16]int32{}
    142 
    143 	for i := range ptrs {
    144 		if len(input[i]) > 0 {
    145 			if len(input[i]) > internalBlockSize {
    146 				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
    147 			}
    148 
    149 			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
    150 			if off > math.MaxUint32 {
    151 				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
    152 			}
    153 			ptrs[i] = int32(off)
    154 		}
    155 	}
    156 
    157 	sdup := *s // create copy of initial states to receive intermediate updates
    158 
    159 	rounds := generateMaskAndRounds16(input, maskRounds)
    160 
    161 	for r := 0; r < rounds; r++ {
    162 		m := maskRounds[r]
    163 
    164 		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))
    165 
    166 		for j := 0; j < len(ptrs); j++ {
    167 			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
    168 			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
    169 				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
    170 			}
    171 		}
    172 	}
    173 }
    174 
    175 // Interface function to AVX2 assembly code
    176 func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
    177 	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
    178 	ptrs := [8]int32{}
    179 
    180 	for i := range ptrs {
    181 		if len(input[i]) > 0 {
    182 			if len(input[i]) > internalBlockSize {
    183 				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
    184 			}
    185 
    186 			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
    187 			if off > math.MaxUint32 {
    188 				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
    189 			}
    190 			ptrs[i] = int32(off)
    191 		}
    192 	}
    193 
    194 	sdup := *s // create copy of initial states to receive intermediate updates
    195 
    196 	rounds := generateMaskAndRounds8(input, maskRounds)
    197 
    198 	for r := 0; r < rounds; r++ {
    199 		m := maskRounds[r]
    200 		var cache cache8 // stack storage for block8 tmp state
    201 		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))
    202 
    203 		for j := 0; j < len(ptrs); j++ {
    204 			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
    205 			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
    206 				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
    207 			}
    208 		}
    209 	}
    210 }