gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

sha256blockAvx512_amd64.go (20519B)


      1 //go:build !noasm && !appengine && gc
      2 // +build !noasm,!appengine,gc
      3 
      4 /*
      5  * Minio Cloud Storage, (C) 2017 Minio, Inc.
      6  *
      7  * Licensed under the Apache License, Version 2.0 (the "License");
      8  * you may not use this file except in compliance with the License.
      9  * You may obtain a copy of the License at
     10  *
     11  *     http://www.apache.org/licenses/LICENSE-2.0
     12  *
     13  * Unless required by applicable law or agreed to in writing, software
     14  * distributed under the License is distributed on an "AS IS" BASIS,
     15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     16  * See the License for the specific language governing permissions and
     17  * limitations under the License.
     18  */
     19 
     20 package sha256
     21 
     22 import (
     23 	"encoding/binary"
     24 	"errors"
     25 	"hash"
     26 	"sort"
     27 	"sync/atomic"
     28 	"time"
     29 )
     30 
// sha256X16Avx512 is declared without a Go body; its implementation lives
// outside this file (presumably in the accompanying amd64 assembly - confirm
// against the .s source). Within this file it is only called by blockAvx512,
// which passes the per-lane digest state, a 512-byte scratch buffer, the
// constant table, the per-round lane masks and the 16 lane inputs.
//
//go:noescape
func sha256X16Avx512(digests *[512]byte, scratch *[512]byte, table *[512]uint64, mask []uint64, inputs [16][]byte)
     33 
// Avx512ServerUID - Do not start at 0 but at the next multiple of 16, so that
// a uid can be told apart from the default initialization value of 0.
const Avx512ServerUID = 16

// uidCounter is atomically incremented by NewAvx512; each new Avx512Digest
// receives the incremented value as its uid.
var uidCounter uint64
     39 
     40 // NewAvx512 - initialize sha256 Avx512 implementation.
     41 func NewAvx512(a512srv *Avx512Server) hash.Hash {
     42 	uid := atomic.AddUint64(&uidCounter, 1)
     43 	return &Avx512Digest{uid: uid, a512srv: a512srv}
     44 }
     45 
// Avx512Digest - Type for computing SHA256 using Avx512
//
// The digest does not hash anything itself: Write, Sum and Reset forward
// messages tagged with uid to a512srv over its blocksCh channel.
type Avx512Digest struct {
	uid     uint64        // identifier attached to every message sent to the server
	a512srv *Avx512Server // server that receives this digest's blocks
	x       [chunk]byte   // written bytes that do not yet form a full chunk
	nx      int           // number of valid bytes at the start of x
	len     uint64        // total number of bytes accepted by Write
	final   bool          // set by Sum; Write returns an error until Reset clears it
	result  [Size]byte    // checksum cached by Sum once final is set
}
     56 
     57 // Size - Return size of checksum
     58 func (d *Avx512Digest) Size() int { return Size }
     59 
     60 // BlockSize - Return blocksize of checksum
     61 func (d Avx512Digest) BlockSize() int { return BlockSize }
     62 
     63 // Reset - reset sha digest to its initial values
     64 func (d *Avx512Digest) Reset() {
     65 	d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true}
     66 	d.nx = 0
     67 	d.len = 0
     68 	d.final = false
     69 }
     70 
     71 // Write to digest
     72 func (d *Avx512Digest) Write(p []byte) (nn int, err error) {
     73 
     74 	if d.final {
     75 		return 0, errors.New("Avx512Digest already finalized. Reset first before writing again")
     76 	}
     77 
     78 	nn = len(p)
     79 	d.len += uint64(nn)
     80 	if d.nx > 0 {
     81 		n := copy(d.x[d.nx:], p)
     82 		d.nx += n
     83 		if d.nx == chunk {
     84 			d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]}
     85 			d.nx = 0
     86 		}
     87 		p = p[n:]
     88 	}
     89 	if len(p) >= chunk {
     90 		n := len(p) &^ (chunk - 1)
     91 		d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]}
     92 		p = p[n:]
     93 	}
     94 	if len(p) > 0 {
     95 		d.nx = copy(d.x[:], p)
     96 	}
     97 	return
     98 }
     99 
    100 // Sum - Return sha256 sum in bytes
    101 func (d *Avx512Digest) Sum(in []byte) (result []byte) {
    102 
    103 	if d.final {
    104 		return append(in, d.result[:]...)
    105 	}
    106 
    107 	trail := make([]byte, 0, 128)
    108 	trail = append(trail, d.x[:d.nx]...)
    109 
    110 	len := d.len
    111 	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
    112 	var tmp [64]byte
    113 	tmp[0] = 0x80
    114 	if len%64 < 56 {
    115 		trail = append(trail, tmp[0:56-len%64]...)
    116 	} else {
    117 		trail = append(trail, tmp[0:64+56-len%64]...)
    118 	}
    119 	d.nx = 0
    120 
    121 	// Length in bits.
    122 	len <<= 3
    123 	for i := uint(0); i < 8; i++ {
    124 		tmp[i] = byte(len >> (56 - 8*i))
    125 	}
    126 	trail = append(trail, tmp[0:8]...)
    127 
    128 	sumCh := make(chan [Size]byte)
    129 	d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh}
    130 	d.result = <-sumCh
    131 	d.final = true
    132 	return append(in, d.result[:]...)
    133 }
    134 
    135 var table = [512]uint64{
    136 	0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98,
    137 	0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98,
    138 	0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491,
    139 	0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491,
    140 	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf,
    141 	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf,
    142 	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5,
    143 	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5,
    144 	0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b,
    145 	0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b,
    146 	0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1,
    147 	0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1,
    148 	0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4,
    149 	0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4,
    150 	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5,
    151 	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5,
    152 	0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98,
    153 	0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98,
    154 	0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01,
    155 	0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01,
    156 	0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be,
    157 	0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be,
    158 	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3,
    159 	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3,
    160 	0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74,
    161 	0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74,
    162 	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe,
    163 	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe,
    164 	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7,
    165 	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7,
    166 	0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174,
    167 	0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174,
    168 	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1,
    169 	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1,
    170 	0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786,
    171 	0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786,
    172 	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6,
    173 	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6,
    174 	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc,
    175 	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc,
    176 	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f,
    177 	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f,
    178 	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa,
    179 	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa,
    180 	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc,
    181 	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc,
    182 	0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da,
    183 	0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da,
    184 	0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152,
    185 	0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152,
    186 	0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d,
    187 	0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d,
    188 	0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8,
    189 	0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8,
    190 	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7,
    191 	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7,
    192 	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3,
    193 	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3,
    194 	0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147,
    195 	0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147,
    196 	0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351,
    197 	0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351,
    198 	0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967,
    199 	0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967,
    200 	0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85,
    201 	0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85,
    202 	0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138,
    203 	0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138,
    204 	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc,
    205 	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc,
    206 	0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13,
    207 	0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13,
    208 	0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354,
    209 	0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354,
    210 	0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb,
    211 	0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb,
    212 	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e,
    213 	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e,
    214 	0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85,
    215 	0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85,
    216 	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1,
    217 	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1,
    218 	0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b,
    219 	0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b,
    220 	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70,
    221 	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70,
    222 	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3,
    223 	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3,
    224 	0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819,
    225 	0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819,
    226 	0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624,
    227 	0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624,
    228 	0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585,
    229 	0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585,
    230 	0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070,
    231 	0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070,
    232 	0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116,
    233 	0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116,
    234 	0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08,
    235 	0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08,
    236 	0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c,
    237 	0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c,
    238 	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5,
    239 	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5,
    240 	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3,
    241 	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3,
    242 	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a,
    243 	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a,
    244 	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f,
    245 	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f,
    246 	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3,
    247 	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3,
    248 	0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee,
    249 	0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee,
    250 	0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f,
    251 	0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f,
    252 	0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814,
    253 	0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814,
    254 	0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208,
    255 	0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208,
    256 	0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa,
    257 	0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa,
    258 	0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb,
    259 	0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb,
    260 	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7,
    261 	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7,
    262 	0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2,
    263 	0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2}
    264 
    265 // Interface function to assembly ode
    266 func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte {
    267 
    268 	scratch := [512]byte{}
    269 	sha256X16Avx512(digests, &scratch, &table, mask, input)
    270 
    271 	output := [16][Size]byte{}
    272 	for i := 0; i < 16; i++ {
    273 		output[i] = getDigest(i, digests[:])
    274 	}
    275 
    276 	return output
    277 }
    278 
    279 func getDigest(index int, state []byte) (sum [Size]byte) {
    280 	for j := 0; j < 16; j += 2 {
    281 		for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size {
    282 			binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4]))
    283 		}
    284 	}
    285 	return
    286 }
    287 
// Message to send across input channel
//
// Senders in this file set either reset, or msg (optionally together with
// final and sumCh); Avx512Server.Process decides how each message is handled.
type blockInput struct {
	uid   uint64          // identifies the hash computation; Process uses uid & 0xf as lane index
	msg   []byte          // data to hash; stored in the lane as-is (not copied) by Process
	reset bool            // when set, Process only calls reset(uid) and ignores the other fields
	final bool            // marks the last block; Process then attaches sumCh to the lane
	sumCh chan [Size]byte // channel on which the resulting digest is sent back for a final block
}
    296 
// Avx512Server - Type to implement 16x parallel handling of SHA256 invocations
//
// Clients talk to the server through blocksCh only. The remaining fields are
// read and written by Process and the helpers it calls (reset, blocks,
// getDigests); no locking is visible in this file, so they are presumably
// meant to be touched from the Process goroutine alone.
type Avx512Server struct {
	blocksCh chan blockInput       // Input channel
	totalIn  int                   // Total number of inputs waiting to be processed
	lanes    [16]Avx512LaneInfo    // Array with info per lane (out of 16)
	digests  map[uint64][Size]byte // Map of uids to (interim) digest results
}
    304 
// Avx512LaneInfo - Info for each lane
//
// A lane whose block is nil is treated as empty by Process and reset.
// outputCh is only set by Process for a final block; blocks sends the lane's
// digest on it after the hashing pass.
type Avx512LaneInfo struct {
	uid      uint64          // unique identification for this SHA processing
	block    []byte          // input block to be processed
	outputCh chan [Size]byte // channel for output result
}
    311 
    312 // NewAvx512Server - Create new object for parallel processing handling
    313 func NewAvx512Server() *Avx512Server {
    314 	a512srv := &Avx512Server{}
    315 	a512srv.digests = make(map[uint64][Size]byte)
    316 	a512srv.blocksCh = make(chan blockInput)
    317 
    318 	// Start a single thread for reading from the input channel
    319 	go a512srv.Process()
    320 	return a512srv
    321 }
    322 
    323 // Process - Sole handler for reading from the input channel
    324 func (a512srv *Avx512Server) Process() {
    325 	for {
    326 		select {
    327 		case block := <-a512srv.blocksCh:
    328 			if block.reset {
    329 				a512srv.reset(block.uid)
    330 				continue
    331 			}
    332 			index := block.uid & 0xf
    333 			// fmt.Println("Adding message:", block.uid, index)
    334 
    335 			if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs
    336 				//fmt.Println("Invoking Blocks()")
    337 				a512srv.blocks()
    338 			}
    339 			a512srv.totalIn++
    340 			a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg}
    341 			if block.final {
    342 				a512srv.lanes[index].outputCh = block.sumCh
    343 			}
    344 			if a512srv.totalIn == len(a512srv.lanes) {
    345 				// fmt.Println("Invoking Blocks() while FULL: ")
    346 				a512srv.blocks()
    347 			}
    348 
    349 			// TODO: test with larger timeout
    350 		case <-time.After(1 * time.Microsecond):
    351 			for _, lane := range a512srv.lanes {
    352 				if lane.block != nil { // check if there is any input to process
    353 					// fmt.Println("Invoking Blocks() on TIMEOUT: ")
    354 					a512srv.blocks()
    355 					break // we are done
    356 				}
    357 			}
    358 		}
    359 	}
    360 }
    361 
    362 // Do a reset for this calculation
    363 func (a512srv *Avx512Server) reset(uid uint64) {
    364 
    365 	// Check if there is a message still waiting to be processed (and remove if so)
    366 	for i, lane := range a512srv.lanes {
    367 		if lane.uid == uid {
    368 			if lane.block != nil {
    369 				a512srv.lanes[i] = Avx512LaneInfo{} // clear message
    370 				a512srv.totalIn--
    371 			}
    372 		}
    373 	}
    374 
    375 	// Delete entry from hash map
    376 	delete(a512srv.digests, uid)
    377 }
    378 
    379 // Invoke assembly and send results back
    380 func (a512srv *Avx512Server) blocks() {
    381 
    382 	inputs := [16][]byte{}
    383 	for i := range inputs {
    384 		inputs[i] = a512srv.lanes[i].block
    385 	}
    386 
    387 	mask := expandMask(genMask(inputs))
    388 	outputs := blockAvx512(a512srv.getDigests(), inputs, mask)
    389 
    390 	a512srv.totalIn = 0
    391 	for i := 0; i < len(outputs); i++ {
    392 		uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh
    393 		a512srv.digests[uid] = outputs[i]
    394 		a512srv.lanes[i] = Avx512LaneInfo{}
    395 
    396 		if outputCh != nil {
    397 			// Send back result
    398 			outputCh <- outputs[i]
    399 			delete(a512srv.digests, uid) // Delete entry from hashmap
    400 		}
    401 	}
    402 }
    403 
    404 func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) {
    405 	a512srv.blocksCh <- blockInput{uid: uid, msg: p}
    406 	return len(p), nil
    407 }
    408 
    409 // Sum - return sha256 sum in bytes for a given sum id.
    410 func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte {
    411 	sumCh := make(chan [32]byte)
    412 	a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh}
    413 	return <-sumCh
    414 }
    415 
    416 func (a512srv *Avx512Server) getDigests() *[512]byte {
    417 	digests := [512]byte{}
    418 	for i, lane := range a512srv.lanes {
    419 		a, ok := a512srv.digests[lane.uid]
    420 		if ok {
    421 			binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4]))
    422 			binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8]))
    423 			binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12]))
    424 			binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16]))
    425 			binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20]))
    426 			binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24]))
    427 			binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28]))
    428 			binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32]))
    429 		} else {
    430 			binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
    431 			binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
    432 			binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
    433 			binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
    434 			binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
    435 			binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
    436 			binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
    437 			binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
    438 		}
    439 	}
    440 	return &digests
    441 }
    442 
    443 // Helper struct for sorting blocks based on length
    444 type lane struct {
    445 	len uint
    446 	pos uint
    447 }
    448 
    449 type lanes []lane
    450 
    451 func (lns lanes) Len() int           { return len(lns) }
    452 func (lns lanes) Swap(i, j int)      { lns[i], lns[j] = lns[j], lns[i] }
    453 func (lns lanes) Less(i, j int) bool { return lns[i].len < lns[j].len }
    454 
// Helper struct pairing a lane mask with the number of consecutive rounds it
// applies to. genMask produces these entries and expandMask flattens them into
// one mask value per round.
type maskRounds struct {
	mask   uint64 // bit i is set while the lane at position i still has blocks left
	rounds uint64 // number of 64-byte rounds during which mask applies
}
    460 
    461 func genMask(input [16][]byte) [16]maskRounds {
    462 
    463 	// Sort on blocks length small to large
    464 	var sorted [16]lane
    465 	for c, inpt := range input {
    466 		sorted[c] = lane{uint(len(inpt)), uint(c)}
    467 	}
    468 	sort.Sort(lanes(sorted[:]))
    469 
    470 	// Create mask array including 'rounds' between masks
    471 	m, round, index := uint64(0xffff), uint64(0), 0
    472 	var mr [16]maskRounds
    473 	for _, s := range sorted {
    474 		if s.len > 0 {
    475 			if uint64(s.len)>>6 > round {
    476 				mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round}
    477 				index++
    478 			}
    479 			round = uint64(s.len) >> 6
    480 		}
    481 		m = m & ^(1 << uint(s.pos))
    482 	}
    483 
    484 	return mr
    485 }
    486 
    487 // TODO: remove function
    488 func expandMask(mr [16]maskRounds) []uint64 {
    489 	size := uint64(0)
    490 	for _, r := range mr {
    491 		size += r.rounds
    492 	}
    493 	result, index := make([]uint64, size), 0
    494 	for _, r := range mr {
    495 		for j := uint64(0); j < r.rounds; j++ {
    496 			result[index] = r.mask
    497 			index++
    498 		}
    499 	}
    500 	return result
    501 }