commit f28cf793ee53e8391c9eabbfba93afbc5b59936b parent 667e7f112ce7b5b7452c392bbbe393a4c998508d Author: tsmethurst <tobi.smethurst@protonmail.com> Date: Mon, 24 Jan 2022 17:35:13 +0100 upgrade go-store Diffstat:
44 files changed, 645 insertions(+), 4964 deletions(-)
diff --git a/go.mod b/go.mod @@ -4,7 +4,7 @@ go 1.17 require ( codeberg.org/gruf/go-runners v1.2.0 - codeberg.org/gruf/go-store v1.2.2 + codeberg.org/gruf/go-store v1.3.2 github.com/ReneKroon/ttlcache v1.7.0 github.com/buckket/go-blurhash v1.1.0 github.com/coreos/go-oidc/v3 v3.1.0 @@ -50,7 +50,7 @@ require ( codeberg.org/gruf/go-fastpath v1.0.2 // indirect codeberg.org/gruf/go-format v1.0.3 // indirect codeberg.org/gruf/go-hashenc v1.0.1 // indirect - codeberg.org/gruf/go-mutexes v1.0.1 // indirect + codeberg.org/gruf/go-mutexes v1.1.0 // indirect codeberg.org/gruf/go-nowish v1.1.0 // indirect codeberg.org/gruf/go-pools v1.0.2 // indirect github.com/aymerick/douceur v0.2.0 // indirect diff --git a/go.sum b/go.sum @@ -62,6 +62,8 @@ codeberg.org/gruf/go-hashenc v1.0.1 h1:EBvNe2wW8IPMUqT1XihB6/IM6KMJDLMFBxIUvmsy1 codeberg.org/gruf/go-hashenc v1.0.1/go.mod h1:IfHhPCVScOiYmJLqdCQT9bYVS1nxNTV4ewMUvFWDPtc= codeberg.org/gruf/go-mutexes v1.0.1 h1:X9bZW74YSEplWWdCrVXAvue5ztw3w5hh+INdXTENu88= codeberg.org/gruf/go-mutexes v1.0.1/go.mod h1:y2hbGLkWVHhNyxBOIVsA3/y2QMm6RSrYsC3sLVZ4EXM= +codeberg.org/gruf/go-mutexes v1.1.0 h1:kMVWHLxdfGEZTetNVRncdBMeqS4M8dSJxSGbRYXyvKk= +codeberg.org/gruf/go-mutexes v1.1.0/go.mod h1:1j/6/MBeBQUedAtAtysLLnBKogfOZAxdym0E3wlaBD8= codeberg.org/gruf/go-nowish v1.0.0/go.mod h1:70nvICNcqQ9OHpF07N614Dyk7cpL5ToWU1K1ZVCec2s= codeberg.org/gruf/go-nowish v1.1.0 h1:rj1z0AXDhLvnxs/DazWFxYAugs6rv5vhgWJkRCgrESg= codeberg.org/gruf/go-nowish v1.1.0/go.mod h1:70nvICNcqQ9OHpF07N614Dyk7cpL5ToWU1K1ZVCec2s= @@ -72,6 +74,8 @@ codeberg.org/gruf/go-runners v1.2.0 h1:tkoPrwYMkVg1o/C4PGTR1YbC11XX4r06uLPOYajBs codeberg.org/gruf/go-runners v1.2.0/go.mod h1:9gTrmMnO3d+50C+hVzcmGBf+zTuswReS278E2EMvnmw= codeberg.org/gruf/go-store v1.2.2 h1:YJPzJpZv/D3t9hQC00/u76eQDScQw4++OWjfobnjHAA= codeberg.org/gruf/go-store v1.2.2/go.mod h1:Xjw1U098th0yXF2CCx6jThQ+9FIPWAX9OGjYslO+UtE= +codeberg.org/gruf/go-store v1.3.2 h1:cLTMEqyK0uF/bt1ULkRR4h41Pdgxwvw3uxSpLUublHo= +codeberg.org/gruf/go-store v1.3.2/go.mod h1:g4+9h3wbwZ6IW0uhpw57xywcqiy4CIj0zQLqqtjEU1M= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= diff --git a/vendor/codeberg.org/gruf/go-mutexes/map.go b/vendor/codeberg.org/gruf/go-mutexes/map.go @@ -1,105 +1,322 @@ package mutexes import ( + "runtime" "sync" + "sync/atomic" ) +// locktype defines maskable mutexmap lock types. +type locktype uint8 + +const ( + // possible lock types. + lockTypeRead = locktype(1) << 0 + lockTypeWrite = locktype(1) << 1 + lockTypeMap = locktype(1) << 2 + + // possible mutexmap states. + stateUnlockd = uint8(0) + stateRLocked = uint8(1) + stateLocked = uint8(2) + stateInUse = uint8(3) +) + +// permitLockType returns if provided locktype is permitted to go ahead in current state. +func permitLockType(state uint8, lt locktype) bool { + switch state { + // Unlocked state + // (all allowed) + case stateUnlockd: + return true + + // Keys locked, no state lock. + // (don't allow map locks) + case stateInUse: + return lt&lockTypeMap == 0 + + // Read locked + // (only allow read locks) + case stateRLocked: + return lt&lockTypeRead != 0 + + // Write locked + // (none allowed) + case stateLocked: + return false + + // shouldn't reach here + default: + panic("unexpected state") + } +} + // MutexMap is a structure that allows having a map of self-evicting mutexes // by key. You do not need to worry about managing the contents of the map, // only requesting RLock/Lock for keys, and ensuring to call the returned // unlock functions. type MutexMap struct { - // NOTE: - // Individual keyed mutexes should ONLY ever - // be locked within the protection of the outer - // mapMu lock. If you lock these outside the - // protection of this, there is a chance for - // deadlocks - mus map[string]RWMutex mapMu sync.Mutex pool sync.Pool + queue []func() + evict []func() + count int32 + maxmu int32 + state uint8 } -// NewMap returns a new MutexMap instance based on supplied -// RWMutex allocator function, nil implies use default -func NewMap(newFn func() RWMutex) MutexMap { - if newFn == nil { - newFn = NewRW +// NewMap returns a new MutexMap instance with provided max no. open mutexes. +func NewMap(max int32) MutexMap { + if max < 1 { + // Default = 128 * GOMAXPROCS + procs := runtime.GOMAXPROCS(0) + max = int32(procs * 128) } return MutexMap{ - mus: make(map[string]RWMutex), - mapMu: sync.Mutex{}, + mus: make(map[string]RWMutex), pool: sync.Pool{ New: func() interface{} { - return newFn() + return NewRW() }, }, + maxmu: max, + } +} + +// acquire will either acquire a mutex from pool or alloc. +func (mm *MutexMap) acquire() RWMutex { + return mm.pool.Get().(RWMutex) +} + +// release will release provided mutex to pool. +func (mm *MutexMap) release(mu RWMutex) { + mm.pool.Put(mu) +} + +// spinLock will wait (using a mutex to sleep thread) until 'cond()' returns true, +// returning with map lock. Note that 'cond' is performed within a map lock. +func (mm *MutexMap) spinLock(cond func() bool) { + mu := mm.acquire() + defer mm.release(mu) + + for { + // Get map lock + mm.mapMu.Lock() + + // Check if return + if cond() { + return + } + + // Queue ourselves + unlock := mu.Lock() + mm.queue = append(mm.queue, unlock) + mm.mapMu.Unlock() + + // Wait on notify + mu.Lock()() + } +} + +// lockMutex will acquire a lock on the mutex at provided key, handling earlier allocated mutex if provided. Unlocks map on return. +func (mm *MutexMap) lockMutex(key string, lt locktype) func() { + var unlock func() + + // Incr counter + mm.count++ + + // Check for existing mutex at key + mu, ok := mm.mus[key] + if !ok { + // Alloc from pool + mu = mm.acquire() + mm.mus[key] = mu + + // Queue mutex for eviction + mm.evict = append(mm.evict, func() { + delete(mm.mus, key) + mm.pool.Put(mu) + }) + } + + // If no state, set in use. + // State will already have been + // set if this is from LockState{} + if mm.state == stateUnlockd { + mm.state = stateInUse + } + + switch { + // Read lock + case lt&lockTypeRead != 0: + unlock = mu.RLock() + + // Write lock + case lt&lockTypeWrite != 0: + unlock = mu.Lock() + + // shouldn't reach here + default: + panic("unexpected lock type") + } + + // Unlock map + return + mm.mapMu.Unlock() + return func() { + mm.mapMu.Lock() + unlock() + go mm.onUnlock() } } -func (mm *MutexMap) evict(key string, mu RWMutex) { - // Acquire map lock - mm.mapMu.Lock() +// onUnlock is performed as the final (async) stage of releasing an acquired key / map mutex. +func (mm *MutexMap) onUnlock() { + // Decr counter + mm.count-- + + if mm.count < 1 { + // Perform all queued evictions + for i := 0; i < len(mm.evict); i++ { + mm.evict[i]() + } - // Toggle mutex lock to - // ensure it is unused - unlock := mu.Lock() - unlock() + // Notify all waiting goroutines + for i := 0; i < len(mm.queue); i++ { + mm.queue[i]() + } - // Delete mutex key - delete(mm.mus, key) + // Reset the map state + mm.evict = nil + mm.queue = nil + mm.state = stateUnlockd + } + + // Finally, unlock mm.mapMu.Unlock() +} - // Release to pool - mm.pool.Put(mu) +// RLockMap acquires a read lock over the entire map, returning a lock state for acquiring key read locks. +// Please note that the 'unlock()' function will block until all keys locked from this state are unlocked. +func (mm *MutexMap) RLockMap() *LockState { + return mm.getMapLock(lockTypeRead) } -// RLock acquires a mutex read lock for supplied key, returning an RUnlock function -func (mm *MutexMap) RLock(key string) func() { - return mm.getLock(key, func(mu RWMutex) func() { - return mu.RLock() +// LockMap acquires a write lock over the entire map, returning a lock state for acquiring key read/write locks. +// Please note that the 'unlock()' function will block until all keys locked from this state are unlocked. +func (mm *MutexMap) LockMap() *LockState { + return mm.getMapLock(lockTypeWrite) +} + +// RLock acquires a mutex read lock for supplied key, returning an RUnlock function. +func (mm *MutexMap) RLock(key string) (runlock func()) { + return mm.getLock(key, lockTypeRead) +} + +// Lock acquires a mutex write lock for supplied key, returning an Unlock function. +func (mm *MutexMap) Lock(key string) (unlock func()) { + return mm.getLock(key, lockTypeWrite) +} + +// getLock will fetch lock of provided type, for given key, returning unlock function. +func (mm *MutexMap) getLock(key string, lt locktype) func() { + // Spin until achieve lock + mm.spinLock(func() bool { + return permitLockType(mm.state, lt) && + mm.count < mm.maxmu // not overloaded }) + + // Perform actual mutex lock + return mm.lockMutex(key, lt) } -// Lock acquires a mutex lock for supplied key, returning an Unlock function -func (mm *MutexMap) Lock(key string) func() { - return mm.getLock(key, func(mu RWMutex) func() { - return mu.Lock() +// getMapLock will acquire a map lock of provided type, returning a LockState session. +func (mm *MutexMap) getMapLock(lt locktype) *LockState { + // Spin until achieve lock + mm.spinLock(func() bool { + return permitLockType(mm.state, lt|lockTypeMap) && + mm.count < mm.maxmu // not overloaded }) + + // Incr counter + mm.count++ + + switch { + // Set read lock state + case lt&lockTypeRead != 0: + mm.state = stateRLocked + + // Set write lock state + case lt&lockTypeWrite != 0: + mm.state = stateLocked + + default: + panic("unexpected lock type") + } + + // Unlock + return + mm.mapMu.Unlock() + return &LockState{ + mmap: mm, + ltyp: lt, + } } -func (mm *MutexMap) getLock(key string, doLock func(RWMutex) func()) func() { - // Get map lock - mm.mapMu.Lock() +// LockState represents a window to a locked MutexMap. +type LockState struct { + wait sync.WaitGroup + mmap *MutexMap + done uint32 + ltyp locktype +} - // Look for mutex - mu, ok := mm.mus[key] - if ok { - // Lock and return - // its unlocker func - unlock := doLock(mu) - mm.mapMu.Unlock() - return unlock +// Lock: see MutexMap.Lock() definition. Will panic if map only read locked. +func (st *LockState) Lock(key string) (unlock func()) { + return st.getLock(key, lockTypeWrite) +} + +// RLock: see MutexMap.RLock() definition. +func (st *LockState) RLock(key string) (runlock func()) { + return st.getLock(key, lockTypeRead) +} + +// UnlockMap will close this state and release the currently locked map. +func (st *LockState) UnlockMap() { + // Set state to finished (or panic if already done) + if !atomic.CompareAndSwapUint32(&st.done, 0, 1) { + panic("called UnlockMap() on expired state") } - // Note: even though the mutex data structure is - // small, benchmarking does actually show that pooled - // alloc of mutexes here is faster + // Wait until done + st.wait.Wait() - // Acquire mu + add - mu = mm.pool.Get().(RWMutex) - mm.mus[key] = mu + // Async reset map + st.mmap.mapMu.Lock() + go st.mmap.onUnlock() +} - // Lock mutex + unlock map - unlockFn := doLock(mu) - mm.mapMu.Unlock() +// getLock: see MutexMap.getLock() definition. +func (st *LockState) getLock(key string, lt locktype) func() { + st.wait.Add(1) // track lock - return func() { - // Unlock mutex - unlockFn() + // Check if closed, or if write lock is allowed + if atomic.LoadUint32(&st.done) == 1 { + panic("map lock closed") + } else if lt&lockTypeWrite != 0 && + st.ltyp&lockTypeWrite == 0 { + panic("called .Lock() on rlocked map") + } + + // Spin until achieve map lock + st.mmap.spinLock(func() bool { + return st.mmap.count < st.mmap.maxmu + }) // i.e. not overloaded - // Release function - go mm.evict(key, mu) + // Perform actual mutex lock + unlock := st.mmap.lockMutex(key, lt) + + return func() { + unlock() + st.wait.Done() } } diff --git a/vendor/codeberg.org/gruf/go-mutexes/mutex_timeout.go b/vendor/codeberg.org/gruf/go-mutexes/mutex_timeout.go @@ -3,8 +3,6 @@ package mutexes import ( "sync" "time" - - "codeberg.org/gruf/go-nowish" ) // TimeoutMutex defines a Mutex with timeouts on locks @@ -73,14 +71,6 @@ func (mu *timeoutRWMutex) RLockFunc(fn func()) func() { return mutexTimeout(mu.rd, mu.mu.RLock(), fn) } -// timeoutPool provides nowish.Timeout objects for timeout mutexes -var timeoutPool = sync.Pool{ - New: func() interface{} { - t := nowish.NewTimeout() - return &t - }, -} - // mutexTimeout performs a timed unlock, calling supplied fn if timeout is reached func mutexTimeout(d time.Duration, unlock func(), fn func()) func() { if d < 1 { @@ -88,18 +78,65 @@ func mutexTimeout(d time.Duration, unlock func(), fn func()) func() { return unlock } - // Acquire timeout obj - t := timeoutPool.Get().(*nowish.Timeout) + // Acquire timer from pool + t := timerPool.Get().(*timer) - // Start the timeout with hook - t.Start(d, fn) + // Start the timer + go t.Start(d, fn) // Return func cancelling timeout, // replacing Timeout in pool and // finally unlocking mutex return func() { + defer timerPool.Put(t) t.Cancel() - timeoutPool.Put(t) unlock() } } + +// timerPool is the global &timer{} pool. +var timerPool = sync.Pool{ + New: func() interface{} { + return newtimer() + }, +} + +// timer represents a reusable cancellable timer. +type timer struct { + t *time.Timer + c chan struct{} +} + +// newtimer returns a new timer instance. +func newtimer() *timer { + t := time.NewTimer(time.Minute) + t.Stop() + return &timer{t: t, c: make(chan struct{})} +} + +// Start will start the timer with duration 'd', performing 'fn' on timeout. +func (t *timer) Start(d time.Duration, fn func()) { + t.t.Reset(d) + select { + // Timed out + case <-t.t.C: + fn() + + // Cancelled + case <-t.c: + } +} + +// Cancel will attempt to cancel the running timer. +func (t *timer) Cancel() { + select { + // cancel successful + case t.c <- struct{}{}: + if !t.t.Stop() { + <-t.t.C + } // stop timer + + // already stopped + default: + } +} diff --git a/vendor/codeberg.org/gruf/go-nowish/LICENSE b/vendor/codeberg.org/gruf/go-nowish/LICENSE @@ -1,9 +0,0 @@ -MIT License - -Copyright (c) 2021 gruf - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/codeberg.org/gruf/go-nowish/README.md b/vendor/codeberg.org/gruf/go-nowish/README.md @@ -1,3 +0,0 @@ -a simple Go library with useful time utiities: -- Clock: a high performance clock giving a good "ish" representation of "now" (hence the name!) -- Timeout: a reusable structure for enforcing timeouts with a cancel diff --git a/vendor/codeberg.org/gruf/go-nowish/clock.go b/vendor/codeberg.org/gruf/go-nowish/clock.go @@ -1,132 +0,0 @@ -package nowish - -import ( - "sync" - "sync/atomic" - "time" - "unsafe" -) - -// Start returns a new Clock instance initialized and -// started with the provided precision, along with the -// stop function for it's underlying timer -func Start(precision time.Duration) (*Clock, func()) { - c := Clock{} - return &c, c.Start(precision) -} - -type Clock struct { - // format stores the time formatting style string - format string - - // valid indicates whether the current value stored in .Format is valid - valid uint32 - - // mutex protects writes to .Format, not because it would be unsafe, but - // because we want to minimize unnnecessary allocations - mutex sync.Mutex - - // nowfmt is an unsafe pointer to the last-updated time format string - nowfmt unsafe.Pointer - - // now is an unsafe pointer to the last-updated time.Time object - now unsafe.Pointer -} - -// Start starts the clock with the provided precision, the returned -// function is the stop function for the underlying timer. For >= 2ms, -// actual precision is usually within AT LEAST 10% of requested precision, -// less than this and the actual precision very quickly deteriorates. -func (c *Clock) Start(precision time.Duration) func() { - // Create ticker from duration - tick := time.NewTicker(precision / 10) - - // Set initial time - t := time.Now() - atomic.StorePointer(&c.now, unsafe.Pointer(&t)) - - // Set initial format - s := "" - atomic.StorePointer(&c.nowfmt, unsafe.Pointer(&s)) - - // If formatting string unset, set default - c.mutex.Lock() - if c.format == "" { - c.format = time.RFC822 - } - c.mutex.Unlock() - - // Start main routine - go c.run(tick) - - // Return stop fn - return tick.Stop -} - -// run is the internal clock ticking loop. -func (c *Clock) run(tick *time.Ticker) { - for { - // Wait on tick - _, ok := <-tick.C - - // Channel closed - if !ok { - break - } - - // Update time - t := time.Now() - atomic.StorePointer(&c.now, unsafe.Pointer(&t)) - - // Invalidate format string - atomic.StoreUint32(&c.valid, 0) - } -} - -// Now returns a good (ish) estimate of the current 'now' time. -func (c *Clock) Now() time.Time { - return *(*time.Time)(atomic.LoadPointer(&c.now)) -} - -// NowFormat returns the formatted "now" time, cached until next tick and "now" updates. -func (c *Clock) NowFormat() string { - // If format still valid, return this - if atomic.LoadUint32(&c.valid) == 1 { - return *(*string)(atomic.LoadPointer(&c.nowfmt)) - } - - // Get mutex lock - c.mutex.Lock() - - // Double check still invalid - if atomic.LoadUint32(&c.valid) == 1 { - c.mutex.Unlock() - return *(*string)(atomic.LoadPointer(&c.nowfmt)) - } - - // Calculate time format - nowfmt := c.Now().Format(c.format) - - // Update the stored value and set valid! - atomic.StorePointer(&c.nowfmt, unsafe.Pointer(&nowfmt)) - atomic.StoreUint32(&c.valid, 1) - - // Unlock and return - c.mutex.Unlock() - return nowfmt -} - -// SetFormat sets the time format string used by .NowFormat(). -func (c *Clock) SetFormat(format string) { - // Get mutex lock - c.mutex.Lock() - - // Update time format - c.format = format - - // Invalidate current format string - atomic.StoreUint32(&c.valid, 0) - - // Unlock - c.mutex.Unlock() -} diff --git a/vendor/codeberg.org/gruf/go-nowish/timeout.go b/vendor/codeberg.org/gruf/go-nowish/timeout.go @@ -1,233 +0,0 @@ -package nowish - -import ( - "sync" - "sync/atomic" - "time" -) - -// Timeout provides a reusable structure for enforcing timeouts with a cancel. -type Timeout struct { - timer *time.Timer // timer is the underlying timeout-timer - cncl syncer // cncl is the cancel synchronization channel - next int64 // next is the next timeout duration to run on - state uint32 // state stores the current timeout state - mu sync.Mutex // mu protects state, and helps synchronize return of .Start() -} - -// NewTimeout returns a new Timeout instance. -func NewTimeout() Timeout { - timer := time.NewTimer(time.Minute) - timer.Stop() // don't keep it running - return Timeout{ - timer: timer, - cncl: make(syncer), - } -} - -// startTimeout is the main timeout routine, handling starting the -// timeout runner at first and upon any time extensions, and handling -// any received cancels by stopping the running timer. -func (t *Timeout) startTimeout(hook func()) { - var cancelled bool - - // Receive first timeout duration - d := atomic.SwapInt64(&t.next, 0) - - // Indicate finished starting, this - // was left locked by t.start(). - t.mu.Unlock() - - for { - // Run supplied timeout - cancelled = t.runTimeout(d) - if cancelled { - break - } - - // Check for extension or set timed out - d = atomic.SwapInt64(&t.next, 0) - if d < 1 { - if t.timedOut() { - // timeout reached - hook() - break - } else { - // already cancelled - t.cncl.wait() - cancelled = true - break - } - } - - if !t.extend() { - // already cancelled - t.cncl.wait() - cancelled = true - break - } - } - - if cancelled { - // Release the .Cancel() - defer t.cncl.notify() - } - - // Mark as done - t.reset() -} - -// runTimeout will until supplied timeout or cancel called. -func (t *Timeout) runTimeout(d int64) (cancelled bool) { - // Start the timer for 'd' - t.timer.Reset(time.Duration(d)) - - select { - // Timeout reached - case <-t.timer.C: - if !t.timingOut() { - // a sneaky cancel! - t.cncl.wait() - cancelled = true - } - - // Cancel called - case <-t.cncl.wait(): - cancelled = true - if !t.timer.Stop() { - <-t.timer.C - } - } - - return cancelled -} - -// Start starts the timer with supplied timeout. If timeout is reached before -// cancel then supplied timeout hook will be called. Panic will be called if -// Timeout is already running when calling this function. -func (t *Timeout) Start(d time.Duration, hook func()) { - if !t.start() { - t.mu.Unlock() // need to unlock - panic("timeout already started") - } - - // Start the timeout - atomic.StoreInt64(&t.next, int64(d)) - go t.startTimeout(hook) - - // Wait until start - t.mu.Lock() - t.mu.Unlock() -} - -// Extend will attempt to extend the timeout runner's time, returns false if not running. -func (t *Timeout) Extend(d time.Duration) bool { - var ok bool - if ok = t.running(); ok { - atomic.AddInt64(&t.next, int64(d)) - } - return ok -} - -// Cancel cancels the currently running timer. If a cancel is achieved, then -// this function will return after the timeout goroutine is finished. -func (t *Timeout) Cancel() { - if !t.cancel() { - return - } - t.cncl.notify() - <-t.cncl.wait() -} - -// possible timeout states. -const ( - stopped = 0 - started = 1 - timingOut = 2 - cancelled = 3 - timedOut = 4 -) - -// cas will perform a compare and swap where the compare is a provided function. -func (t *Timeout) cas(check func(uint32) bool, swap uint32) bool { - var cas bool - - t.mu.Lock() - if cas = check(t.state); cas { - t.state = swap - } - t.mu.Unlock() - - return cas -} - -// start attempts to mark the timeout state as 'started', note DOES NOT unlock Timeout.mu. -func (t *Timeout) start() bool { - var ok bool - - t.mu.Lock() - if ok = (t.state == stopped); ok { - t.state = started - } - - // don't unlock - return ok -} - -// timingOut attempts to mark the timeout state as 'timing out'. -func (t *Timeout) timingOut() bool { - return t.cas(func(u uint32) bool { - return (u == started) - }, timingOut) -} - -// timedOut attempts mark the 'timing out' state as 'timed out'. -func (t *Timeout) timedOut() bool { - return t.cas(func(u uint32) bool { - return (u == timingOut) - }, timedOut) -} - -// extend attempts to extend a 'timing out' state by moving it back to 'started'. -func (t *Timeout) extend() bool { - return t.cas(func(u uint32) bool { - return (u == started) || - (u == timingOut) - }, started) -} - -// running returns whether the state is anything other than 'stopped'. -func (t *Timeout) running() bool { - t.mu.Lock() - running := (t.state != stopped) - t.mu.Unlock() - return running -} - -// cancel attempts to mark the timeout state as 'cancelled'. -func (t *Timeout) cancel() bool { - return t.cas(func(u uint32) bool { - return (u == started) || - (u == timingOut) - }, cancelled) -} - -// reset marks the timeout state as 'stopped'. -func (t *Timeout) reset() { - t.mu.Lock() - t.state = stopped - t.mu.Unlock() -} - -// syncer provides helpful receiver methods for a synchronization channel. -type syncer (chan struct{}) - -// notify blocks on sending an empty value down channel. -func (s syncer) notify() { - s <- struct{}{} -} - -// wait returns the underlying channel for blocking until '.notify()'. -func (s syncer) wait() <-chan struct{} { - return s -} diff --git a/vendor/codeberg.org/gruf/go-nowish/util.go b/vendor/codeberg.org/gruf/go-nowish/util.go @@ -1,10 +0,0 @@ -package nowish - -//nolint -type noCopy struct{} - -//nolint -func (*noCopy) Lock() {} - -//nolint -func (*noCopy) Unlock() {} diff --git a/vendor/codeberg.org/gruf/go-store/kv/iterator.go b/vendor/codeberg.org/gruf/go-store/kv/iterator.go @@ -2,6 +2,7 @@ package kv import ( "codeberg.org/gruf/go-errors" + "codeberg.org/gruf/go-mutexes" "codeberg.org/gruf/go-store/storage" ) @@ -17,10 +18,10 @@ var ErrIteratorClosed = errors.New("store/kv: iterator closed") // have multiple iterators running concurrently type KVIterator struct { store *KVStore // store is the linked KVStore + state *mutexes.LockState entries []storage.StorageEntry index int key string - onClose func() } // Next attempts to set the next key-value pair, the @@ -43,13 +44,10 @@ func (i *KVIterator) Key() string { // Release releases the KVIterator and KVStore's read lock func (i *KVIterator) Release() { - // Reset key, path, entries + i.state.UnlockMap() i.store = nil i.key = "" i.entries = nil - - // Perform requested callback - i.onClose() } // Value returns the next value from the KVStore @@ -60,5 +58,5 @@ func (i *KVIterator) Value() ([]byte, error) { } // Attempt to fetch from store - return i.store.get(i.store.mutexMap.RLock, i.key) + return i.store.get(i.state.RLock, i.key) } diff --git a/vendor/codeberg.org/gruf/go-store/kv/state.go b/vendor/codeberg.org/gruf/go-store/kv/state.go @@ -2,9 +2,9 @@ package kv import ( "io" - "sync" "codeberg.org/gruf/go-errors" + "codeberg.org/gruf/go-mutexes" ) var ErrStateClosed = errors.New("store/kv: state closed") @@ -16,61 +16,42 @@ var ErrStateClosed = errors.New("store/kv: state closed") // then the state has zero guarantees type StateRO struct { store *KVStore - mutex sync.RWMutex + state *mutexes.LockState } func (st *StateRO) Get(key string) ([]byte, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return nil, ErrStateClosed } // Pass request to store - return st.store.get(st.store.mutexMap.RLock, key) + return st.store.get(st.state.RLock, key) } func (st *StateRO) GetStream(key string) (io.ReadCloser, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return nil, ErrStateClosed } // Pass request to store - return st.store.getStream(st.store.mutexMap.RLock, key) + return st.store.getStream(st.state.RLock, key) } func (st *StateRO) Has(key string) (bool, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return false, ErrStateClosed } // Pass request to store - return st.store.has(st.store.mutexMap.RLock, key) + return st.store.has(st.state.RLock, key) } func (st *StateRO) Release() { - // Get state write lock - st.mutex.Lock() - defer st.mutex.Unlock() - - // Release the store - if st.store != nil { - st.store.mutex.RUnlock() - st.store = nil - } + st.state.UnlockMap() + st.store = nil } // StateRW provides a read-write window to the store. While this @@ -80,101 +61,70 @@ func (st *StateRO) Release() { // then the state has zero guarantees type StateRW struct { store *KVStore - mutex sync.RWMutex + state *mutexes.LockState } func (st *StateRW) Get(key string) ([]byte, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return nil, ErrStateClosed } // Pass request to store - return st.store.get(st.store.mutexMap.RLock, key) + return st.store.get(st.state.RLock, key) } func (st *StateRW) GetStream(key string) (io.ReadCloser, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return nil, ErrStateClosed } // Pass request to store - return st.store.getStream(st.store.mutexMap.RLock, key) + return st.store.getStream(st.state.RLock, key) } func (st *StateRW) Put(key string, value []byte) error { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return ErrStateClosed } // Pass request to store - return st.store.put(st.store.mutexMap.Lock, key, value) + return st.store.put(st.state.Lock, key, value) } func (st *StateRW) PutStream(key string, r io.Reader) error { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return ErrStateClosed } // Pass request to store - return st.store.putStream(st.store.mutexMap.Lock, key, r) + return st.store.putStream(st.state.Lock, key, r) } func (st *StateRW) Has(key string) (bool, error) { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return false, ErrStateClosed } // Pass request to store - return st.store.has(st.store.mutexMap.RLock, key) + return st.store.has(st.state.RLock, key) } func (st *StateRW) Delete(key string) error { - // Get state read lock - st.mutex.RLock() - defer st.mutex.RUnlock() - // Check not closed if st.store == nil { return ErrStateClosed } // Pass request to store - return st.store.delete(st.store.mutexMap.Lock, key) + return st.store.delete(st.state.Lock, key) } func (st *StateRW) Release() { - // Get state write lock - st.mutex.Lock() - defer st.mutex.Unlock() - - // Release the store - if st.store != nil { - st.store.mutex.Unlock() - st.store = nil - } + st.state.UnlockMap() + st.store = nil } diff --git a/vendor/codeberg.org/gruf/go-store/kv/store.go b/vendor/codeberg.org/gruf/go-store/kv/store.go @@ -2,7 +2,6 @@ package kv import ( "io" - "sync" "codeberg.org/gruf/go-mutexes" "codeberg.org/gruf/go-store/storage" @@ -11,9 +10,8 @@ import ( // KVStore is a very simple, yet performant key-value store type KVStore struct { - mutexMap mutexes.MutexMap // mutexMap is a map of keys to mutexes to protect file access - mutex sync.RWMutex // mutex is the total store mutex - storage storage.Storage // storage is the underlying storage + mutex mutexes.MutexMap // mutex is a map of keys to mutexes to protect file access + storage storage.Storage // storage is the underlying storage } func OpenFile(path string, cfg *storage.DiskConfig) (*KVStore, error) { @@ -47,26 +45,19 @@ func OpenStorage(storage storage.Storage) (*KVStore, error) { // Return new KVStore return &KVStore{ - mutexMap: mutexes.NewMap(mutexes.NewRW), - mutex: sync.RWMutex{}, - storage: storage, + mutex: mutexes.NewMap(-1), + storage: storage, }, nil } // RLock acquires a read-lock on supplied key, returning unlock function. func (st *KVStore) RLock(key string) (runlock func()) { - st.mutex.RLock() - runlock = st.mutexMap.RLock(key) - st.mutex.RUnlock() - return runlock + return st.mutex.RLock(key) } // Lock acquires a write-lock on supplied key, returning unlock function. func (st *KVStore) Lock(key string) (unlock func()) { - st.mutex.Lock() - unlock = st.mutexMap.Lock(key) - st.mutex.Unlock() - return unlock + return st.mutex.Lock(key) } // Get fetches the bytes for supplied key in the store @@ -167,7 +158,7 @@ func (st *KVStore) Iterator(matchFn func(string) bool) (*KVIterator, error) { } // Get store read lock - st.mutex.RLock() + state := st.mutex.RLockMap() // Setup the walk keys function entries := []storage.StorageEntry{} @@ -184,24 +175,24 @@ func (st *KVStore) Iterator(matchFn func(string) bool) (*KVIterator, error) { // Walk keys in the storage err := st.storage.WalkKeys(storage.WalkKeysOptions{WalkFn: walkFn}) if err != nil { - st.mutex.RUnlock() + state.UnlockMap() return nil, err } // Return new iterator return &KVIterator{ store: st, + state: state, entries: entries, index: -1, key: "", - onClose: st.mutex.RUnlock, }, nil } // Read provides a read-only window to the store, holding it in a read-locked state until release func (st *KVStore) Read() *StateRO { - st.mutex.RLock() - return &StateRO{store: st} + state := st.mutex.RLockMap() + return &StateRO{store: st, state: state} } // ReadFn provides a read-only window to the store, holding it in a read-locked state until fn return. @@ -216,8 +207,8 @@ func (st *KVStore) ReadFn(fn func(*StateRO)) { // Update provides a read-write window to the store, holding it in a write-locked state until release func (st *KVStore) Update() *StateRW { - st.mutex.Lock() - return &StateRW{store: st} + state := st.mutex.LockMap() + return &StateRW{store: st, state: state} } // UpdateFn provides a read-write window to the store, holding it in a write-locked state until fn return. @@ -229,3 +220,8 @@ func (st *KVStore) UpdateFn(fn func(*StateRW)) { // Pass to fn fn(state) } + +// Close will close the underlying storage, the mutex map locking (e.g. RLock(), Lock() will still work). +func (st *KVStore) Close() error { + return st.storage.Close() +} diff --git a/vendor/codeberg.org/gruf/go-store/storage/block.go b/vendor/codeberg.org/gruf/go-store/storage/block.go @@ -1,6 +1,7 @@ package storage import ( + "crypto/sha256" "io" "io/fs" "os" @@ -13,7 +14,6 @@ import ( "codeberg.org/gruf/go-hashenc" "codeberg.org/gruf/go-pools" "codeberg.org/gruf/go-store/util" - "github.com/zeebo/blake3" ) var ( @@ -77,7 +77,7 @@ func getBlockConfig(cfg *BlockConfig) BlockConfig { // BlockStorage is a Storage implementation that stores input data as chunks on // a filesystem. Each value is chunked into blocks of configured size and these -// blocks are stored with name equal to their base64-encoded BLAKE3 hash-sum. A +// blocks are stored with name equal to their base64-encoded SHA256 hash-sum. A // "node" file is finally created containing an array of hashes contained within // this value type BlockStorage struct { @@ -87,7 +87,7 @@ type BlockStorage struct { config BlockConfig // cfg is the supplied configuration for this store hashPool sync.Pool // hashPool is this store's hashEncoder pool bufpool pools.BufferPool // bufpool is this store's bytes.Buffer pool - lock *LockableFile // lock is the opened lockfile for this storage instance + lock *Lock // lock is the opened lockfile for this storage instance // NOTE: // BlockStorage does not need to lock each of the underlying block files @@ -140,11 +140,9 @@ func OpenBlock(path string, cfg *BlockConfig) (*BlockStorage, error) { } // Open and acquire storage lock for path - lock, err := OpenLock(pb.Join(path, LockFile)) + lock, err := OpenLock(pb.Join(path, lockFile)) if err != nil { return nil, err - } else if err := lock.Lock(); err != nil { - return nil, err } // Figure out the largest size for bufpool slices @@ -174,14 +172,23 @@ func OpenBlock(path string, cfg *BlockConfig) (*BlockStorage, error) { // Clean implements storage.Clean() func (st *BlockStorage) Clean() error { - nodes := map[string]*node{} + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } // Acquire path builder pb := util.GetPathBuilder() defer util.PutPathBuilder(pb) - // Walk nodes dir for entries + nodes := map[string]*node{} onceErr := errors.OnceError{} + + // Walk nodes dir for entries err := util.WalkDir(pb, st.nodePath, func(npath string, fsentry fs.DirEntry) { // Only deal with regular files if !fsentry.Type().IsRegular() { @@ -303,6 +310,7 @@ func (st *BlockStorage) ReadBytes(key string) ([]byte, error) { if err != nil { return nil, err } + defer rc.Close() // Read all bytes and return return io.ReadAll(rc) @@ -316,9 +324,19 @@ func (st *BlockStorage) ReadStream(key string) (io.ReadCloser, error) { return nil, err } + // Track open + st.lock.Add() + + // Check if open + if st.lock.Closed() { + st.lock.Done() + return nil, ErrClosed + } + // Attempt to open RO file file, err := open(npath, defaultFileROFlags) if err != nil { + st.lock.Done() return nil, err } defer file.Close() @@ -338,14 +356,16 @@ func (st *BlockStorage) ReadStream(key string) (io.ReadCloser, error) { nil, ) if err != nil { + st.lock.Done() return nil, err } - // Return new block reader - return util.NopReadCloser(&blockReader{ + // Prepare block reader and return + rc := util.NopReadCloser(&blockReader{ storage: st, node: &node, - }), nil + }) // we wrap the blockreader to decr lockfile waitgroup + return util.ReadCloserWithCallback(rc, st.lock.Done), nil } func (st *BlockStorage) readBlock(key string) ([]byte, error) { @@ -383,6 +403,15 @@ func (st *BlockStorage) WriteStream(key string, r io.Reader) error { return err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Check if this exists ok, err := stat(key) if err != nil { @@ -567,6 +596,15 @@ func (st *BlockStorage) Stat(key string) (bool, error) { return false, err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return false, ErrClosed + } + // Check for file on disk return stat(kpath) } @@ -579,18 +617,35 @@ func (st *BlockStorage) Remove(key string) error { return err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Attempt to remove file return os.Remove(kpath) } // Close implements Storage.Close() func (st *BlockStorage) Close() error { - defer st.lock.Close() - return st.lock.Unlock() + return st.lock.Close() } // WalkKeys implements Storage.WalkKeys() func (st *BlockStorage) WalkKeys(opts WalkKeysOptions) error { + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Acquire path builder pb := util.GetPathBuilder() defer util.PutPathBuilder(pb) @@ -800,7 +855,7 @@ var ( // encodedHashLen is the once-calculated encoded hash-sum length encodedHashLen = base64Encoding.EncodedLen( - blake3.New().Size(), + sha256.New().Size(), ) ) @@ -812,9 +867,8 @@ type hashEncoder struct { // newHashEncoder returns a new hashEncoder instance func newHashEncoder() *hashEncoder { - hash := blake3.New() return &hashEncoder{ - henc: hashenc.New(hash, base64Encoding), + henc: hashenc.New(sha256.New(), base64Encoding), ebuf: make([]byte, encodedHashLen), } } diff --git a/vendor/codeberg.org/gruf/go-store/storage/disk.go b/vendor/codeberg.org/gruf/go-store/storage/disk.go @@ -71,7 +71,7 @@ type DiskStorage struct { path string // path is the root path of this store bufp pools.BufferPool // bufp is the buffer pool for this DiskStorage config DiskConfig // cfg is the supplied configuration for this store - lock *LockableFile // lock is the opened lockfile for this storage instance + lock *Lock // lock is the opened lockfile for this storage instance } // OpenFile opens a DiskStorage instance for given folder path and configuration @@ -118,11 +118,9 @@ func OpenFile(path string, cfg *DiskConfig) (*DiskStorage, error) { } // Open and acquire storage lock for path - lock, err := OpenLock(pb.Join(path, LockFile)) + lock, err := OpenLock(pb.Join(path, lockFile)) if err != nil { return nil, err - } else if err := lock.Lock(); err != nil { - return nil, err } // Return new DiskStorage @@ -136,6 +134,11 @@ func OpenFile(path string, cfg *DiskConfig) (*DiskStorage, error) { // Clean implements Storage.Clean() func (st *DiskStorage) Clean() error { + st.lock.Add() + defer st.lock.Done() + if st.lock.Closed() { + return ErrClosed + } return util.CleanDirs(st.path) } @@ -160,9 +163,18 @@ func (st *DiskStorage) ReadStream(key string) (io.ReadCloser, error) { return nil, err } + // Track open + st.lock.Add() + + // Check if open + if st.lock.Closed() { + return nil, ErrClosed + } + // Attempt to open file (replace ENOENT with our own) file, err := open(kpath, defaultFileROFlags) if err != nil { + st.lock.Done() return nil, errSwapNotFound(err) } @@ -170,12 +182,14 @@ func (st *DiskStorage) ReadStream(key string) (io.ReadCloser, error) { cFile, err := st.config.Compression.Reader(file) if err != nil { file.Close() // close this here, ignore error + st.lock.Done() return nil, err } // Wrap compressor to ensure file close return util.ReadCloserWithCallback(cFile, func() { file.Close() + st.lock.Done() }), nil } @@ -192,6 +206,15 @@ func (st *DiskStorage) WriteStream(key string, r io.Reader) error { return err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Ensure dirs leading up to file exist err = os.MkdirAll(path.Dir(kpath), defaultDirPerms) if err != nil { @@ -242,6 +265,15 @@ func (st *DiskStorage) Stat(key string) (bool, error) { return false, err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return false, ErrClosed + } + // Check for file on disk return stat(kpath) } @@ -254,18 +286,35 @@ func (st *DiskStorage) Remove(key string) error { return err } + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Attempt to remove file return os.Remove(kpath) } // Close implements Storage.Close() func (st *DiskStorage) Close() error { - defer st.lock.Close() - return st.lock.Unlock() + return st.lock.Close() } // WalkKeys implements Storage.WalkKeys() func (st *DiskStorage) WalkKeys(opts WalkKeysOptions) error { + // Track open + st.lock.Add() + defer st.lock.Done() + + // Check if open + if st.lock.Closed() { + return ErrClosed + } + // Acquire path builder pb := util.GetPathBuilder() defer util.PutPathBuilder(pb) @@ -286,13 +335,13 @@ func (st *DiskStorage) WalkKeys(opts WalkKeysOptions) error { // filepath checks and returns a formatted filepath for given key func (st *DiskStorage) filepath(key string) (string, error) { + // Calculate transformed key path + key = st.config.Transform.KeyToPath(key) + // Acquire path builder pb := util.GetPathBuilder() defer util.PutPathBuilder(pb) - // Calculate transformed key path - key = st.config.Transform.KeyToPath(key) - // Generated joined root path pb.AppendString(st.path) pb.AppendString(key) diff --git a/vendor/codeberg.org/gruf/go-store/storage/errors.go b/vendor/codeberg.org/gruf/go-store/storage/errors.go @@ -19,6 +19,9 @@ func (e errorString) Extend(s string, a ...interface{}) errorString { } var ( + // ErrClosed is returned on operations on a closed storage + ErrClosed = errorString("store/storage: closed") + // ErrNotFound is the error returned when a key cannot be found in storage ErrNotFound = errorString("store/storage: key not found") @@ -39,6 +42,9 @@ var ( // errCorruptNodes is returned when nodes with missing blocks are found during a BlockStorage clean errCorruptNodes = errorString("store/storage: corrupted nodes") + + // ErrAlreadyLocked is returned on fail opening a storage lockfile + ErrAlreadyLocked = errorString("store/storage: storage lock already open") ) // errSwapNoop performs no error swaps @@ -61,3 +67,11 @@ func errSwapExist(err error) error { } return err } + +// errSwapUnavailable swaps syscall.EAGAIN for ErrAlreadyLocked +func errSwapUnavailable(err error) error { + if err == syscall.EAGAIN { + return ErrAlreadyLocked + } + return err +} diff --git a/vendor/codeberg.org/gruf/go-store/storage/fs.go b/vendor/codeberg.org/gruf/go-store/storage/fs.go @@ -8,11 +8,14 @@ import ( ) const ( - defaultDirPerms = 0755 - defaultFilePerms = 0644 + // default file permission bits + defaultDirPerms = 0755 + defaultFilePerms = 0644 + + // default file open flags defaultFileROFlags = syscall.O_RDONLY defaultFileRWFlags = syscall.O_CREAT | syscall.O_RDWR - defaultFileLockFlags = syscall.O_RDONLY | syscall.O_EXCL | syscall.O_CREAT + defaultFileLockFlags = syscall.O_RDONLY | syscall.O_CREAT ) // NOTE: diff --git a/vendor/codeberg.org/gruf/go-store/storage/lock.go b/vendor/codeberg.org/gruf/go-store/storage/lock.go @@ -1,38 +1,81 @@ package storage import ( - "os" + "sync" + "sync/atomic" "syscall" "codeberg.org/gruf/go-store/util" ) -// LockFile is our standard lockfile name. -const LockFile = "store.lock" +// lockFile is our standard lockfile name. +var lockFile = "store.lock" -type LockableFile struct { - *os.File +// IsLockKey returns whether storage key is our lockfile. +func IsLockKey(key string) bool { + return key == lockFile +} + +// Lock represents a filesystem lock to ensure only one storage instance open per path. +type Lock struct { + fd int + wg sync.WaitGroup + st uint32 } // OpenLock opens a lockfile at path. -func OpenLock(path string) (*LockableFile, error) { - file, err := open(path, defaultFileLockFlags) +func OpenLock(path string) (*Lock, error) { + var fd int + + // Open the file descriptor at path + err := util.RetryOnEINTR(func() (err error) { + fd, err = syscall.Open(path, defaultFileLockFlags, defaultFilePerms) + return + }) if err != nil { return nil, err } - return &LockableFile{file}, nil + + // Get a flock on the file descriptor + err = util.RetryOnEINTR(func() error { + return syscall.Flock(fd, syscall.LOCK_EX|syscall.LOCK_NB) + }) + if err != nil { + return nil, errSwapUnavailable(err) + } + + return &Lock{fd: fd}, nil } -func (f *LockableFile) Lock() error { - return f.flock(syscall.LOCK_EX | syscall.LOCK_NB) +// Add will add '1' to the underlying sync.WaitGroup. +func (f *Lock) Add() { + f.wg.Add(1) } -func (f *LockableFile) Unlock() error { - return f.flock(syscall.LOCK_UN | syscall.LOCK_NB) +// Done will decrememnt '1' from the underlying sync.WaitGroup. +func (f *Lock) Done() { + f.wg.Done() } -func (f *LockableFile) flock(how int) error { - return util.RetryOnEINTR(func() error { - return syscall.Flock(int(f.Fd()), how) - }) +// Close will attempt to close the lockfile and file descriptor. +func (f *Lock) Close() error { + var err error + if atomic.CompareAndSwapUint32(&f.st, 0, 1) { + // Wait until done + f.wg.Wait() + + // Ensure gets closed + defer syscall.Close(f.fd) + + // Call funlock on the file descriptor + err = util.RetryOnEINTR(func() error { + return syscall.Flock(f.fd, syscall.LOCK_UN|syscall.LOCK_NB) + }) + } + return err +} + +// Closed will return whether this lockfile has been closed (and unlocked). +func (f *Lock) Closed() bool { + return (atomic.LoadUint32(&f.st) == 1) } diff --git a/vendor/codeberg.org/gruf/go-store/storage/memory.go b/vendor/codeberg.org/gruf/go-store/storage/memory.go @@ -14,6 +14,7 @@ type MemoryStorage struct { ow bool // overwrites fs map[string][]byte mu sync.Mutex + st uint32 } // OpenMemory opens a new MemoryStorage instance with internal map of 'size'. @@ -27,13 +28,26 @@ func OpenMemory(size int, overwrites bool) *MemoryStorage { // Clean implements Storage.Clean(). func (st *MemoryStorage) Clean() error { + st.mu.Lock() + defer st.mu.Unlock() + if st.st == 1 { + return ErrClosed + } return nil } // ReadBytes implements Storage.ReadBytes(). func (st *MemoryStorage) ReadBytes(key string) ([]byte, error) { - // Safely check store + // Lock storage st.mu.Lock() + + // Check store open + if st.st == 1 { + st.mu.Unlock() + return nil, ErrClosed + } + + // Check for key b, ok := st.fs[key] st.mu.Unlock() @@ -48,8 +62,16 @@ func (st *MemoryStorage) ReadBytes(key string) ([]byte, error) { // ReadStream implements Storage.ReadStream(). func (st *MemoryStorage) ReadStream(key string) (io.ReadCloser, error) { - // Safely check store + // Lock storage st.mu.Lock() + + // Check store open + if st.st == 1 { + st.mu.Unlock() + return nil, ErrClosed + } + + // Check for key b, ok := st.fs[key] st.mu.Unlock() @@ -66,19 +88,24 @@ func (st *MemoryStorage) ReadStream(key string) (io.ReadCloser, error) { // WriteBytes implements Storage.WriteBytes(). func (st *MemoryStorage) WriteBytes(key string, b []byte) error { - // Safely check store + // Lock storage st.mu.Lock() + defer st.mu.Unlock() + + // Check store open + if st.st == 1 { + return ErrClosed + } + _, ok := st.fs[key] // Check for already exist if ok && !st.ow { - st.mu.Unlock() return ErrAlreadyExists } // Write + unlock st.fs[key] = bytes.Copy(b) - st.mu.Unlock() return nil } @@ -96,43 +123,66 @@ func (st *MemoryStorage) WriteStream(key string, r io.Reader) error { // Stat implements Storage.Stat(). func (st *MemoryStorage) Stat(key string) (bool, error) { + // Lock storage st.mu.Lock() + defer st.mu.Unlock() + + // Check store open + if st.st == 1 { + return false, ErrClosed + } + + // Check for key _, ok := st.fs[key] - st.mu.Unlock() return ok, nil } // Remove implements Storage.Remove(). func (st *MemoryStorage) Remove(key string) error { - // Safely check store + // Lock storage st.mu.Lock() - _, ok := st.fs[key] + defer st.mu.Unlock() - // Check in store + // Check store open + if st.st == 1 { + return ErrClosed + } + + // Check for key + _, ok := st.fs[key] if !ok { - st.mu.Unlock() return ErrNotFound } - // Delete + unlock + // Remove from store delete(st.fs, key) - st.mu.Unlock() + return nil } // Close implements Storage.Close(). func (st *MemoryStorage) Close() error { + st.mu.Lock() + st.st = 1 + st.mu.Unlock() return nil } // WalkKeys implements Storage.WalkKeys(). func (st *MemoryStorage) WalkKeys(opts WalkKeysOptions) error { - // Safely walk storage keys + // Lock storage st.mu.Lock() + defer st.mu.Unlock() + + // Check store open + if st.st == 1 { + return ErrClosed + } + + // Walk store keys for key := range st.fs { opts.WalkFn(entry(key)) } - st.mu.Unlock() return nil } diff --git a/vendor/github.com/zeebo/blake3/.gitignore b/vendor/github.com/zeebo/blake3/.gitignore @@ -1,6 +0,0 @@ -*.pprof -*.test -*.txt -*.out - -/upstream diff --git a/vendor/github.com/zeebo/blake3/LICENSE b/vendor/github.com/zeebo/blake3/LICENSE @@ -1,125 +0,0 @@ -This work is released into the public domain with CC0 1.0. - -------------------------------------------------------------------------------- - -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. diff --git a/vendor/github.com/zeebo/blake3/Makefile b/vendor/github.com/zeebo/blake3/Makefile @@ -1,11 +0,0 @@ -asm: internal/alg/hash/hash_avx2/impl_amd64.s internal/alg/compress/compress_sse41/impl_amd64.s - -internal/alg/hash/hash_avx2/impl_amd64.s: avo/avx2/*.go - ( cd avo; go run ./avx2 ) > internal/alg/hash/hash_avx2/impl_amd64.s - -internal/alg/compress/compress_sse41/impl_amd64.s: avo/sse41/*.go - ( cd avo; go run ./sse41 ) > internal/alg/compress/compress_sse41/impl_amd64.s - -.PHONY: test -test: - go test -race -bench=. -benchtime=1x diff --git a/vendor/github.com/zeebo/blake3/README.md b/vendor/github.com/zeebo/blake3/README.md @@ -1,77 +0,0 @@ -# BLAKE3 - -<p> - <a href="https://pkg.go.dev/github.com/zeebo/blake3"><img src="https://img.shields.io/badge/doc-reference-007d9b?logo=go&style=flat-square" alt="go.dev" /></a> - <a href="https://goreportcard.com/report/github.com/zeebo/blake3"><img src="https://goreportcard.com/badge/github.com/zeebo/blake3?style=flat-square" alt="Go Report Card" /></a> - <a href="https://sourcegraph.com/github.com/zeebo/blake3?badge"><img src="https://sourcegraph.com/github.com/zeebo/blake3/-/badge.svg?style=flat-square" alt="SourceGraph" /></a> -</p> - -Pure Go implementation of [BLAKE3](https://blake3.io) with AVX2 and SSE4.1 acceleration. - -Special thanks to the excellent [avo](https://github.com/mmcloughlin/avo) making writing vectorized version much easier. - -# Benchmarks - -## Caveats - -This library makes some different design decisions than the upstream Rust crate around internal buffering. Specifically, because it does not target the embedded system space, nor does it support multithreading, it elects to do its own internal buffering. This means that a user does not have to worry about providing large enough buffers to get the best possible performance, but it does worse on smaller input sizes. So some notes: - -- The Rust benchmarks below are all single-threaded to match this Go implementation. -- I make no attempt to get precise measurements (cpu throttling, noisy environment, etc.) so please benchmark on your own systems. -- These benchmarks are run on an i7-6700K which does not support AVX-512, so Rust is limited to use AVX2 at sizes above 8 kib. -- I tried my best to make them benchmark the same thing, but who knows? :smile: - -## Charts - -In this case, both libraries are able to avoid a lot of data copying and will use vectorized instructions to hash as fast as possible, and perform similarly. - -![Large Full Buffer](/assets/large-full-buffer.svg) - -For incremental writes, you must provide the Rust version large enough buffers so that it can use vectorized instructions. This Go library performs consistently regardless of the size being sent into the update function. - -![Incremental](/assets/incremental.svg) - -The downside of internal buffering is most apparent with small sizes as most time is spent initializing the hasher state. In terms of hashing rate, the difference is 3-4x, but in an absolute sense it's ~100ns (see tables below). If you wish to hash a large number of very small strings and you care about those nanoseconds, be sure to use the Reset method to avoid re-initializing the state. - -![Small Full Buffer](/assets/small-full-buffer.svg) - -## Timing Tables - -### Small - -| Size | Full Buffer | Reset | | Full Buffer Rate | Reset Rate | -|--------|-------------|------------|-|------------------|--------------| -| 64 b | `205ns` | `86.5ns` | | `312MB/s` | `740MB/s` | -| 256 b | `364ns` | `250ns` | | `703MB/s` | `1.03GB/s` | -| 512 b | `575ns` | `468ns` | | `892MB/s` | `1.10GB/s` | -| 768 b | `795ns` | `682ns` | | `967MB/s` | `1.13GB/s` | - -### Large - -| Size | Incremental | Full Buffer | Reset | | Incremental Rate | Full Buffer Rate | Reset Rate | -|----------|-------------|-------------|------------|-|------------------|------------------|--------------| -| 1 kib | `1.02µs` | `1.01µs` | `891ns` | | `1.00GB/s` | `1.01GB/s` | `1.15GB/s` | -| 2 kib | `2.11µs` | `2.07µs` | `1.95µs` | | `968MB/s` | `990MB/s` | `1.05GB/s` | -| 4 kib | `2.28µs` | `2.15µs` | `2.05µs` | | `1.80GB/s` | `1.90GB/s` | `2.00GB/s` | -| 8 kib | `2.64µs` | `2.52µs` | `2.44µs` | | `3.11GB/s` | `3.25GB/s` | `3.36GB/s` | -| 16 kib | `4.93µs` | `4.54µs` | `4.48µs` | | `3.33GB/s` | `3.61GB/s` | `3.66GB/s` | -| 32 kib | `9.41µs` | `8.62µs` | `8.54µs` | | `3.48GB/s` | `3.80GB/s` | `3.84GB/s` | -| 64 kib | `18.2µs` | `16.7µs` | `16.6µs` | | `3.59GB/s` | `3.91GB/s` | `3.94GB/s` | -| 128 kib | `36.3µs` | `32.9µs` | `33.1µs` | | `3.61GB/s` | `3.99GB/s` | `3.96GB/s` | -| 256 kib | `72.5µs` | `65.7µs` | `66.0µs` | | `3.62GB/s` | `3.99GB/s` | `3.97GB/s` | -| 512 kib | `145µs` | `131µs` | `132µs` | | `3.60GB/s` | `4.00GB/s` | `3.97GB/s` | -| 1024 kib | `290µs` | `262µs` | `262µs` | | `3.62GB/s` | `4.00GB/s` | `4.00GB/s` | - -### No ASM - -| Size | Incremental | Full Buffer | Reset | | Incremental Rate | Full Buffer Rate | Reset Rate | -|----------|-------------|-------------|------------|-|------------------|------------------|-------------| -| 64 b | `253ns` | `254ns` | `134ns` | | `253MB/s` | `252MB/s` | `478MB/s` | -| 256 b | `553ns` | `557ns` | `441ns` | | `463MB/s` | `459MB/s` | `580MB/s` | -| 512 b | `948ns` | `953ns` | `841ns` | | `540MB/s` | `538MB/s` | `609MB/s` | -| 768 b | `1.38µs` | `1.40µs` | `1.35µs` | | `558MB/s` | `547MB/s` | `570MB/s` | -| 1 kib | `1.77µs` | `1.77µs` | `1.70µs` | | `577MB/s` | `580MB/s` | `602MB/s` | -| | | | | | | | | -| 1024 kib | `880µs` | `883µs` | `878µs` | | `596MB/s` | `595MB/s` | `598MB/s` | - -The speed caps out at around 1 kib, so most rows have been elided from the presentation. diff --git a/vendor/github.com/zeebo/blake3/api.go b/vendor/github.com/zeebo/blake3/api.go @@ -1,166 +0,0 @@ -// Package blake3 provides an SSE4.1/AVX2 accelerated BLAKE3 implementation. -package blake3 - -import ( - "errors" - - "github.com/zeebo/blake3/internal/consts" - "github.com/zeebo/blake3/internal/utils" -) - -// Hasher is a hash.Hash for BLAKE3. -type Hasher struct { - size int - h hasher -} - -// New returns a new Hasher that has a digest size of 32 bytes. -// -// If you need more or less output bytes than that, use Digest method. -func New() *Hasher { - return &Hasher{ - size: 32, - h: hasher{ - key: consts.IV, - }, - } -} - -// NewKeyed returns a new Hasher that uses the 32 byte input key and has -// a digest size of 32 bytes. -// -// If you need more or less output bytes than that, use the Digest method. -func NewKeyed(key []byte) (*Hasher, error) { - if len(key) != 32 { - return nil, errors.New("invalid key size") - } - - h := &Hasher{ - size: 32, - h: hasher{ - flags: consts.Flag_Keyed, - }, - } - utils.KeyFromBytes(key, &h.h.key) - - return h, nil -} - -// DeriveKey derives a key based on reusable key material of any -// length, in the given context. The key will be stored in out, using -// all of its current length. -// -// Context strings must be hardcoded constants, and the recommended -// format is "[application] [commit timestamp] [purpose]", e.g., -// "example.com 2019-12-25 16:18:03 session tokens v1". -func DeriveKey(context string, material []byte, out []byte) { - h := NewDeriveKey(context) - _, _ = h.Write(material) - _, _ = h.Digest().Read(out) -} - -// NewDeriveKey returns a Hasher that is initialized with the context -// string. See DeriveKey for details. It has a digest size of 32 bytes. -// -// If you need more or less output bytes than that, use the Digest method. -func NewDeriveKey(context string) *Hasher { - // hash the context string and use that instead of IV - h := &Hasher{ - size: 32, - h: hasher{ - key: consts.IV, - flags: consts.Flag_DeriveKeyContext, - }, - } - - var buf [32]byte - _, _ = h.WriteString(context) - _, _ = h.Digest().Read(buf[:]) - - h.Reset() - utils.KeyFromBytes(buf[:], &h.h.key) - h.h.flags = consts.Flag_DeriveKeyMaterial - - return h -} - -// Write implements part of the hash.Hash interface. It never returns an error. -func (h *Hasher) Write(p []byte) (int, error) { - h.h.update(p) - return len(p), nil -} - -// WriteString is like Write but specialized to strings to avoid allocations. -func (h *Hasher) WriteString(p string) (int, error) { - h.h.updateString(p) - return len(p), nil -} - -// Reset implements part of the hash.Hash interface. It causes the Hasher to -// act as if it was newly created. -func (h *Hasher) Reset() { - h.h.reset() -} - -// Clone returns a new Hasher with the same internal state. -// -// Modifying the resulting Hasher will not modify the original Hasher, and vice versa. -func (h *Hasher) Clone() *Hasher { - return &Hasher{size: h.size, h: h.h} -} - -// Size implements part of the hash.Hash interface. It returns the number of -// bytes the hash will output in Sum. -func (h *Hasher) Size() int { - return h.size -} - -// BlockSize implements part of the hash.Hash interface. It returns the most -// natural size to write to the Hasher. -func (h *Hasher) BlockSize() int { - // TODO: is there a downside to picking this large size? - return 8192 -} - -// Sum implements part of the hash.Hash interface. It appends the digest of -// the Hasher to the provided buffer and returns it. -func (h *Hasher) Sum(b []byte) []byte { - if top := len(b) + h.size; top <= cap(b) && top >= len(b) { - h.h.finalize(b[len(b):top]) - return b[:top] - } - - tmp := make([]byte, h.size) - h.h.finalize(tmp) - return append(b, tmp...) -} - -// Digest takes a snapshot of the hash state and returns an object that can -// be used to read and seek through 2^64 bytes of digest output. -func (h *Hasher) Digest() *Digest { - var d Digest - h.h.finalizeDigest(&d) - return &d -} - -// Sum256 returns the first 256 bits of the unkeyed digest of the data. -func Sum256(data []byte) (sum [32]byte) { - out := Sum512(data) - copy(sum[:], out[:32]) - return sum -} - -// Sum512 returns the first 512 bits of the unkeyed digest of the data. -func Sum512(data []byte) (sum [64]byte) { - if len(data) <= consts.ChunkLen { - var d Digest - compressAll(&d, data, 0, consts.IV) - _, _ = d.Read(sum[:]) - return sum - } else { - h := hasher{key: consts.IV} - h.update(data) - h.finalize(sum[:]) - return sum - } -} diff --git a/vendor/github.com/zeebo/blake3/blake3.go b/vendor/github.com/zeebo/blake3/blake3.go @@ -1,285 +0,0 @@ -package blake3 - -import ( - "math/bits" - "unsafe" - - "github.com/zeebo/blake3/internal/alg" - "github.com/zeebo/blake3/internal/consts" - "github.com/zeebo/blake3/internal/utils" -) - -// -// hasher contains state for a blake3 hash -// - -type hasher struct { - len uint64 - chunks uint64 - flags uint32 - key [8]uint32 - stack cvstack - buf [8192]byte -} - -func (a *hasher) reset() { - a.len = 0 - a.chunks = 0 - a.stack.occ = 0 - a.stack.lvls = [8]uint8{} - a.stack.bufn = 0 -} - -func (a *hasher) update(buf []byte) { - // relies on the first two words of a string being the same as a slice - a.updateString(*(*string)(unsafe.Pointer(&buf))) -} - -func (a *hasher) updateString(buf string) { - var input *[8192]byte - - for len(buf) > 0 { - if a.len == 0 && len(buf) > 8192 { - // relies on the data pointer being the first word in the string header - input = (*[8192]byte)(*(*unsafe.Pointer)(unsafe.Pointer(&buf))) - buf = buf[8192:] - } else if a.len < 8192 { - n := copy(a.buf[a.len:], buf) - a.len += uint64(n) - buf = buf[n:] - continue - } else { - input = &a.buf - } - - a.consume(input) - a.len = 0 - a.chunks += 8 - } -} - -func (a *hasher) consume(input *[8192]byte) { - var out chainVector - var chain [8]uint32 - alg.HashF(input, 8192, a.chunks, a.flags, &a.key, &out, &chain) - a.stack.pushN(0, &out, 8, a.flags, &a.key) -} - -func (a *hasher) finalize(p []byte) { - var d Digest - a.finalizeDigest(&d) - _, _ = d.Read(p) -} - -func (a *hasher) finalizeDigest(d *Digest) { - if a.chunks == 0 && a.len <= consts.ChunkLen { - compressAll(d, a.buf[:a.len], a.flags, a.key) - return - } - - d.chain = a.key - d.flags = a.flags | consts.Flag_ChunkEnd - - if a.len > 64 { - var buf chainVector - alg.HashF(&a.buf, a.len, a.chunks, a.flags, &a.key, &buf, &d.chain) - - if a.len > consts.ChunkLen { - complete := (a.len - 1) / consts.ChunkLen - a.stack.pushN(0, &buf, int(complete), a.flags, &a.key) - a.chunks += complete - a.len = uint64(copy(a.buf[:], a.buf[complete*consts.ChunkLen:a.len])) - } - } - - if a.len <= 64 { - d.flags |= consts.Flag_ChunkStart - } - - d.counter = a.chunks - d.blen = uint32(a.len) % 64 - - base := a.len / 64 * 64 - if a.len > 0 && d.blen == 0 { - d.blen = 64 - base -= 64 - } - - if consts.IsLittleEndian { - copy((*[64]byte)(unsafe.Pointer(&d.block[0]))[:], a.buf[base:a.len]) - } else { - var tmp [64]byte - copy(tmp[:], a.buf[base:a.len]) - utils.BytesToWords(&tmp, &d.block) - } - - for a.stack.bufn > 0 { - a.stack.flush(a.flags, &a.key) - } - - var tmp [16]uint32 - for occ := a.stack.occ; occ != 0; occ &= occ - 1 { - col := uint(bits.TrailingZeros64(occ)) % 64 - - alg.Compress(&d.chain, &d.block, d.counter, d.blen, d.flags, &tmp) - - *(*[8]uint32)(unsafe.Pointer(&d.block[0])) = a.stack.stack[col] - *(*[8]uint32)(unsafe.Pointer(&d.block[8])) = *(*[8]uint32)(unsafe.Pointer(&tmp[0])) - - if occ == a.stack.occ { - d.chain = a.key - d.counter = 0 - d.blen = consts.BlockLen - d.flags = a.flags | consts.Flag_Parent - } - } - - d.flags |= consts.Flag_Root -} - -// -// chain value stack -// - -type chainVector = [64]uint32 - -type cvstack struct { - occ uint64 // which levels in stack are occupied - lvls [8]uint8 // what level the buf input was in - bufn int // how many pairs are loaded into buf - buf [2]chainVector - stack [64][8]uint32 -} - -func (a *cvstack) pushN(l uint8, cv *chainVector, n int, flags uint32, key *[8]uint32) { - for i := 0; i < n; i++ { - a.pushL(l, cv, i) - for a.bufn == 8 { - a.flush(flags, key) - } - } -} - -func (a *cvstack) pushL(l uint8, cv *chainVector, n int) { - bit := uint64(1) << (l & 63) - if a.occ&bit == 0 { - readChain(cv, n, &a.stack[l&63]) - a.occ ^= bit - return - } - - a.lvls[a.bufn&7] = l - writeChain(&a.stack[l&63], &a.buf[0], a.bufn) - copyChain(cv, n, &a.buf[1], a.bufn) - a.bufn++ - a.occ ^= bit -} - -func (a *cvstack) flush(flags uint32, key *[8]uint32) { - var out chainVector - alg.HashP(&a.buf[0], &a.buf[1], flags|consts.Flag_Parent, key, &out, a.bufn) - - bufn, lvls := a.bufn, a.lvls - a.bufn, a.lvls = 0, [8]uint8{} - - for i := 0; i < bufn; i++ { - a.pushL(lvls[i]+1, &out, i) - } -} - -// -// helpers to deal with reading/writing transposed values -// - -func copyChain(in *chainVector, icol int, out *chainVector, ocol int) { - type u = uintptr - type p = unsafe.Pointer - type a = *uint32 - - i := p(u(p(in)) + u(icol*4)) - o := p(u(p(out)) + u(ocol*4)) - - *a(p(u(o) + 0*32)) = *a(p(u(i) + 0*32)) - *a(p(u(o) + 1*32)) = *a(p(u(i) + 1*32)) - *a(p(u(o) + 2*32)) = *a(p(u(i) + 2*32)) - *a(p(u(o) + 3*32)) = *a(p(u(i) + 3*32)) - *a(p(u(o) + 4*32)) = *a(p(u(i) + 4*32)) - *a(p(u(o) + 5*32)) = *a(p(u(i) + 5*32)) - *a(p(u(o) + 6*32)) = *a(p(u(i) + 6*32)) - *a(p(u(o) + 7*32)) = *a(p(u(i) + 7*32)) -} - -func readChain(in *chainVector, col int, out *[8]uint32) { - type u = uintptr - type p = unsafe.Pointer - type a = *uint32 - - i := p(u(p(in)) + u(col*4)) - - out[0] = *a(p(u(i) + 0*32)) - out[1] = *a(p(u(i) + 1*32)) - out[2] = *a(p(u(i) + 2*32)) - out[3] = *a(p(u(i) + 3*32)) - out[4] = *a(p(u(i) + 4*32)) - out[5] = *a(p(u(i) + 5*32)) - out[6] = *a(p(u(i) + 6*32)) - out[7] = *a(p(u(i) + 7*32)) -} - -func writeChain(in *[8]uint32, out *chainVector, col int) { - type u = uintptr - type p = unsafe.Pointer - type a = *uint32 - - o := p(u(p(out)) + u(col*4)) - - *a(p(u(o) + 0*32)) = in[0] - *a(p(u(o) + 1*32)) = in[1] - *a(p(u(o) + 2*32)) = in[2] - *a(p(u(o) + 3*32)) = in[3] - *a(p(u(o) + 4*32)) = in[4] - *a(p(u(o) + 5*32)) = in[5] - *a(p(u(o) + 6*32)) = in[6] - *a(p(u(o) + 7*32)) = in[7] -} - -// -// compress <= chunkLen bytes in one shot -// - -func compressAll(d *Digest, in []byte, flags uint32, key [8]uint32) { - var compressed [16]uint32 - - d.chain = key - d.flags = flags | consts.Flag_ChunkStart - - for len(in) > 64 { - buf := (*[64]byte)(unsafe.Pointer(&in[0])) - - var block *[16]uint32 - if consts.IsLittleEndian { - block = (*[16]uint32)(unsafe.Pointer(buf)) - } else { - block = &d.block - utils.BytesToWords(buf, block) - } - - alg.Compress(&d.chain, block, 0, consts.BlockLen, d.flags, &compressed) - - d.chain = *(*[8]uint32)(unsafe.Pointer(&compressed[0])) - d.flags &^= consts.Flag_ChunkStart - - in = in[64:] - } - - if consts.IsLittleEndian { - copy((*[64]byte)(unsafe.Pointer(&d.block[0]))[:], in) - } else { - var tmp [64]byte - copy(tmp[:], in) - utils.BytesToWords(&tmp, &d.block) - } - - d.blen = uint32(len(in)) - d.flags |= consts.Flag_ChunkEnd | consts.Flag_Root -} diff --git a/vendor/github.com/zeebo/blake3/digest.go b/vendor/github.com/zeebo/blake3/digest.go @@ -1,100 +0,0 @@ -package blake3 - -import ( - "fmt" - "io" - "unsafe" - - "github.com/zeebo/blake3/internal/alg" - "github.com/zeebo/blake3/internal/consts" - "github.com/zeebo/blake3/internal/utils" -) - -// Digest captures the state of a Hasher allowing reading and seeking through -// the output stream. -type Digest struct { - counter uint64 - chain [8]uint32 - block [16]uint32 - blen uint32 - flags uint32 - buf [16]uint32 - bufn int -} - -// Read reads data frm the hasher into out. It always fills the entire buffer and -// never errors. The stream will wrap around when reading past 2^64 bytes. -func (d *Digest) Read(p []byte) (n int, err error) { - n = len(p) - - if d.bufn > 0 { - n := d.slowCopy(p) - p = p[n:] - d.bufn -= n - } - - for len(p) >= 64 { - d.fillBuf() - - if consts.IsLittleEndian { - *(*[64]byte)(unsafe.Pointer(&p[0])) = *(*[64]byte)(unsafe.Pointer(&d.buf[0])) - } else { - utils.WordsToBytes(&d.buf, p) - } - - p = p[64:] - d.bufn = 0 - } - - if len(p) == 0 { - return n, nil - } - - d.fillBuf() - d.bufn -= d.slowCopy(p) - - return n, nil -} - -// Seek sets the position to the provided location. Only SeekStart and -// SeekCurrent are allowed. -func (d *Digest) Seek(offset int64, whence int) (int64, error) { - switch whence { - case io.SeekStart: - case io.SeekEnd: - return 0, fmt.Errorf("seek from end not supported") - case io.SeekCurrent: - offset += int64(consts.BlockLen*d.counter) - int64(d.bufn) - default: - return 0, fmt.Errorf("invalid whence: %d", whence) - } - if offset < 0 { - return 0, fmt.Errorf("seek before start") - } - d.setPosition(uint64(offset)) - return offset, nil -} - -func (d *Digest) setPosition(pos uint64) { - d.counter = pos / consts.BlockLen - d.fillBuf() - d.bufn -= int(pos % consts.BlockLen) -} - -func (d *Digest) slowCopy(p []byte) (n int) { - off := uint(consts.BlockLen-d.bufn) % consts.BlockLen - if consts.IsLittleEndian { - n = copy(p, (*[consts.BlockLen]byte)(unsafe.Pointer(&d.buf[0]))[off:]) - } else { - var tmp [consts.BlockLen]byte - utils.WordsToBytes(&d.buf, tmp[:]) - n = copy(p, tmp[off:]) - } - return n -} - -func (d *Digest) fillBuf() { - alg.Compress(&d.chain, &d.block, d.counter, d.blen, d.flags, &d.buf) - d.counter++ - d.bufn = consts.BlockLen -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/alg.go b/vendor/github.com/zeebo/blake3/internal/alg/alg.go @@ -1,18 +0,0 @@ -package alg - -import ( - "github.com/zeebo/blake3/internal/alg/compress" - "github.com/zeebo/blake3/internal/alg/hash" -) - -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - hash.HashF(input, length, counter, flags, key, out, chain) -} - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - hash.HashP(left, right, flags, key, out, n) -} - -func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) { - compress.Compress(chain, block, counter, blen, flags, out) -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go @@ -1,15 +0,0 @@ -package compress - -import ( - "github.com/zeebo/blake3/internal/alg/compress/compress_pure" - "github.com/zeebo/blake3/internal/alg/compress/compress_sse41" - "github.com/zeebo/blake3/internal/consts" -) - -func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) { - if consts.HasSSE41 { - compress_sse41.Compress(chain, block, counter, blen, flags, out) - } else { - compress_pure.Compress(chain, block, counter, blen, flags, out) - } -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go @@ -1,135 +0,0 @@ -package compress_pure - -import ( - "math/bits" - - "github.com/zeebo/blake3/internal/consts" -) - -func Compress( - chain *[8]uint32, - block *[16]uint32, - counter uint64, - blen uint32, - flags uint32, - out *[16]uint32, -) { - - *out = [16]uint32{ - chain[0], chain[1], chain[2], chain[3], - chain[4], chain[5], chain[6], chain[7], - consts.IV0, consts.IV1, consts.IV2, consts.IV3, - uint32(counter), uint32(counter >> 32), blen, flags, - } - - rcompress(out, block) -} - -func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { - a += b + mx - d = bits.RotateLeft32(d^a, -16) - c += d - b = bits.RotateLeft32(b^c, -12) - a += b + my - d = bits.RotateLeft32(d^a, -8) - c += d - b = bits.RotateLeft32(b^c, -7) - return a, b, c, d -} - -func rcompress(s *[16]uint32, m *[16]uint32) { - const ( - a = 10 - b = 11 - c = 12 - d = 13 - e = 14 - f = 15 - ) - - s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3] - s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7] - s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3] - sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7] - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3]) - s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d]) - s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a]) - s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e]) - s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c]) - s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f]) - s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9]) - s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8]) - s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b]) - s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1]) - s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5]) - s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6]) - s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7]) - - s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f]) - s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0]) - s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9]) - s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6]) - s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a]) - s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c]) - s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4]) - s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d]) - - s[8+0] = s8 ^ s[0] - s[8+1] = s9 ^ s[1] - s[8+2] = sa ^ s[2] - s[8+3] = sb ^ s[3] - s[8+4] = sc ^ s[4] - s[8+5] = sd ^ s[5] - s[8+6] = se ^ s[6] - s[8+7] = sf ^ s[7] - - s[0] = s0 ^ s8 - s[1] = s1 ^ s9 - s[2] = s2 ^ sa - s[3] = s3 ^ sb - s[4] = s4 ^ sc - s[5] = s5 ^ sd - s[6] = s6 ^ se - s[7] = s7 ^ sf -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s @@ -1,560 +0,0 @@ -// Code generated by command: go run compress.go. DO NOT EDIT. - -#include "textflag.h" - -DATA iv<>+0(SB)/4, $0x6a09e667 -DATA iv<>+4(SB)/4, $0xbb67ae85 -DATA iv<>+8(SB)/4, $0x3c6ef372 -DATA iv<>+12(SB)/4, $0xa54ff53a -DATA iv<>+16(SB)/4, $0x510e527f -DATA iv<>+20(SB)/4, $0x9b05688c -DATA iv<>+24(SB)/4, $0x1f83d9ab -DATA iv<>+28(SB)/4, $0x5be0cd19 -GLOBL iv<>(SB), RODATA|NOPTR, $32 - -DATA rot16_shuf<>+0(SB)/1, $0x02 -DATA rot16_shuf<>+1(SB)/1, $0x03 -DATA rot16_shuf<>+2(SB)/1, $0x00 -DATA rot16_shuf<>+3(SB)/1, $0x01 -DATA rot16_shuf<>+4(SB)/1, $0x06 -DATA rot16_shuf<>+5(SB)/1, $0x07 -DATA rot16_shuf<>+6(SB)/1, $0x04 -DATA rot16_shuf<>+7(SB)/1, $0x05 -DATA rot16_shuf<>+8(SB)/1, $0x0a -DATA rot16_shuf<>+9(SB)/1, $0x0b -DATA rot16_shuf<>+10(SB)/1, $0x08 -DATA rot16_shuf<>+11(SB)/1, $0x09 -DATA rot16_shuf<>+12(SB)/1, $0x0e -DATA rot16_shuf<>+13(SB)/1, $0x0f -DATA rot16_shuf<>+14(SB)/1, $0x0c -DATA rot16_shuf<>+15(SB)/1, $0x0d -DATA rot16_shuf<>+16(SB)/1, $0x12 -DATA rot16_shuf<>+17(SB)/1, $0x13 -DATA rot16_shuf<>+18(SB)/1, $0x10 -DATA rot16_shuf<>+19(SB)/1, $0x11 -DATA rot16_shuf<>+20(SB)/1, $0x16 -DATA rot16_shuf<>+21(SB)/1, $0x17 -DATA rot16_shuf<>+22(SB)/1, $0x14 -DATA rot16_shuf<>+23(SB)/1, $0x15 -DATA rot16_shuf<>+24(SB)/1, $0x1a -DATA rot16_shuf<>+25(SB)/1, $0x1b -DATA rot16_shuf<>+26(SB)/1, $0x18 -DATA rot16_shuf<>+27(SB)/1, $0x19 -DATA rot16_shuf<>+28(SB)/1, $0x1e -DATA rot16_shuf<>+29(SB)/1, $0x1f -DATA rot16_shuf<>+30(SB)/1, $0x1c -DATA rot16_shuf<>+31(SB)/1, $0x1d -GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32 - -DATA rot8_shuf<>+0(SB)/1, $0x01 -DATA rot8_shuf<>+1(SB)/1, $0x02 -DATA rot8_shuf<>+2(SB)/1, $0x03 -DATA rot8_shuf<>+3(SB)/1, $0x00 -DATA rot8_shuf<>+4(SB)/1, $0x05 -DATA rot8_shuf<>+5(SB)/1, $0x06 -DATA rot8_shuf<>+6(SB)/1, $0x07 -DATA rot8_shuf<>+7(SB)/1, $0x04 -DATA rot8_shuf<>+8(SB)/1, $0x09 -DATA rot8_shuf<>+9(SB)/1, $0x0a -DATA rot8_shuf<>+10(SB)/1, $0x0b -DATA rot8_shuf<>+11(SB)/1, $0x08 -DATA rot8_shuf<>+12(SB)/1, $0x0d -DATA rot8_shuf<>+13(SB)/1, $0x0e -DATA rot8_shuf<>+14(SB)/1, $0x0f -DATA rot8_shuf<>+15(SB)/1, $0x0c -DATA rot8_shuf<>+16(SB)/1, $0x11 -DATA rot8_shuf<>+17(SB)/1, $0x12 -DATA rot8_shuf<>+18(SB)/1, $0x13 -DATA rot8_shuf<>+19(SB)/1, $0x10 -DATA rot8_shuf<>+20(SB)/1, $0x15 -DATA rot8_shuf<>+21(SB)/1, $0x16 -DATA rot8_shuf<>+22(SB)/1, $0x17 -DATA rot8_shuf<>+23(SB)/1, $0x14 -DATA rot8_shuf<>+24(SB)/1, $0x19 -DATA rot8_shuf<>+25(SB)/1, $0x1a -DATA rot8_shuf<>+26(SB)/1, $0x1b -DATA rot8_shuf<>+27(SB)/1, $0x18 -DATA rot8_shuf<>+28(SB)/1, $0x1d -DATA rot8_shuf<>+29(SB)/1, $0x1e -DATA rot8_shuf<>+30(SB)/1, $0x1f -DATA rot8_shuf<>+31(SB)/1, $0x1c -GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32 - -// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) -// Requires: SSE, SSE2, SSE4.1, SSSE3 -TEXT ·Compress(SB), NOSPLIT, $0-40 - MOVQ chain+0(FP), AX - MOVQ block+8(FP), CX - MOVQ counter+16(FP), DX - MOVL blen+24(FP), BX - MOVL flags+28(FP), BP - MOVQ out+32(FP), SI - MOVUPS (AX), X0 - MOVUPS 16(AX), X1 - MOVUPS iv<>+0(SB), X2 - PINSRD $0x00, DX, X3 - SHRQ $0x20, DX - PINSRD $0x01, DX, X3 - PINSRD $0x02, BX, X3 - PINSRD $0x03, BP, X3 - MOVUPS (CX), X4 - MOVUPS 16(CX), X5 - MOVUPS 32(CX), X6 - MOVUPS 48(CX), X7 - MOVUPS rot16_shuf<>+0(SB), X8 - MOVUPS rot8_shuf<>+0(SB), X9 - - // round 1 - MOVAPS X4, X10 - SHUFPS $0x88, X5, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X4, X4 - SHUFPS $0xdd, X5, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X6, X5 - SHUFPS $0x88, X7, X5 - SHUFPS $0x93, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X6, X6 - SHUFPS $0xdd, X7, X6 - SHUFPS $0x93, X6, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 2 - MOVAPS X10, X7 - SHUFPS $0xd6, X4, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X5, X11 - SHUFPS $0xfa, X6, X11 - PSHUFD $0x0f, X10, X10 - PBLENDW $0x33, X10, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X10 - PSRLL $0x07, X1 - PSLLL $0x19, X10 - POR X10, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X6, X12 - PUNPCKLLQ X4, X12 - PBLENDW $0xc0, X5, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X10 - PSRLL $0x0c, X1 - PSLLL $0x14, X10 - POR X10, X1 - MOVAPS X4, X10 - PUNPCKHLQ X6, X10 - MOVAPS X5, X4 - PUNPCKLLQ X10, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 3 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X6 - PSRLL $0x0c, X1 - PSLLL $0x14, X6 - POR X6, X1 - MOVAPS X12, X6 - SHUFPS $0xfa, X4, X6 - PSHUFD $0x0f, X7, X7 - PBLENDW $0x33, X7, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X10 - PUNPCKLLQ X11, X10 - PBLENDW $0xc0, X12, X10 - SHUFPS $0xb4, X10, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x0c, X1 - PSLLL $0x14, X7 - POR X7, X1 - MOVAPS X11, X7 - PUNPCKHLQ X4, X7 - MOVAPS X12, X4 - PUNPCKLLQ X7, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 4 - MOVAPS X5, X7 - SHUFPS $0xd6, X6, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X10, X11 - SHUFPS $0xfa, X4, X11 - PSHUFD $0x0f, X5, X5 - PBLENDW $0x33, X5, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X12 - PUNPCKLLQ X6, X12 - PBLENDW $0xc0, X10, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X6, X5 - PUNPCKHLQ X4, X5 - MOVAPS X10, X4 - PUNPCKLLQ X5, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 5 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X6 - PSRLL $0x0c, X1 - PSLLL $0x14, X6 - POR X6, X1 - MOVAPS X12, X6 - SHUFPS $0xfa, X4, X6 - PSHUFD $0x0f, X7, X7 - PBLENDW $0x33, X7, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X10 - PUNPCKLLQ X11, X10 - PBLENDW $0xc0, X12, X10 - SHUFPS $0xb4, X10, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x0c, X1 - PSLLL $0x14, X7 - POR X7, X1 - MOVAPS X11, X7 - PUNPCKHLQ X4, X7 - MOVAPS X12, X4 - PUNPCKLLQ X7, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 6 - MOVAPS X5, X7 - SHUFPS $0xd6, X6, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X10, X11 - SHUFPS $0xfa, X4, X11 - PSHUFD $0x0f, X5, X5 - PBLENDW $0x33, X5, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X12 - PUNPCKLLQ X6, X12 - PBLENDW $0xc0, X10, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X6, X5 - PUNPCKHLQ X4, X5 - MOVAPS X10, X4 - PUNPCKLLQ X5, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 7 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X12, X5 - SHUFPS $0xfa, X4, X5 - PSHUFD $0x0f, X7, X6 - PBLENDW $0x33, X6, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X5 - PUNPCKLLQ X11, X5 - PBLENDW $0xc0, X12, X5 - SHUFPS $0xb4, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X11, X6 - PUNPCKHLQ X4, X6 - MOVAPS X12, X4 - PUNPCKLLQ X6, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X4 - PSRLL $0x07, X1 - PSLLL $0x19, X4 - POR X4, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // finalize - PXOR X2, X0 - PXOR X3, X1 - MOVUPS (AX), X4 - PXOR X4, X2 - MOVUPS 16(AX), X4 - PXOR X4, X3 - MOVUPS X0, (SI) - MOVUPS X1, 16(SI) - MOVUPS X2, 32(SI) - MOVUPS X3, 48(SI) - RET diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go @@ -1,9 +0,0 @@ -// +build !amd64 - -package compress_sse41 - -import "github.com/zeebo/blake3/internal/alg/compress/compress_pure" - -func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) { - compress_pure.Compress(chain, block, counter, blen, flags, out) -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go @@ -1,6 +0,0 @@ -// +build amd64 - -package compress_sse41 - -//go:noescape -func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go @@ -1,23 +0,0 @@ -package hash - -import ( - "github.com/zeebo/blake3/internal/alg/hash/hash_avx2" - "github.com/zeebo/blake3/internal/alg/hash/hash_pure" - "github.com/zeebo/blake3/internal/consts" -) - -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - if consts.HasAVX2 && length > 2*consts.ChunkLen { - hash_avx2.HashF(input, length, counter, flags, key, out, chain) - } else { - hash_pure.HashF(input, length, counter, flags, key, out, chain) - } -} - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - if consts.HasAVX2 && n >= 2 { - hash_avx2.HashP(left, right, flags, key, out, n) - } else { - hash_pure.HashP(left, right, flags, key, out, n) - } -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s @@ -1,2561 +0,0 @@ -// Code generated by command: go run main.go. DO NOT EDIT. - -#include "textflag.h" - -DATA iv<>+0(SB)/4, $0x6a09e667 -DATA iv<>+4(SB)/4, $0xbb67ae85 -DATA iv<>+8(SB)/4, $0x3c6ef372 -DATA iv<>+12(SB)/4, $0xa54ff53a -DATA iv<>+16(SB)/4, $0x510e527f -DATA iv<>+20(SB)/4, $0x9b05688c -DATA iv<>+24(SB)/4, $0x1f83d9ab -DATA iv<>+28(SB)/4, $0x5be0cd19 -GLOBL iv<>(SB), RODATA|NOPTR, $32 - -DATA rot16_shuf<>+0(SB)/1, $0x02 -DATA rot16_shuf<>+1(SB)/1, $0x03 -DATA rot16_shuf<>+2(SB)/1, $0x00 -DATA rot16_shuf<>+3(SB)/1, $0x01 -DATA rot16_shuf<>+4(SB)/1, $0x06 -DATA rot16_shuf<>+5(SB)/1, $0x07 -DATA rot16_shuf<>+6(SB)/1, $0x04 -DATA rot16_shuf<>+7(SB)/1, $0x05 -DATA rot16_shuf<>+8(SB)/1, $0x0a -DATA rot16_shuf<>+9(SB)/1, $0x0b -DATA rot16_shuf<>+10(SB)/1, $0x08 -DATA rot16_shuf<>+11(SB)/1, $0x09 -DATA rot16_shuf<>+12(SB)/1, $0x0e -DATA rot16_shuf<>+13(SB)/1, $0x0f -DATA rot16_shuf<>+14(SB)/1, $0x0c -DATA rot16_shuf<>+15(SB)/1, $0x0d -DATA rot16_shuf<>+16(SB)/1, $0x12 -DATA rot16_shuf<>+17(SB)/1, $0x13 -DATA rot16_shuf<>+18(SB)/1, $0x10 -DATA rot16_shuf<>+19(SB)/1, $0x11 -DATA rot16_shuf<>+20(SB)/1, $0x16 -DATA rot16_shuf<>+21(SB)/1, $0x17 -DATA rot16_shuf<>+22(SB)/1, $0x14 -DATA rot16_shuf<>+23(SB)/1, $0x15 -DATA rot16_shuf<>+24(SB)/1, $0x1a -DATA rot16_shuf<>+25(SB)/1, $0x1b -DATA rot16_shuf<>+26(SB)/1, $0x18 -DATA rot16_shuf<>+27(SB)/1, $0x19 -DATA rot16_shuf<>+28(SB)/1, $0x1e -DATA rot16_shuf<>+29(SB)/1, $0x1f -DATA rot16_shuf<>+30(SB)/1, $0x1c -DATA rot16_shuf<>+31(SB)/1, $0x1d -GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32 - -DATA rot8_shuf<>+0(SB)/1, $0x01 -DATA rot8_shuf<>+1(SB)/1, $0x02 -DATA rot8_shuf<>+2(SB)/1, $0x03 -DATA rot8_shuf<>+3(SB)/1, $0x00 -DATA rot8_shuf<>+4(SB)/1, $0x05 -DATA rot8_shuf<>+5(SB)/1, $0x06 -DATA rot8_shuf<>+6(SB)/1, $0x07 -DATA rot8_shuf<>+7(SB)/1, $0x04 -DATA rot8_shuf<>+8(SB)/1, $0x09 -DATA rot8_shuf<>+9(SB)/1, $0x0a -DATA rot8_shuf<>+10(SB)/1, $0x0b -DATA rot8_shuf<>+11(SB)/1, $0x08 -DATA rot8_shuf<>+12(SB)/1, $0x0d -DATA rot8_shuf<>+13(SB)/1, $0x0e -DATA rot8_shuf<>+14(SB)/1, $0x0f -DATA rot8_shuf<>+15(SB)/1, $0x0c -DATA rot8_shuf<>+16(SB)/1, $0x11 -DATA rot8_shuf<>+17(SB)/1, $0x12 -DATA rot8_shuf<>+18(SB)/1, $0x13 -DATA rot8_shuf<>+19(SB)/1, $0x10 -DATA rot8_shuf<>+20(SB)/1, $0x15 -DATA rot8_shuf<>+21(SB)/1, $0x16 -DATA rot8_shuf<>+22(SB)/1, $0x17 -DATA rot8_shuf<>+23(SB)/1, $0x14 -DATA rot8_shuf<>+24(SB)/1, $0x19 -DATA rot8_shuf<>+25(SB)/1, $0x1a -DATA rot8_shuf<>+26(SB)/1, $0x1b -DATA rot8_shuf<>+27(SB)/1, $0x18 -DATA rot8_shuf<>+28(SB)/1, $0x1d -DATA rot8_shuf<>+29(SB)/1, $0x1e -DATA rot8_shuf<>+30(SB)/1, $0x1f -DATA rot8_shuf<>+31(SB)/1, $0x1c -GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32 - -DATA block_len<>+0(SB)/4, $0x00000040 -DATA block_len<>+4(SB)/4, $0x00000040 -DATA block_len<>+8(SB)/4, $0x00000040 -DATA block_len<>+12(SB)/4, $0x00000040 -DATA block_len<>+16(SB)/4, $0x00000040 -DATA block_len<>+20(SB)/4, $0x00000040 -DATA block_len<>+24(SB)/4, $0x00000040 -DATA block_len<>+28(SB)/4, $0x00000040 -GLOBL block_len<>(SB), RODATA|NOPTR, $32 - -DATA zero<>+0(SB)/4, $0x00000000 -DATA zero<>+4(SB)/4, $0x00000000 -DATA zero<>+8(SB)/4, $0x00000000 -DATA zero<>+12(SB)/4, $0x00000000 -DATA zero<>+16(SB)/4, $0x00000000 -DATA zero<>+20(SB)/4, $0x00000000 -DATA zero<>+24(SB)/4, $0x00000000 -DATA zero<>+28(SB)/4, $0x00000000 -GLOBL zero<>(SB), RODATA|NOPTR, $32 - -DATA counter<>+0(SB)/8, $0x0000000000000000 -DATA counter<>+8(SB)/8, $0x0000000000000001 -DATA counter<>+16(SB)/8, $0x0000000000000002 -DATA counter<>+24(SB)/8, $0x0000000000000003 -DATA counter<>+32(SB)/8, $0x0000000000000004 -DATA counter<>+40(SB)/8, $0x0000000000000005 -DATA counter<>+48(SB)/8, $0x0000000000000006 -DATA counter<>+56(SB)/8, $0x0000000000000007 -GLOBL counter<>(SB), RODATA|NOPTR, $64 - -// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32) -// Requires: AVX, AVX2 -TEXT ·HashF(SB), $688-56 - MOVQ input+0(FP), AX - MOVQ length+8(FP), CX - MOVQ counter+16(FP), DX - MOVL flags+24(FP), BX - MOVQ key+32(FP), BP - MOVQ out+40(FP), SI - MOVQ chain+48(FP), DI - - // Allocate local space and align it - LEAQ 31(SP), R10 - MOVQ $0x000000000000001f, R8 - NOTQ R8 - ANDQ R8, R10 - - // Skip if the length is zero - XORQ R8, R8 - XORQ R9, R9 - TESTQ CX, CX - JZ skip_compute - - // Compute complete chunks and blocks - SUBQ $0x01, CX - MOVQ CX, R8 - SHRQ $0x0a, R8 - MOVQ CX, R9 - ANDQ $0x000003c0, R9 - -skip_compute: - // Load some params into the stack (avo improvment?) - MOVL BX, 64(SP) - MOVQ DX, 72(SP) - - // Load IV into vectors - VPBROADCASTD (BP), Y0 - VPBROADCASTD 4(BP), Y1 - VPBROADCASTD 8(BP), Y2 - VPBROADCASTD 12(BP), Y3 - VPBROADCASTD 16(BP), Y4 - VPBROADCASTD 20(BP), Y5 - VPBROADCASTD 24(BP), Y6 - VPBROADCASTD 28(BP), Y7 - - // Build and store counter data on the stack - VPBROADCASTQ 72(SP), Y8 - VPADDQ counter<>+0(SB), Y8, Y8 - VPBROADCASTQ 72(SP), Y9 - VPADDQ counter<>+32(SB), Y9, Y9 - VPUNPCKLDQ Y9, Y8, Y10 - VPUNPCKHDQ Y9, Y8, Y8 - VPUNPCKLDQ Y8, Y10, Y9 - VPUNPCKHDQ Y8, Y10, Y8 - VPERMQ $0xd8, Y9, Y9 - VPERMQ $0xd8, Y8, Y8 - VMOVDQU Y9, 112(SP) - VMOVDQU Y8, 144(SP) - - // Set up block flags and variables for iteration - XORQ CX, CX - ORL $0x01, 64(SP) - -loop: - // Include end flags if last block - CMPQ CX, $0x000003c0 - JNE round_setup - ORL $0x02, 64(SP) - -round_setup: - // Load and transpose message vectors - VMOVDQU (AX)(CX*1), Y8 - VMOVDQU 1024(AX)(CX*1), Y9 - VMOVDQU 2048(AX)(CX*1), Y10 - VMOVDQU 3072(AX)(CX*1), Y11 - VMOVDQU 4096(AX)(CX*1), Y12 - VMOVDQU 5120(AX)(CX*1), Y13 - VMOVDQU 6144(AX)(CX*1), Y14 - VMOVDQU 7168(AX)(CX*1), Y15 - VMOVDQA Y0, (R10) - VPUNPCKLDQ Y9, Y8, Y0 - VPUNPCKHDQ Y9, Y8, Y8 - VPUNPCKLDQ Y11, Y10, Y9 - VPUNPCKHDQ Y11, Y10, Y10 - VPUNPCKLDQ Y13, Y12, Y11 - VPUNPCKHDQ Y13, Y12, Y12 - VPUNPCKLDQ Y15, Y14, Y13 - VPUNPCKHDQ Y15, Y14, Y14 - VPUNPCKLQDQ Y9, Y0, Y15 - VPUNPCKHQDQ Y9, Y0, Y0 - VPUNPCKLQDQ Y10, Y8, Y9 - VPUNPCKHQDQ Y10, Y8, Y8 - VPUNPCKLQDQ Y13, Y11, Y10 - VPUNPCKHQDQ Y13, Y11, Y11 - VPUNPCKLQDQ Y14, Y12, Y13 - VPUNPCKHQDQ Y14, Y12, Y12 - VINSERTI128 $0x01, X10, Y15, Y14 - VPERM2I128 $0x31, Y10, Y15, Y10 - VINSERTI128 $0x01, X11, Y0, Y15 - VPERM2I128 $0x31, Y11, Y0, Y0 - VINSERTI128 $0x01, X13, Y9, Y11 - VPERM2I128 $0x31, Y13, Y9, Y9 - VINSERTI128 $0x01, X12, Y8, Y13 - VPERM2I128 $0x31, Y12, Y8, Y8 - VMOVDQU Y14, 176(SP) - VMOVDQU Y15, 208(SP) - VMOVDQU Y11, 240(SP) - VMOVDQU Y13, 272(SP) - VMOVDQU Y10, 304(SP) - VMOVDQU Y0, 336(SP) - VMOVDQU Y9, 368(SP) - VMOVDQU Y8, 400(SP) - VMOVDQU 32(AX)(CX*1), Y0 - VMOVDQU 1056(AX)(CX*1), Y8 - VMOVDQU 2080(AX)(CX*1), Y9 - VMOVDQU 3104(AX)(CX*1), Y10 - VMOVDQU 4128(AX)(CX*1), Y11 - VMOVDQU 5152(AX)(CX*1), Y12 - VMOVDQU 6176(AX)(CX*1), Y13 - VMOVDQU 7200(AX)(CX*1), Y14 - VPUNPCKLDQ Y8, Y0, Y15 - VPUNPCKHDQ Y8, Y0, Y0 - VPUNPCKLDQ Y10, Y9, Y8 - VPUNPCKHDQ Y10, Y9, Y9 - VPUNPCKLDQ Y12, Y11, Y10 - VPUNPCKHDQ Y12, Y11, Y11 - VPUNPCKLDQ Y14, Y13, Y12 - VPUNPCKHDQ Y14, Y13, Y13 - VPUNPCKLQDQ Y8, Y15, Y14 - VPUNPCKHQDQ Y8, Y15, Y8 - VPUNPCKLQDQ Y9, Y0, Y15 - VPUNPCKHQDQ Y9, Y0, Y0 - VPUNPCKLQDQ Y12, Y10, Y9 - VPUNPCKHQDQ Y12, Y10, Y10 - VPUNPCKLQDQ Y13, Y11, Y12 - VPUNPCKHQDQ Y13, Y11, Y11 - VINSERTI128 $0x01, X9, Y14, Y13 - VPERM2I128 $0x31, Y9, Y14, Y9 - VINSERTI128 $0x01, X10, Y8, Y14 - VPERM2I128 $0x31, Y10, Y8, Y8 - VINSERTI128 $0x01, X12, Y15, Y10 - VPERM2I128 $0x31, Y12, Y15, Y12 - VINSERTI128 $0x01, X11, Y0, Y15 - VPERM2I128 $0x31, Y11, Y0, Y0 - VMOVDQU Y13, 432(SP) - VMOVDQU Y14, 464(SP) - VMOVDQU Y10, 496(SP) - VMOVDQU Y15, 528(SP) - VMOVDQU Y9, 560(SP) - VMOVDQU Y8, 592(SP) - VMOVDQU Y12, 624(SP) - VMOVDQU Y0, 656(SP) - - // Load constants for the round - VMOVDQA (R10), Y0 - VMOVDQU block_len<>+0(SB), Y8 - VPBROADCASTD 64(SP), Y9 - VPBROADCASTD iv<>+0(SB), Y10 - VPBROADCASTD iv<>+4(SB), Y11 - VPBROADCASTD iv<>+8(SB), Y12 - VPBROADCASTD iv<>+12(SB), Y13 - VMOVDQU 112(SP), Y14 - VMOVDQU 144(SP), Y15 - - // Save state for partial chunk if necessary - CMPQ CX, R9 - JNE begin_rounds - VMOVDQU Y0, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, (DI) - VMOVDQU Y1, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 4(DI) - VMOVDQU Y2, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 8(DI) - VMOVDQU Y3, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 12(DI) - VMOVDQU Y4, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 16(DI) - VMOVDQU Y5, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 20(DI) - VMOVDQU Y6, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 24(DI) - VMOVDQU Y7, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 28(DI) - -begin_rounds: - // Perform the rounds - // Round 1 - VPADDD 176(SP), Y0, Y0 - VPADDD 240(SP), Y1, Y1 - VPADDD 304(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y0, Y0 - VPXOR Y0, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y7, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y4, Y4 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y5, Y5 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y6, Y6 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y7, Y7 - VMOVDQA Y0, (R10) - VPSRLD $0x0c, Y4, Y0 - VPSLLD $0x14, Y4, Y4 - VPOR Y0, Y4, Y0 - VPSRLD $0x0c, Y5, Y4 - VPSLLD $0x14, Y5, Y5 - VPOR Y4, Y5, Y4 - VPSRLD $0x0c, Y6, Y5 - VPSLLD $0x14, Y6, Y6 - VPOR Y5, Y6, Y5 - VPSRLD $0x0c, Y7, Y6 - VPSLLD $0x14, Y7, Y7 - VPOR Y6, Y7, Y6 - VMOVDQA (R10), Y7 - VPADDD 208(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 336(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 432(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 560(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 464(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 592(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 2 - VMOVDQA (R10), Y7 - VPADDD 240(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 400(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 368(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 176(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 208(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 464(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 528(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 624(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 3 - VMOVDQA (R10), Y7 - VPADDD 272(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 592(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 304(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 240(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 368(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 528(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 336(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 656(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 4 - VMOVDQA (R10), Y7 - VPADDD 496(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 624(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 400(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 272(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 304(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 336(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 176(SP), Y7, Y7 - VPADDD 240(SP), Y1, Y1 - VPADDD 432(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 5 - VMOVDQA (R10), Y7 - VPADDD 560(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 656(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 592(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 496(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 400(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 176(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 240(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 208(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 6 - VMOVDQA (R10), Y7 - VPADDD 464(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 432(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 624(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 560(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 592(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 240(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 272(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 368(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 7 - VMOVDQA (R10), Y7 - VPADDD 528(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 208(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 656(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 464(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 624(SP), Y7, Y7 - VPADDD 240(SP), Y1, Y1 - VPADDD 272(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 496(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 304(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Finalize rounds - VPXOR Y9, Y6, Y6 - VPXOR (R10), Y10, Y7 - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y2, Y2 - VPXOR Y13, Y3, Y3 - VPXOR Y14, Y0, Y0 - VPXOR Y15, Y4, Y4 - VPXOR Y8, Y5, Y5 - - // Fix up registers for next iteration - VMOVDQU Y7, Y8 - VMOVDQU Y6, Y7 - VMOVDQU Y5, Y6 - VMOVDQU Y4, Y5 - VMOVDQU Y0, Y4 - VMOVDQU Y8, Y0 - - // If we have zero complete chunks, we're done - CMPQ R8, $0x00 - JNE loop_trailer - CMPQ R9, CX - JEQ finalize - -loop_trailer: - // Increment, reset flags, and loop - CMPQ CX, $0x000003c0 - JEQ finalize - ADDQ $0x40, CX - MOVL BX, 64(SP) - JMP loop - -finalize: - // Store result into out - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - VMOVDQU Y2, 64(SI) - VMOVDQU Y3, 96(SI) - VMOVDQU Y4, 128(SI) - VMOVDQU Y5, 160(SI) - VMOVDQU Y6, 192(SI) - VMOVDQU Y7, 224(SI) - VZEROUPPER - RET - -// func HashP(left *[32]uint32, right *[32]uint32, flags uint8, key *[8]uint32, out *[32]uint32, n int) -// Requires: AVX, AVX2 -TEXT ·HashP(SB), NOSPLIT, $72-48 - MOVQ left+0(FP), AX - MOVQ right+8(FP), CX - MOVBLZX flags+16(FP), DX - MOVQ key+24(FP), BX - MOVQ out+32(FP), BP - - // Allocate local space and align it - LEAQ 31(SP), SI - MOVQ $0x000000000000001f, DI - NOTQ DI - ANDQ DI, SI - - // Set up flags value - MOVL DX, 64(SP) - - // Perform the rounds - // Round 1 - VPBROADCASTD (BX), Y0 - VPADDD (AX), Y0, Y0 - VPBROADCASTD 4(BX), Y1 - VPADDD 64(AX), Y1, Y1 - VPBROADCASTD 8(BX), Y2 - VPADDD 128(AX), Y2, Y2 - VPBROADCASTD 12(BX), Y3 - VPADDD 192(AX), Y3, Y3 - VPBROADCASTD 16(BX), Y4 - VPADDD Y4, Y0, Y0 - VMOVDQU zero<>+0(SB), Y5 - VPXOR Y0, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPBROADCASTD 20(BX), Y6 - VPADDD Y6, Y1, Y1 - VMOVDQU zero<>+0(SB), Y7 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPBROADCASTD 24(BX), Y8 - VPADDD Y8, Y2, Y2 - VMOVDQU block_len<>+0(SB), Y9 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPBROADCASTD 28(BX), Y10 - VPADDD Y10, Y3, Y3 - VPBROADCASTD 64(SP), Y11 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPBROADCASTD iv<>+0(SB), Y12 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPBROADCASTD iv<>+4(SB), Y13 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y6, Y6 - VPBROADCASTD iv<>+8(SB), Y14 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y8, Y8 - VPBROADCASTD iv<>+12(SB), Y15 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y10, Y10 - VMOVDQA Y0, (SI) - VPSRLD $0x0c, Y4, Y0 - VPSLLD $0x14, Y4, Y4 - VPOR Y0, Y4, Y0 - VPSRLD $0x0c, Y6, Y4 - VPSLLD $0x14, Y6, Y6 - VPOR Y4, Y6, Y4 - VPSRLD $0x0c, Y8, Y6 - VPSLLD $0x14, Y8, Y8 - VPOR Y6, Y8, Y6 - VPSRLD $0x0c, Y10, Y8 - VPSLLD $0x14, Y10, Y10 - VPOR Y8, Y10, Y8 - VMOVDQA (SI), Y10 - VPADDD 32(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 160(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD (CX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 128(CX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 32(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 160(CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 2 - VMOVDQA (SI), Y10 - VPADDD 64(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 224(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD (AX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 32(AX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 32(CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 96(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 192(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 3 - VMOVDQA (SI), Y10 - VPADDD 96(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 160(CX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 128(AX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 64(AX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(AX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 96(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 160(AX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 224(CX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 4 - VMOVDQA (SI), Y10 - VPADDD 64(CX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 192(CX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(AX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 96(AX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 128(AX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 160(AX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD (AX), Y10, Y10 - VPADDD 64(AX), Y1, Y1 - VPADDD (CX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 5 - VMOVDQA (SI), Y10 - VPADDD 128(CX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 224(CX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 160(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 64(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(AX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD (AX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 64(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 32(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 6 - VMOVDQA (SI), Y10 - VPADDD 32(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD (CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 128(CX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 160(CX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 64(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 96(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 192(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 7 - VMOVDQA (SI), Y10 - VPADDD 96(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 32(AX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(CX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 32(CX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(CX), Y10, Y10 - VPADDD 64(AX), Y1, Y1 - VPADDD 96(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 64(CX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 128(AX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Finalize - VPXOR (SI), Y12, Y10 - VPXOR Y13, Y1, Y1 - VPXOR Y14, Y2, Y2 - VPXOR Y15, Y3, Y3 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y6, Y5 - VPXOR Y11, Y8, Y6 - - // Store result into out - VMOVDQU Y10, (BP) - VMOVDQU Y1, 32(BP) - VMOVDQU Y2, 64(BP) - VMOVDQU Y3, 96(BP) - VMOVDQU Y0, 128(BP) - VMOVDQU Y4, 160(BP) - VMOVDQU Y5, 192(BP) - VMOVDQU Y6, 224(BP) - VZEROUPPER - RET diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go @@ -1,13 +0,0 @@ -// +build !amd64 - -package hash_avx2 - -import "github.com/zeebo/blake3/internal/alg/hash/hash_pure" - -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - hash_pure.HashF(input, length, counter, flags, key, out, chain) -} - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - hash_pure.HashP(left, right, flags, key, out, n) -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go @@ -1,9 +0,0 @@ -// +build amd64 - -package hash_avx2 - -//go:noescape -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) - -//go:noescape -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go @@ -1,56 +0,0 @@ -package hash_pure - -import ( - "unsafe" - - "github.com/zeebo/blake3/internal/alg/compress" - "github.com/zeebo/blake3/internal/consts" - "github.com/zeebo/blake3/internal/utils" -) - -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - var tmp [16]uint32 - - for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ { - bchain := *key - bflags := flags | consts.Flag_ChunkStart - start := consts.ChunkLen * i - - for n := uint64(0); n < 16; n++ { - if n == 15 { - bflags |= consts.Flag_ChunkEnd - } - if start+64*n >= length { - break - } - if start+64+64*n >= length { - *chain = bchain - } - - var blockPtr *[16]uint32 - if consts.IsLittleEndian { - blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])) - } else { - var block [16]uint32 - utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block) - blockPtr = &block - } - - compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp) - - bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0])) - bflags = flags - } - - out[i+0] = bchain[0] - out[i+8] = bchain[1] - out[i+16] = bchain[2] - out[i+24] = bchain[3] - out[i+32] = bchain[4] - out[i+40] = bchain[5] - out[i+48] = bchain[6] - out[i+56] = bchain[7] - - counter++ - } -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go @@ -1,38 +0,0 @@ -package hash_pure - -import "github.com/zeebo/blake3/internal/alg/compress" - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - var tmp [16]uint32 - var block [16]uint32 - - for i := 0; i < n && i < 8; i++ { - block[0] = left[i+0] - block[1] = left[i+8] - block[2] = left[i+16] - block[3] = left[i+24] - block[4] = left[i+32] - block[5] = left[i+40] - block[6] = left[i+48] - block[7] = left[i+56] - block[8] = right[i+0] - block[9] = right[i+8] - block[10] = right[i+16] - block[11] = right[i+24] - block[12] = right[i+32] - block[13] = right[i+40] - block[14] = right[i+48] - block[15] = right[i+56] - - compress.Compress(key, &block, 0, 64, flags, &tmp) - - out[i+0] = tmp[0] - out[i+8] = tmp[1] - out[i+16] = tmp[2] - out[i+24] = tmp[3] - out[i+32] = tmp[4] - out[i+40] = tmp[5] - out[i+48] = tmp[6] - out[i+56] = tmp[7] - } -} diff --git a/vendor/github.com/zeebo/blake3/internal/consts/consts.go b/vendor/github.com/zeebo/blake3/internal/consts/consts.go @@ -1,29 +0,0 @@ -package consts - -var IV = [...]uint32{IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7} - -const ( - IV0 = 0x6A09E667 - IV1 = 0xBB67AE85 - IV2 = 0x3C6EF372 - IV3 = 0xA54FF53A - IV4 = 0x510E527F - IV5 = 0x9B05688C - IV6 = 0x1F83D9AB - IV7 = 0x5BE0CD19 -) - -const ( - Flag_ChunkStart uint32 = 1 << 0 - Flag_ChunkEnd uint32 = 1 << 1 - Flag_Parent uint32 = 1 << 2 - Flag_Root uint32 = 1 << 3 - Flag_Keyed uint32 = 1 << 4 - Flag_DeriveKeyContext uint32 = 1 << 5 - Flag_DeriveKeyMaterial uint32 = 1 << 6 -) - -const ( - BlockLen = 64 - ChunkLen = 1024 -) diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu.go @@ -1,17 +0,0 @@ -package consts - -import ( - "os" - - "golang.org/x/sys/cpu" -) - -var ( - HasAVX2 = cpu.X86.HasAVX2 && - os.Getenv("BLAKE3_DISABLE_AVX2") == "" && - os.Getenv("BLAKE3_PUREGO") == "" - - HasSSE41 = cpu.X86.HasSSE41 && - os.Getenv("BLAKE3_DISABLE_SSE41") == "" && - os.Getenv("BLAKE3_PUREGO") == "" -) diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go @@ -1,5 +0,0 @@ -// +build mips mips64 ppc64 s390x - -package consts - -const IsLittleEndian = false diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go @@ -1,5 +0,0 @@ -// +build amd64 386 arm arm64 mipsle mips64le ppc64le riscv64 wasm - -package consts - -const IsLittleEndian = true diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go @@ -1,7 +0,0 @@ -// +build !mips,!mips64,!ppc64,!s390x,!amd64,!386,!arm,!arm64,!mipsle,!mips64le,!ppc64le,!riscv64,!wasm - -package consts - -import "unsafe" - -var IsLittleEndian = *(*uint16)(unsafe.Pointer(&[2]byte{0, 1})) != 1 diff --git a/vendor/github.com/zeebo/blake3/internal/utils/utils.go b/vendor/github.com/zeebo/blake3/internal/utils/utils.go @@ -1,60 +0,0 @@ -package utils - -import ( - "encoding/binary" - "unsafe" -) - -func SliceToArray32(bytes []byte) *[32]uint8 { return (*[32]uint8)(unsafe.Pointer(&bytes[0])) } -func SliceToArray64(bytes []byte) *[64]uint8 { return (*[64]uint8)(unsafe.Pointer(&bytes[0])) } - -func BytesToWords(bytes *[64]uint8, words *[16]uint32) { - words[0] = binary.LittleEndian.Uint32(bytes[0*4:]) - words[1] = binary.LittleEndian.Uint32(bytes[1*4:]) - words[2] = binary.LittleEndian.Uint32(bytes[2*4:]) - words[3] = binary.LittleEndian.Uint32(bytes[3*4:]) - words[4] = binary.LittleEndian.Uint32(bytes[4*4:]) - words[5] = binary.LittleEndian.Uint32(bytes[5*4:]) - words[6] = binary.LittleEndian.Uint32(bytes[6*4:]) - words[7] = binary.LittleEndian.Uint32(bytes[7*4:]) - words[8] = binary.LittleEndian.Uint32(bytes[8*4:]) - words[9] = binary.LittleEndian.Uint32(bytes[9*4:]) - words[10] = binary.LittleEndian.Uint32(bytes[10*4:]) - words[11] = binary.LittleEndian.Uint32(bytes[11*4:]) - words[12] = binary.LittleEndian.Uint32(bytes[12*4:]) - words[13] = binary.LittleEndian.Uint32(bytes[13*4:]) - words[14] = binary.LittleEndian.Uint32(bytes[14*4:]) - words[15] = binary.LittleEndian.Uint32(bytes[15*4:]) -} - -func WordsToBytes(words *[16]uint32, bytes []byte) { - bytes = bytes[:64] - binary.LittleEndian.PutUint32(bytes[0*4:1*4], words[0]) - binary.LittleEndian.PutUint32(bytes[1*4:2*4], words[1]) - binary.LittleEndian.PutUint32(bytes[2*4:3*4], words[2]) - binary.LittleEndian.PutUint32(bytes[3*4:4*4], words[3]) - binary.LittleEndian.PutUint32(bytes[4*4:5*4], words[4]) - binary.LittleEndian.PutUint32(bytes[5*4:6*4], words[5]) - binary.LittleEndian.PutUint32(bytes[6*4:7*4], words[6]) - binary.LittleEndian.PutUint32(bytes[7*4:8*4], words[7]) - binary.LittleEndian.PutUint32(bytes[8*4:9*4], words[8]) - binary.LittleEndian.PutUint32(bytes[9*4:10*4], words[9]) - binary.LittleEndian.PutUint32(bytes[10*4:11*4], words[10]) - binary.LittleEndian.PutUint32(bytes[11*4:12*4], words[11]) - binary.LittleEndian.PutUint32(bytes[12*4:13*4], words[12]) - binary.LittleEndian.PutUint32(bytes[13*4:14*4], words[13]) - binary.LittleEndian.PutUint32(bytes[14*4:15*4], words[14]) - binary.LittleEndian.PutUint32(bytes[15*4:16*4], words[15]) -} - -func KeyFromBytes(key []byte, out *[8]uint32) { - key = key[:32] - out[0] = binary.LittleEndian.Uint32(key[0:]) - out[1] = binary.LittleEndian.Uint32(key[4:]) - out[2] = binary.LittleEndian.Uint32(key[8:]) - out[3] = binary.LittleEndian.Uint32(key[12:]) - out[4] = binary.LittleEndian.Uint32(key[16:]) - out[5] = binary.LittleEndian.Uint32(key[20:]) - out[6] = binary.LittleEndian.Uint32(key[24:]) - out[7] = binary.LittleEndian.Uint32(key[28:]) -} diff --git a/vendor/modules.txt b/vendor/modules.txt @@ -13,19 +13,18 @@ codeberg.org/gruf/go-format # codeberg.org/gruf/go-hashenc v1.0.1 ## explicit; go 1.16 codeberg.org/gruf/go-hashenc -# codeberg.org/gruf/go-mutexes v1.0.1 +# codeberg.org/gruf/go-mutexes v1.1.0 ## explicit; go 1.14 codeberg.org/gruf/go-mutexes # codeberg.org/gruf/go-nowish v1.1.0 ## explicit; go 1.14 -codeberg.org/gruf/go-nowish # codeberg.org/gruf/go-pools v1.0.2 ## explicit; go 1.16 codeberg.org/gruf/go-pools # codeberg.org/gruf/go-runners v1.2.0 ## explicit; go 1.14 codeberg.org/gruf/go-runners -# codeberg.org/gruf/go-store v1.2.2 +# codeberg.org/gruf/go-store v1.3.2 ## explicit; go 1.14 codeberg.org/gruf/go-store/kv codeberg.org/gruf/go-store/storage @@ -524,16 +523,6 @@ github.com/vmihailenco/tagparser/v2/internal/parser github.com/wagslane/go-password-validator # github.com/zeebo/blake3 v0.2.1 ## explicit; go 1.13 -github.com/zeebo/blake3 -github.com/zeebo/blake3/internal/alg -github.com/zeebo/blake3/internal/alg/compress -github.com/zeebo/blake3/internal/alg/compress/compress_pure -github.com/zeebo/blake3/internal/alg/compress/compress_sse41 -github.com/zeebo/blake3/internal/alg/hash -github.com/zeebo/blake3/internal/alg/hash/hash_avx2 -github.com/zeebo/blake3/internal/alg/hash/hash_pure -github.com/zeebo/blake3/internal/consts -github.com/zeebo/blake3/internal/utils # golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b ## explicit; go 1.17 golang.org/x/crypto/acme