gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

encode_better.go (26953B)


      1 // Copyright 2016 The Snappy-Go Authors. All rights reserved.
      2 // Copyright (c) 2019 Klaus Post. All rights reserved.
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 package s2
      7 
      8 import (
      9 	"bytes"
     10 	"fmt"
     11 	"math/bits"
     12 )
     13 
     14 // hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
     15 // Preferably h should be a constant and should always be <32.
     16 func hash4(u uint64, h uint8) uint32 {
     17 	const prime4bytes = 2654435761
     18 	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
     19 }
     20 
     21 // hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
     22 // Preferably h should be a constant and should always be <64.
     23 func hash5(u uint64, h uint8) uint32 {
     24 	const prime5bytes = 889523592379
     25 	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
     26 }
     27 
     28 // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
     29 // Preferably h should be a constant and should always be <64.
     30 func hash7(u uint64, h uint8) uint32 {
     31 	const prime7bytes = 58295818150454627
     32 	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
     33 }
     34 
     35 // hash8 returns the hash of u to fit in a hash table with h bits.
     36 // Preferably h should be a constant and should always be <64.
     37 func hash8(u uint64, h uint8) uint32 {
     38 	const prime8bytes = 0xcf1bbcdcb7a56463
     39 	return uint32((u * prime8bytes) >> ((64 - h) & 63))
     40 }
     41 
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// The "better" variant keeps two hash tables: a long (7-byte, hash7) table and
// a short (4-byte, hash4) table, preferring longer matches when both hit.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	// Requires roughly 1/32 (~3%) savings plus 6 bytes of headroom.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check.
			// The skip grows with the length of the current literal run,
			// so incompressible data is scanned progressively faster.
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1
			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
			// NOTE: this inline repeat check is disabled by the leading
			// `false &&`; the body below is kept for reference/experiments.
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						// Byte-at-a-time tail compare once fewer than 8 bytes remain.
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}
				// Index in-between
				index0 := base + 1
				index1 := s - 2

				cv = load64(src, s)
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				// Byte-at-a-time tail compare once fewer than 8 bytes remain.
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			// 8 bytes at a time; first differing byte found via trailing zeros.
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		// A short match with a >64K offset costs more to encode than it saves
		// (unless it can use the current repeat offset) — skip it.
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			// Same offset as the previous copy: a repeat code is cheaper.
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		// Flush the trailing bytes as one final literal run.
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
    297 
// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// Unlike encodeBlockBetterGo, this variant produces Snappy-compatible output:
// it always emits plain copies via emitCopyNoRepeat (Snappy has no repeat
// codes — see the Snappy format description).
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 16
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	// Requires roughly 1/32 (~3%) savings plus 6 bytes of headroom.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0
	// Cap on how far a single search step may skip ahead.
	const maxSkip = 100

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check.
			// The skip grows with the literal run length, but is clamped
			// to maxSkip bytes per step.
			nextS = (s-nextEmit)>>7 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}

			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			if uint32(cv) == load32(src, candidateL) {
				break
			}

			// Check our short candidate
			if uint32(cv) == load32(src, candidateS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				// Byte-at-a-time tail compare once fewer than 8 bytes remain.
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			// 8 bytes at a time; first differing byte found via trailing zeros.
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		// A short match with a >64K offset costs more to encode than it
		// saves — skip it.
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		// Snappy output: always a plain copy, never a repeat code.
		d += emitCopyNoRepeat(dst[d:], offset, s-base)
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		// Flush the trailing bytes as one final literal run.
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
    481 
// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// The first phase (searchDict) may resolve matches against dict as well as src;
// once s passes the clamped sLimit, a second dict-free phase identical to
// encodeBlockBetterGo takes over.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits

		maxAhead = 8 // maximum bytes ahead without checking sLimit

		debug = false
	)

	sLimit := len(src) - inputMargin
	// Clamp the dict-search phase so positions stay below MaxDictSrcOffset
	// (keeping maxAhead bytes of lookahead slack).
	if sLimit > MaxDictSrcOffset-maxAhead {
		sLimit = MaxDictSrcOffset - maxAhead
	}
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Lazily build the dict's long/short hash tables.
	dict.initBetter()

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	// Requires roughly 1/32 (~3%) savings plus 6 bytes of headroom.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// Unlike the non-dict encoders we start at s == 0: the dictionary can
	// supply a match for the very first byte. (In-src matches at s == 0 are
	// explicitly skipped below.)
	s := 0
	cv := load64(src, s)

	// Seed repeat from the dictionary's stored repeat, measured back from
	// the end of the dict, so the first match can be repeat-encoded.
	repeat := len(dict.dict) - dict.repeat

	// While in dict
searchDict:
	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check.
			// The skip grows with the length of the current literal run.
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				break searchDict
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			dictL := int(dict.betterTableLong[hashL])
			dictS := int(dict.betterTableShort[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if s != 0 {
				if cv == valLong {
					goto emitMatch
				}
				if cv == valShort {
					candidateL = candidateS
					goto emitMatch
				}
			}

			// Check dict repeat.
			if repeat >= s+4 {
				candidate := len(dict.dict) - repeat + s
				if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
					// Extend back
					base := s
					for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
						i--
						base--
					}
					d += emitLiteral(dst[d:], src[nextEmit:base])
					if debug && nextEmit != base {
						fmt.Println("emitted ", base-nextEmit, "literals")
					}
					s += 4
					candidate += 4
					// Extend forward against the dict, 8 bytes at a time.
					for candidate < len(dict.dict)-8 && s <= len(src)-8 {
						if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
							s += bits.TrailingZeros64(diff) >> 3
							break
						}
						s += 8
						candidate += 8
					}
					d += emitRepeat(dst[d:], repeat, s-base)
					if debug {
						fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
					}
					nextEmit = s
					if s >= sLimit {
						break searchDict
					}
					cv = load64(src, s)
					// Index in-between
					index0 := base + 1
					index1 := s - 2

					// Note: cv was already loaded above; this reload is
					// redundant but harmless.
					cv = load64(src, s)
					for index0 < index1 {
						cv0 := load64(src, index0)
						cv1 := load64(src, index1)
						lTable[hash7(cv0, lTableBits)] = uint32(index0)
						sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

						lTable[hash7(cv1, lTableBits)] = uint32(index1)
						sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
						index0 += 2
						index1 -= 2
					}
					continue
				}
			}
			// Don't try to find match at s==0
			if s == 0 {
				cv = load64(src, nextS)
				s = nextS
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				goto emitMatch
			}

			// Long dict...
			if uint32(cv) == load32(dict.dict, dictL) {
				candidateL = dictL
				goto emitDict
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				// Use our short candidate.
				candidateL = candidateS
				goto emitMatch
			}
			// Short dict candidate, again preferring a long in-src match at s+1.
			if uint32(cv) == load32(dict.dict, dictS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				candidateL = dictS
				goto emitDict
			}
			cv = load64(src, nextS)
			s = nextS
		}
	emitDict:
		// A match against dict content: candidateL is an index into dict.dict.
		{
			if debug {
				if load32(dict.dict, candidateL) != load32(src, s) {
					panic("dict emit mismatch")
				}
			}
			// Extend backwards.
			// The top bytes will be rechecked to get the full match.
			for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
				candidateL--
				s--
			}

			// Bail if we exceed the maximum size.
			if d+(s-nextEmit) > dstLimit {
				return 0
			}

			// A 4-byte match has been found. We'll later see if more than 4 bytes
			// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
			// them as literal bytes.

			d += emitLiteral(dst[d:], src[nextEmit:s])
			if debug && nextEmit != s {
				fmt.Println("emitted ", s-nextEmit, "literals")
			}
			{
				// Invariant: we have a 4-byte match at s, and no need to emit any
				// literal bytes prior to s.
				base := s
				// Offset reaches back through the start of src into the dict.
				offset := s + (len(dict.dict)) - candidateL

				// Extend the 4-byte match as long as possible.
				s += 4
				candidateL += 4
				for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
					if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidateL += 8
				}

				if repeat == offset {
					if debug {
						fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					d += emitRepeat(dst[d:], offset, s-base)
				} else {
					if debug {
						fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					// Matches longer than 64 are split.
					if s <= sLimit || s-base < 8 {
						d += emitCopy(dst[d:], offset, s-base)
					} else {
						// Split to ensure we don't start a copy within next block.
						d += emitCopy(dst[d:], offset, 4)
						d += emitRepeat(dst[d:], offset, s-base-4)
					}
					repeat = offset
				}
				// Disabled self-check, kept for debugging.
				if false {
					// Validate match.
					if s <= candidateL {
						panic("s <= candidate")
					}
					a := src[base:s]
					b := dict.dict[base-repeat : base-repeat+(s-base)]
					if !bytes.Equal(a, b) {
						panic("mismatch")
					}
				}

				nextEmit = s
				if s >= sLimit {
					break searchDict
				}

				if d > dstLimit {
					// Do we have space for more, if not bail.
					return 0
				}

				// Index short & long
				index0 := base + 1
				index1 := s - 2

				cv0 := load64(src, index0)
				cv1 := load64(src, index1)
				lTable[hash7(cv0, lTableBits)] = uint32(index0)
				sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

				lTable[hash7(cv1, lTableBits)] = uint32(index1)
				sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
				index0 += 1
				index1 -= 1
				cv = load64(src, s)

				// index every second long in between.
				for index0 < index1 {
					lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
					lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
					index0 += 2
					index1 -= 2
				}
			}
			continue
		}
	emitMatch:
		// A match within src itself: candidateL is an index into src.

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				// Byte-at-a-time tail compare once fewer than 8 bytes remain.
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		// A short match with a >64K offset costs more to encode than it saves
		// (unless it can use the current repeat offset) — skip it.
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if debug && nextEmit != s {
			fmt.Println("emitted ", s-nextEmit, "literals")
		}
		if repeat == offset {
			if debug {
				fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			if debug {
				fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

	// Search without dict:
	// A dict-relative repeat that reaches before the start of src is no
	// longer representable — reset it.
	if repeat > s {
		repeat = 0
	}

	// No more dict
	sLimit = len(src) - inputMargin
	if s >= sLimit {
		goto emitRemainder
	}
	cv = load64(src, s)
	if debug {
		fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
	}
	// Second phase: identical to encodeBlockBetterGo's main loop.
	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1
			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
			// NOTE: this inline repeat check is disabled by the leading
			// `false &&`; the body below is kept for reference/experiments.
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}
				// Index in-between
				index0 := base + 1
				index1 := s - 2

				cv = load64(src, s)
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		// Flush the trailing bytes as one final literal run.
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}