encode_better.go
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package s2

import (
	"bytes"
	"fmt"
	"math/bits"
)

// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4(u uint64, h uint8) uint32 {
	const prime4bytes = 2654435761
	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
}

// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash5(u uint64, h uint8) uint32 {
	const prime5bytes = 889523592379
	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
}

// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
	const prime7bytes = 58295818150454627
	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
}

// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
	const prime8bytes = 0xcf1bbcdcb7a56463
	return uint32((u * prime8bytes) >> ((64 - h) & 63))
}

// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on the first attempt.
	repeat := 0

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
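			// cv, valLong and valShort are full 8-byte little-endian loads
			// (via load64), so whole-word equality below implies a match of
			// at least 8 bytes, well above the 4-byte minimum.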
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1
			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
			// Note: the `false &&` below deliberately disables this branch;
			// it is kept for reference.
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// Same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing the offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}
				// Index in-between
				index0 := base + 1
				index1 := s - 2

				cv = load64(src, s)
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// The long hash covers 7 bytes, so a hit there likely matches at
			// least 7; take it.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}
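		// A copy with an offset above 65535 needs the 4-byte-offset form,
		// which costs 5 bytes; a match of 5 bytes or less therefore gains
		// nothing over plain literals, so skip it unless the offset matches
		// the current repeat.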
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal to or worse than the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		// Bail if we do not have space for more.
		if d > dstLimit {
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
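
// emitMatchSketch is an illustrative sketch, not part of the original file:
// it restates how encodeBlockBetterGo above chooses between the repeat code
// and a full copy. emitRepeat and emitCopy are this package's emitters; the
// function name and signature here are hypothetical.
func emitMatchSketch(dst []byte, repeat *int, offset, length int) int {
	if offset == *repeat {
		// Same offset as the previous copy: the repeat code omits the
		// offset bytes and is therefore never larger than a copy.
		return emitRepeat(dst, offset, length)
	}
	*repeat = offset
	return emitCopy(dst, offset, length)
}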
// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 16
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on the first attempt.
	repeat := 0
	const maxSkip = 100

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = (s-nextEmit)>>7 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}

			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			if uint32(cv) == load32(src, candidateL) {
				break
			}

			// Check our short candidate
			if uint32(cv) == load32(src, candidateS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal to or worse than the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		// Snappy has no repeat code, so always emit a plain copy.
		d += emitCopyNoRepeat(dst[d:], offset, s-base)
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		// Bail if we do not have space for more.
		if d > dstLimit {
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
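
// skipSketch is an illustrative, standalone restatement (not part of the
// original file) of the skipping heuristic used by the encoders in this
// file: the further the search has run since the last emit without finding
// a match, the more bytes are skipped per probe. The Snappy-compatible
// variant above additionally caps the step at maxSkip. The name is
// hypothetical.
func skipSketch(s, nextEmit, maxSkip int) int {
	step := (s-nextEmit)>>7 + 1
	if step > maxSkip {
		step = maxSkip
	}
	return s + step
}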
// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits

		maxAhead = 8 // maximum bytes ahead without checking sLimit

		debug = false
	)

	sLimit := len(src) - inputMargin
	if sLimit > MaxDictSrcOffset-maxAhead {
		sLimit = MaxDictSrcOffset - maxAhead
	}
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	dict.initBetter()

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// With a dictionary, the encoded form may start with a copy or repeat
	// from the dictionary, so we start looking for hash matches at s == 0.
	s := 0
	cv := load64(src, s)

	// Initialize repeat from the dictionary's stored repeat offset.
	repeat := len(dict.dict) - dict.repeat

	// While in dict
searchDict:
	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				break searchDict
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			dictL := int(dict.betterTableLong[hashL])
			dictS := int(dict.betterTableShort[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if s != 0 {
				if cv == valLong {
					goto emitMatch
				}
				if cv == valShort {
					candidateL = candidateS
					goto emitMatch
				}
			}

			// Check dict repeat.
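			// repeat >= s+4 means the repeat source starts inside the
			// dictionary with at least 4 bytes available to compare
			// (candidate+4 <= len(dict.dict)).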
			if repeat >= s+4 {
				candidate := len(dict.dict) - repeat + s
				if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
					// Extend back
					base := s
					for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
						i--
						base--
					}
					d += emitLiteral(dst[d:], src[nextEmit:base])
					if debug && nextEmit != base {
						fmt.Println("emitted ", base-nextEmit, "literals")
					}
					s += 4
					candidate += 4
					for candidate < len(dict.dict)-8 && s <= len(src)-8 {
						if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
							s += bits.TrailingZeros64(diff) >> 3
							break
						}
						s += 8
						candidate += 8
					}
					d += emitRepeat(dst[d:], repeat, s-base)
					if debug {
						fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
					}
					nextEmit = s
					if s >= sLimit {
						break searchDict
					}
					// Index in-between
					index0 := base + 1
					index1 := s - 2

					cv = load64(src, s)
					for index0 < index1 {
						cv0 := load64(src, index0)
						cv1 := load64(src, index1)
						lTable[hash7(cv0, lTableBits)] = uint32(index0)
						sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

						lTable[hash7(cv1, lTableBits)] = uint32(index1)
						sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
						index0 += 2
						index1 -= 2
					}
					continue
				}
			}
			// Don't try to find a match at s == 0.
			if s == 0 {
				cv = load64(src, nextS)
				s = nextS
				continue
			}

			// The long hash covers 7 bytes, so a hit there likely matches at
			// least 7; take it.
			if uint32(cv) == uint32(valLong) {
				goto emitMatch
			}

			// Long dict...
			if uint32(cv) == load32(dict.dict, dictL) {
				candidateL = dictL
				goto emitDict
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				// Use our short candidate.
				candidateL = candidateS
				goto emitMatch
			}
			if uint32(cv) == load32(dict.dict, dictS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				candidateL = dictS
				goto emitDict
			}
			cv = load64(src, nextS)
			s = nextS
		}
	emitDict:
		{
			if debug {
				if load32(dict.dict, candidateL) != load32(src, s) {
					panic("dict emit mismatch")
				}
			}
			// Extend backwards.
			// The top bytes will be rechecked to get the full match.
			for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
				candidateL--
				s--
			}

			// Bail if we exceed the maximum size.
			if d+(s-nextEmit) > dstLimit {
				return 0
			}

			// A 4-byte match has been found. We'll later see if more than 4 bytes
			// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
			// them as literal bytes.

			d += emitLiteral(dst[d:], src[nextEmit:s])
			if debug && nextEmit != s {
				fmt.Println("emitted ", s-nextEmit, "literals")
			}
			{
				// Invariant: we have a 4-byte match at s, and no need to emit any
				// literal bytes prior to s.
				base := s
				offset := s + (len(dict.dict)) - candidateL
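				// candidateL here indexes into dict.dict; offsets are encoded
				// as if the dictionary sat immediately before src, so the
				// distance back from s is s + len(dict.dict) - candidateL.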
				// Extend the 4-byte match as long as possible.
				s += 4
				candidateL += 4
				for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
					if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidateL += 8
				}

				if repeat == offset {
					if debug {
						fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					d += emitRepeat(dst[d:], offset, s-base)
				} else {
					if debug {
						fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					// Matches longer than 64 are split.
					if s <= sLimit || s-base < 8 {
						d += emitCopy(dst[d:], offset, s-base)
					} else {
						// Split to ensure we don't start a copy within the next block.
						d += emitCopy(dst[d:], offset, 4)
						d += emitRepeat(dst[d:], offset, s-base-4)
					}
					repeat = offset
				}
				// Disabled: validate the match (debug aid).
				if false {
					if s <= candidateL {
						panic("s <= candidate")
					}
					a := src[base:s]
					b := dict.dict[base-repeat : base-repeat+(s-base)]
					if !bytes.Equal(a, b) {
						panic("mismatch")
					}
				}

				nextEmit = s
				if s >= sLimit {
					break searchDict
				}

				// Bail if we do not have space for more.
				if d > dstLimit {
					return 0
				}

				// Index short & long
				index0 := base + 1
				index1 := s - 2

				cv0 := load64(src, index0)
				cv1 := load64(src, index1)
				lTable[hash7(cv0, lTableBits)] = uint32(index0)
				sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

				lTable[hash7(cv1, lTableBits)] = uint32(index1)
				sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
				index0 += 1
				index1 -= 1
				cv = load64(src, s)

				// Index every second long in between.
				for index0 < index1 {
					lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
					lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
					index0 += 2
					index1 -= 2
				}
			}
			continue
		}
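	// emitMatch handles candidates that lie within src itself; from here on
	// the logic mirrors the non-dict encoder above.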
	emitMatch:

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal to or worse than the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if debug && nextEmit != s {
			fmt.Println("emitted ", s-nextEmit, "literals")
		}
		if repeat == offset {
			if debug {
				fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			if debug {
				fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		// Bail if we do not have space for more.
		if d > dstLimit {
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

	// Search without dict:
	if repeat > s {
		repeat = 0
	}

	// No more dict
	sLimit = len(src) - inputMargin
	if s >= sLimit {
		goto emitRemainder
	}
	cv = load64(src, s)
	if debug {
		fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
	}
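	// From here on, the dictionary window is behind us and encoding proceeds
	// exactly as in encodeBlockBetterGo; the loop below mirrors that
	// function's main loop.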
	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1
			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
			// Note: the `false &&` below deliberately disables this branch;
			// it is kept for reference.
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// Same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing the offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}
				// Index in-between
				index0 := base + 1
				index1 := s - 2

				cv = load64(src, s)
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// The long hash covers 7 bytes, so a hit there likely matches at
			// least 7; take it.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal to or worse than the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		// Bail if we do not have space for more.
		if d > dstLimit {
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index every second long in between.
		for index0 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
			index0 += 2
			index1 -= 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
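
// encodeBetterSketch is an illustrative, hedged sketch, not part of the
// original file, showing how callers typically drive the block encoders
// above: allocate MaxEncodedLen bytes, write the uncompressed length as a
// uvarint header, then append the block body. A return of 0 from the
// encoder means the block did not compress and should be stored as-is.
// The function name is hypothetical; MaxEncodedLen, maxBlockSize,
// minNonLiteralBlockSize and encodeBlockBetterGo are this package's own
// identifiers.
func encodeBetterSketch(src []byte) []byte {
	if len(src) < minNonLiteralBlockSize || len(src) > maxBlockSize {
		return nil // outside the range these block encoders accept
	}
	dst := make([]byte, MaxEncodedLen(len(src)))
	// Minimal uvarint writer, inlined to keep the sketch self-contained.
	d, v := 0, uint64(len(src))
	for v >= 0x80 {
		dst[d] = byte(v) | 0x80
		v >>= 7
		d++
	}
	dst[d] = byte(v)
	d++
	if n := encodeBlockBetterGo(dst[d:], src); n > 0 {
		return dst[:d+n]
	}
	return nil // incompressible: caller falls back to an uncompressed block
}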