lz4convert.go (13649B)
1 // Copyright (c) 2022 Klaus Post. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package s2 6 7 import ( 8 "encoding/binary" 9 "errors" 10 "fmt" 11 ) 12 13 // LZ4Converter provides conversion from LZ4 blocks as defined here: 14 // https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md 15 type LZ4Converter struct { 16 } 17 18 // ErrDstTooSmall is returned when provided destination is too small. 19 var ErrDstTooSmall = errors.New("s2: destination too small") 20 21 // ConvertBlock will convert an LZ4 block and append it as an S2 22 // block without block length to dst. 23 // The uncompressed size is returned as well. 24 // dst must have capacity to contain the entire compressed block. 25 func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { 26 if len(src) == 0 { 27 return dst, 0, nil 28 } 29 const debug = false 30 const inline = true 31 const lz4MinMatch = 4 32 33 s, d := 0, len(dst) 34 dst = dst[:cap(dst)] 35 if !debug && hasAmd64Asm { 36 res, sz := cvtLZ4BlockAsm(dst[d:], src) 37 if res < 0 { 38 const ( 39 errCorrupt = -1 40 errDstTooSmall = -2 41 ) 42 switch res { 43 case errCorrupt: 44 return nil, 0, ErrCorrupt 45 case errDstTooSmall: 46 return nil, 0, ErrDstTooSmall 47 default: 48 return nil, 0, fmt.Errorf("unexpected result: %d", res) 49 } 50 } 51 if d+sz > len(dst) { 52 return nil, 0, ErrDstTooSmall 53 } 54 return dst[:d+sz], res, nil 55 } 56 57 dLimit := len(dst) - 10 58 var lastOffset uint16 59 var uncompressed int 60 if debug { 61 fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) 62 } 63 64 for { 65 if s >= len(src) { 66 return dst[:d], 0, ErrCorrupt 67 } 68 // Read literal info 69 token := src[s] 70 ll := int(token >> 4) 71 ml := int(lz4MinMatch + (token & 0xf)) 72 73 // If upper nibble is 15, literal length is extended 74 if token >= 0xf0 { 75 for { 76 s++ 77 if s >= len(src) { 78 if debug { 79 fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) 80 } 81 return dst[:d], 0, ErrCorrupt 82 } 83 val := src[s] 84 ll += int(val) 85 if val != 255 { 86 break 87 } 88 } 89 } 90 // Skip past token 91 if s+ll >= len(src) { 92 if debug { 93 fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) 94 } 95 return nil, 0, ErrCorrupt 96 } 97 s++ 98 if ll > 0 { 99 if d+ll > dLimit { 100 return nil, 0, ErrDstTooSmall 101 } 102 if debug { 103 fmt.Printf("emit %d literals\n", ll) 104 } 105 d += emitLiteralGo(dst[d:], src[s:s+ll]) 106 s += ll 107 uncompressed += ll 108 } 109 110 // Check if we are done... 111 if s == len(src) && ml == lz4MinMatch { 112 break 113 } 114 // 2 byte offset 115 if s >= len(src)-2 { 116 if debug { 117 fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) 118 } 119 return nil, 0, ErrCorrupt 120 } 121 offset := binary.LittleEndian.Uint16(src[s:]) 122 s += 2 123 if offset == 0 { 124 if debug { 125 fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) 126 } 127 return nil, 0, ErrCorrupt 128 } 129 if int(offset) > uncompressed { 130 if debug { 131 fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) 132 } 133 return nil, 0, ErrCorrupt 134 } 135 136 if ml == lz4MinMatch+15 { 137 for { 138 if s >= len(src) { 139 if debug { 140 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) 141 } 142 return nil, 0, ErrCorrupt 143 } 144 val := src[s] 145 s++ 146 ml += int(val) 147 if val != 255 { 148 if s >= len(src) { 149 if debug { 150 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) 151 } 152 return nil, 0, ErrCorrupt 153 } 154 break 155 } 156 } 157 } 158 if offset == lastOffset { 159 if debug { 160 fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) 161 } 162 if !inline { 163 d += emitRepeat16(dst[d:], offset, ml) 164 } else { 165 length := ml 166 dst := dst[d:] 167 for len(dst) > 5 { 168 // Repeat offset, make length cheaper 169 length -= 4 170 if length <= 4 { 171 dst[0] = uint8(length)<<2 | tagCopy1 172 dst[1] = 0 173 d += 2 174 break 175 } 176 if length < 8 && offset < 2048 { 177 // Encode WITH offset 178 dst[1] = uint8(offset) 179 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 180 d += 2 181 break 182 } 183 if length < (1<<8)+4 { 184 length -= 4 185 dst[2] = uint8(length) 186 dst[1] = 0 187 dst[0] = 5<<2 | tagCopy1 188 d += 3 189 break 190 } 191 if length < (1<<16)+(1<<8) { 192 length -= 1 << 8 193 dst[3] = uint8(length >> 8) 194 dst[2] = uint8(length >> 0) 195 dst[1] = 0 196 dst[0] = 6<<2 | tagCopy1 197 d += 4 198 break 199 } 200 const maxRepeat = (1 << 24) - 1 201 length -= 1 << 16 202 left := 0 203 if length > maxRepeat { 204 left = length - maxRepeat + 4 205 length = maxRepeat - 4 206 } 207 dst[4] = uint8(length >> 16) 208 dst[3] = uint8(length >> 8) 209 dst[2] = uint8(length >> 0) 210 dst[1] = 0 211 dst[0] = 7<<2 | tagCopy1 212 if left > 0 { 213 d += 5 + emitRepeat16(dst[5:], offset, left) 214 break 215 } 216 d += 5 217 break 218 } 219 } 220 } else { 221 if debug { 222 fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) 223 } 224 if !inline { 225 d += emitCopy16(dst[d:], offset, ml) 226 } else { 227 length := ml 228 dst := dst[d:] 229 for len(dst) > 5 { 230 // Offset no more than 2 bytes. 231 if length > 64 { 232 off := 3 233 if offset < 2048 { 234 // emit 8 bytes as tagCopy1, rest as repeats. 235 dst[1] = uint8(offset) 236 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 237 length -= 8 238 off = 2 239 } else { 240 // Emit a length 60 copy, encoded as 3 bytes. 241 // Emit remaining as repeat value (minimum 4 bytes). 242 dst[2] = uint8(offset >> 8) 243 dst[1] = uint8(offset) 244 dst[0] = 59<<2 | tagCopy2 245 length -= 60 246 } 247 // Emit remaining as repeats, at least 4 bytes remain. 248 d += off + emitRepeat16(dst[off:], offset, length) 249 break 250 } 251 if length >= 12 || offset >= 2048 { 252 // Emit the remaining copy, encoded as 3 bytes. 253 dst[2] = uint8(offset >> 8) 254 dst[1] = uint8(offset) 255 dst[0] = uint8(length-1)<<2 | tagCopy2 256 d += 3 257 break 258 } 259 // Emit the remaining copy, encoded as 2 bytes. 260 dst[1] = uint8(offset) 261 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 262 d += 2 263 break 264 } 265 } 266 lastOffset = offset 267 } 268 uncompressed += ml 269 if d > dLimit { 270 return nil, 0, ErrDstTooSmall 271 } 272 } 273 274 return dst[:d], uncompressed, nil 275 } 276 277 // ConvertBlockSnappy will convert an LZ4 block and append it 278 // as a Snappy block without block length to dst. 279 // The uncompressed size is returned as well. 280 // dst must have capacity to contain the entire compressed block. 281 func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { 282 if len(src) == 0 { 283 return dst, 0, nil 284 } 285 const debug = false 286 const lz4MinMatch = 4 287 288 s, d := 0, len(dst) 289 dst = dst[:cap(dst)] 290 // Use assembly when possible 291 if !debug && hasAmd64Asm { 292 res, sz := cvtLZ4BlockSnappyAsm(dst[d:], src) 293 if res < 0 { 294 const ( 295 errCorrupt = -1 296 errDstTooSmall = -2 297 ) 298 switch res { 299 case errCorrupt: 300 return nil, 0, ErrCorrupt 301 case errDstTooSmall: 302 return nil, 0, ErrDstTooSmall 303 default: 304 return nil, 0, fmt.Errorf("unexpected result: %d", res) 305 } 306 } 307 if d+sz > len(dst) { 308 return nil, 0, ErrDstTooSmall 309 } 310 return dst[:d+sz], res, nil 311 } 312 313 dLimit := len(dst) - 10 314 var uncompressed int 315 if debug { 316 fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) 317 } 318 319 for { 320 if s >= len(src) { 321 return nil, 0, ErrCorrupt 322 } 323 // Read literal info 324 token := src[s] 325 ll := int(token >> 4) 326 ml := int(lz4MinMatch + (token & 0xf)) 327 328 // If upper nibble is 15, literal length is extended 329 if token >= 0xf0 { 330 for { 331 s++ 332 if s >= len(src) { 333 if debug { 334 fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) 335 } 336 return nil, 0, ErrCorrupt 337 } 338 val := src[s] 339 ll += int(val) 340 if val != 255 { 341 break 342 } 343 } 344 } 345 // Skip past token 346 if s+ll >= len(src) { 347 if debug { 348 fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) 349 } 350 return nil, 0, ErrCorrupt 351 } 352 s++ 353 if ll > 0 { 354 if d+ll > dLimit { 355 return nil, 0, ErrDstTooSmall 356 } 357 if debug { 358 fmt.Printf("emit %d literals\n", ll) 359 } 360 d += emitLiteralGo(dst[d:], src[s:s+ll]) 361 s += ll 362 uncompressed += ll 363 } 364 365 // Check if we are done... 366 if s == len(src) && ml == lz4MinMatch { 367 break 368 } 369 // 2 byte offset 370 if s >= len(src)-2 { 371 if debug { 372 fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) 373 } 374 return nil, 0, ErrCorrupt 375 } 376 offset := binary.LittleEndian.Uint16(src[s:]) 377 s += 2 378 if offset == 0 { 379 if debug { 380 fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) 381 } 382 return nil, 0, ErrCorrupt 383 } 384 if int(offset) > uncompressed { 385 if debug { 386 fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) 387 } 388 return nil, 0, ErrCorrupt 389 } 390 391 if ml == lz4MinMatch+15 { 392 for { 393 if s >= len(src) { 394 if debug { 395 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) 396 } 397 return nil, 0, ErrCorrupt 398 } 399 val := src[s] 400 s++ 401 ml += int(val) 402 if val != 255 { 403 if s >= len(src) { 404 if debug { 405 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) 406 } 407 return nil, 0, ErrCorrupt 408 } 409 break 410 } 411 } 412 } 413 if debug { 414 fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) 415 } 416 length := ml 417 // d += emitCopyNoRepeat(dst[d:], int(offset), ml) 418 for length > 0 { 419 if d >= dLimit { 420 return nil, 0, ErrDstTooSmall 421 } 422 423 // Offset no more than 2 bytes. 424 if length > 64 { 425 // Emit a length 64 copy, encoded as 3 bytes. 426 dst[d+2] = uint8(offset >> 8) 427 dst[d+1] = uint8(offset) 428 dst[d+0] = 63<<2 | tagCopy2 429 length -= 64 430 d += 3 431 continue 432 } 433 if length >= 12 || offset >= 2048 || length < 4 { 434 // Emit the remaining copy, encoded as 3 bytes. 435 dst[d+2] = uint8(offset >> 8) 436 dst[d+1] = uint8(offset) 437 dst[d+0] = uint8(length-1)<<2 | tagCopy2 438 d += 3 439 break 440 } 441 // Emit the remaining copy, encoded as 2 bytes. 442 dst[d+1] = uint8(offset) 443 dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 444 d += 2 445 break 446 } 447 uncompressed += ml 448 if d > dLimit { 449 return nil, 0, ErrDstTooSmall 450 } 451 } 452 453 return dst[:d], uncompressed, nil 454 } 455 456 // emitRepeat writes a repeat chunk and returns the number of bytes written. 457 // Length must be at least 4 and < 1<<24 458 func emitRepeat16(dst []byte, offset uint16, length int) int { 459 // Repeat offset, make length cheaper 460 length -= 4 461 if length <= 4 { 462 dst[0] = uint8(length)<<2 | tagCopy1 463 dst[1] = 0 464 return 2 465 } 466 if length < 8 && offset < 2048 { 467 // Encode WITH offset 468 dst[1] = uint8(offset) 469 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 470 return 2 471 } 472 if length < (1<<8)+4 { 473 length -= 4 474 dst[2] = uint8(length) 475 dst[1] = 0 476 dst[0] = 5<<2 | tagCopy1 477 return 3 478 } 479 if length < (1<<16)+(1<<8) { 480 length -= 1 << 8 481 dst[3] = uint8(length >> 8) 482 dst[2] = uint8(length >> 0) 483 dst[1] = 0 484 dst[0] = 6<<2 | tagCopy1 485 return 4 486 } 487 const maxRepeat = (1 << 24) - 1 488 length -= 1 << 16 489 left := 0 490 if length > maxRepeat { 491 left = length - maxRepeat + 4 492 length = maxRepeat - 4 493 } 494 dst[4] = uint8(length >> 16) 495 dst[3] = uint8(length >> 8) 496 dst[2] = uint8(length >> 0) 497 dst[1] = 0 498 dst[0] = 7<<2 | tagCopy1 499 if left > 0 { 500 return 5 + emitRepeat16(dst[5:], offset, left) 501 } 502 return 5 503 } 504 505 // emitCopy writes a copy chunk and returns the number of bytes written. 506 // 507 // It assumes that: 508 // 509 // dst is long enough to hold the encoded bytes 510 // 1 <= offset && offset <= math.MaxUint16 511 // 4 <= length && length <= math.MaxUint32 512 func emitCopy16(dst []byte, offset uint16, length int) int { 513 // Offset no more than 2 bytes. 514 if length > 64 { 515 off := 3 516 if offset < 2048 { 517 // emit 8 bytes as tagCopy1, rest as repeats. 518 dst[1] = uint8(offset) 519 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 520 length -= 8 521 off = 2 522 } else { 523 // Emit a length 60 copy, encoded as 3 bytes. 524 // Emit remaining as repeat value (minimum 4 bytes). 525 dst[2] = uint8(offset >> 8) 526 dst[1] = uint8(offset) 527 dst[0] = 59<<2 | tagCopy2 528 length -= 60 529 } 530 // Emit remaining as repeats, at least 4 bytes remain. 531 return off + emitRepeat16(dst[off:], offset, length) 532 } 533 if length >= 12 || offset >= 2048 { 534 // Emit the remaining copy, encoded as 3 bytes. 535 dst[2] = uint8(offset >> 8) 536 dst[1] = uint8(offset) 537 dst[0] = uint8(length-1)<<2 | tagCopy2 538 return 3 539 } 540 // Emit the remaining copy, encoded as 2 bytes. 541 dst[1] = uint8(offset) 542 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 543 return 2 544 } 545 546 // emitLiteral writes a literal chunk and returns the number of bytes written. 547 // 548 // It assumes that: 549 // 550 // dst is long enough to hold the encoded bytes 551 // 0 <= len(lit) && len(lit) <= math.MaxUint32 552 func emitLiteralGo(dst, lit []byte) int { 553 if len(lit) == 0 { 554 return 0 555 } 556 i, n := 0, uint(len(lit)-1) 557 switch { 558 case n < 60: 559 dst[0] = uint8(n)<<2 | tagLiteral 560 i = 1 561 case n < 1<<8: 562 dst[1] = uint8(n) 563 dst[0] = 60<<2 | tagLiteral 564 i = 2 565 case n < 1<<16: 566 dst[2] = uint8(n >> 8) 567 dst[1] = uint8(n) 568 dst[0] = 61<<2 | tagLiteral 569 i = 3 570 case n < 1<<24: 571 dst[3] = uint8(n >> 16) 572 dst[2] = uint8(n >> 8) 573 dst[1] = uint8(n) 574 dst[0] = 62<<2 | tagLiteral 575 i = 4 576 default: 577 dst[4] = uint8(n >> 24) 578 dst[3] = uint8(n >> 16) 579 dst[2] = uint8(n >> 8) 580 dst[1] = uint8(n) 581 dst[0] = 63<<2 | tagLiteral 582 i = 5 583 } 584 return i + copy(dst[i:], lit) 585 }