s2.go (4732B)
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package s2 implements the S2 compression format.
//
// S2 is an extension of Snappy. Similar to Snappy, S2 is aimed at high throughput,
// which is why it features concurrent compression for bigger payloads.
//
// Decoding is compatible with Snappy compressed content,
// but content compressed with S2 cannot be decompressed by Snappy.
//
// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
//
// There are actually two S2 formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as an S2 stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// A "better" compression option is available. This will trade some compression
// speed for a better compression ratio.
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// Blocks do not offer much data protection, so it is up to you to
// add data validation of decompressed blocks.
//
// Streams perform CRC validation of the decompressed data.
// Stream compression will also be performed on multiple CPU cores concurrently,
// significantly improving throughput.
package s2

import (
	"bytes"
	"hash/crc32"
)

/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries.
*/
/*
The first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.

For literal tags:
  - If m < 60, the next 1 + m bytes are literal bytes.
  - Otherwise, let n be the little-endian unsigned integer denoted by the next
    m - 59 bytes. The next 1 + n bytes after that are literal bytes.

For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
    of the offset. The next byte is bits 0-7 of the offset.
  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
    The length is 1 + m. The offset is the little-endian unsigned integer
    denoted by the next 2 bytes.
  - For l == 3, the offset ranges in [0, 1<<32) and the length in
    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
    integer denoted by the next 4 bytes.
*/
const (
	tagLiteral = 0x00
	tagCopy1   = 0x01
	tagCopy2   = 0x02
	tagCopy4   = 0x03
)

const (
	checksumSize     = 4
	chunkHeaderSize  = 4
	magicChunk       = "\xff\x06\x00\x00" + magicBody
	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
	magicBodySnappy  = "sNaPpY"
	magicBody        = "S2sTwO"

	// maxBlockSize is the maximum size of the input to encodeBlock.
	//
	// For the framing format (Writer type instead of Encode function),
	// this is the maximum uncompressed size of a block.
	maxBlockSize = 4 << 20

	// minBlockSize is the minimum size of block setting when creating a writer.
	minBlockSize = 4 << 10

	skippableFrameHeader = 4
	maxChunkSize         = 1<<24 - 1 // 16777215

	// Default block size
	defaultBlockSize = 1 << 20

	// maxSnappyBlockSize is the maximum snappy block size.
	maxSnappyBlockSize = 1 << 16

	// obufHeaderLen is the combined size of a checksum and a chunk header.
	obufHeaderLen = checksumSize + chunkHeaderSize
)

// Chunk type identifiers. ChunkTypeIndex is the only exported value in this
// group; the remaining identifiers are internal to the package.
const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	ChunkTypeIndex            = 0x99
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff
)

// crcTable is the CRC-32 table for the Castagnoli polynomial, built once at
// package initialization and shared by every crc call.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// It computes the CRC-32 (Castagnoli) of b, rotates the result right by
// 15 bits and adds the constant 0xa282ead8 (wrapping on uint32 overflow).
func crc(b []byte) uint32 {
	c := crc32.Update(0, crcTable, b)
	// In Go, | and + share the same precedence and associate left to right,
	// so the rotate is evaluated before the addition. The parentheses only
	// make that order explicit.
	return (c>>15 | c<<17) + 0xa282ead8
}

// literalExtraSize returns the extra size of encoding n literals.
// n should be >= 0 and <= math.MaxUint32.
//
// The result is 0 for n == 0 and otherwise grows from 1 to 5 as n crosses
// the thresholds 60, 1<<8, 1<<16 and 1<<24.
func literalExtraSize(n int64) int64 {
	switch {
	case n == 0:
		return 0
	case n < 60:
		return 1
	case n < 1<<8:
		return 2
	case n < 1<<16:
		return 3
	case n < 1<<24:
		return 4
	default:
		return 5
	}
}

// byter is satisfied by any value that can expose its contents as a byte slice.
type byter interface {
	Bytes() []byte
}

// Compile-time check that *bytes.Buffer satisfies byter.
var _ byter = (*bytes.Buffer)(nil)