dict.go (9096B)
// Copyright (c) 2022+ Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package s2

import (
	"bytes"
	"encoding/binary"
	"sync"
)

const (
	// MinDictSize is the minimum dictionary size when repeat has been read.
	MinDictSize = 16

	// MaxDictSize is the maximum dictionary size when repeat has been read.
	MaxDictSize = 65536

	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
	MaxDictSrcOffset = 65535
)

// Dict contains a dictionary that can be used for encoding and decoding s2 blocks.
type Dict struct {
	dict   []byte
	repeat int // Repeat as index of dict

	fast, better, best sync.Once
	fastTable          *[1 << 14]uint16

	betterTableShort *[1 << 14]uint16
	betterTableLong  *[1 << 17]uint16

	bestTableShort *[1 << 16]uint32
	bestTableLong  *[1 << 19]uint32
}

// NewDict will read a dictionary.
// It will return nil if the dictionary is invalid.
func NewDict(dict []byte) *Dict {
	if len(dict) == 0 {
		return nil
	}
	var d Dict
	// Repeat is the first value of the dict.
	r, n := binary.Uvarint(dict)
	if n <= 0 {
		return nil
	}
	dict = dict[n:]
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
		return nil
	}
	d.repeat = int(r)
	if d.repeat > len(dict) {
		return nil
	}
	return &d
}

// Bytes will return a serialized version of the dictionary.
// The output can be sent to NewDict.
func (d *Dict) Bytes() []byte {
	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
}

// MakeDict will create a dictionary.
// 'data' must be at least MinDictSize.
// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
// If searchStart is set, the start repeat value will be set to the last
// match of this content.
// If no matches are found, it will attempt to find shorter matches.
// This content should match the typical start of a block.
// If at least 4 bytes cannot be matched, repeat is set to start of block.
func MakeDict(data []byte, searchStart []byte) *Dict {
	if len(data) == 0 {
		return nil
	}
	if len(data) > MaxDictSize {
		data = data[len(data)-MaxDictSize:]
	}
	var d Dict
	dict := data
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize {
		return nil
	}

	// Find the longest match possible, last entry if multiple.
	for s := len(searchStart); s > 4; s-- {
		if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
			d.repeat = idx
			break
		}
	}

	return &d
}
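// The sketch below ties the functions above together: build a dictionary from
// representative sample data with MakeDict, serialize it with Bytes, and
// restore it with NewDict. It is an illustrative addition, not part of the
// original file; the helper name is made up and error handling is reduced to
// nil checks.
func exampleDictSerialization(sample []byte) *Dict {
	// MakeDict returns nil if the sample is unusable, e.g. shorter than MinDictSize.
	d := MakeDict(sample, nil)
	if d == nil {
		return nil
	}
	// Serialize the dictionary for storage or transmission.
	serialized := d.Bytes()
	// Restore it later; NewDict returns nil if the serialized form is invalid.
	return NewDict(serialized)
}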
// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks will require the same amount of memory to decode as to encode,
// and do not allow for concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
func (d *Dict) Encode(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if cap(dst) < n {
		dst = make([]byte, n)
	} else {
		dst = dst[:n]
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockDictGo(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}

// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// EncodeBetter compresses better than Encode but typically with a
// 10-40% speed decrease on both compression and decompression.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks will require the same amount of memory to decode as to encode,
// and do not allow for concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
func (d *Dict) EncodeBetter(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if len(dst) < n {
		dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockBetterDict(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}
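// A minimal sketch of block encoding with a dictionary, assuming a *Dict built
// elsewhere (for example with MakeDict). The helper is illustrative and not
// part of the original file. Passing a dst with len(dst) >= MaxEncodedLen(len(src))
// lets both encoders reuse it instead of allocating.
func exampleDictEncode(d *Dict, dst, src []byte, better bool) []byte {
	if better {
		// Smaller output, typically 10-40% slower to encode and decode.
		return d.EncodeBetter(dst, src)
	}
	// Fastest dictionary mode.
	return d.Encode(dst, src)
}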
// EncodeBest returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// EncodeBest compresses as well as reasonably possible but with a
// big speed decrease.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks will require the same amount of memory to decode as to encode,
// and do not allow for concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
func (d *Dict) EncodeBest(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if len(dst) < n {
		dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockBest(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}

// Decode returns the decoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire decoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
	dLen, s, err := decodedLen(src)
	if err != nil {
		return nil, err
	}
	if dLen <= cap(dst) {
		dst = dst[:dLen]
	} else {
		dst = make([]byte, dLen)
	}
	if s2DecodeDict(dst, src[s:], d) != 0 {
		return nil, ErrCorrupt
	}
	return dst, nil
}

func (d *Dict) initFast() {
	d.fast.Do(func() {
		const (
			tableBits    = 14
			maxTableSize = 1 << tableBits
		)

		var table [maxTableSize]uint16
		// We stop so any entry of length 8 can always be read.
		for i := 0; i < len(d.dict)-8-2; i += 3 {
			x0 := load64(d.dict, i)
			h0 := hash6(x0, tableBits)
			h1 := hash6(x0>>8, tableBits)
			h2 := hash6(x0>>16, tableBits)
			table[h0] = uint16(i)
			table[h1] = uint16(i + 1)
			table[h2] = uint16(i + 2)
		}
		d.fastTable = &table
	})
}

func (d *Dict) initBetter() {
	d.better.Do(func() {
		const (
			// Long hash matches.
			lTableBits    = 17
			maxLTableSize = 1 << lTableBits

			// Short hash matches.
			sTableBits    = 14
			maxSTableSize = 1 << sTableBits
		)

		var lTable [maxLTableSize]uint16
		var sTable [maxSTableSize]uint16

		// We stop so any entry of length 8 can always be read.
		for i := 0; i < len(d.dict)-8; i++ {
			cv := load64(d.dict, i)
			lTable[hash7(cv, lTableBits)] = uint16(i)
			sTable[hash4(cv, sTableBits)] = uint16(i)
		}
		d.betterTableShort = &sTable
		d.betterTableLong = &lTable
	})
}

func (d *Dict) initBest() {
	d.best.Do(func() {
		const (
			// Long hash matches.
			lTableBits    = 19
			maxLTableSize = 1 << lTableBits

			// Short hash matches.
			sTableBits    = 16
			maxSTableSize = 1 << sTableBits
		)

		var lTable [maxLTableSize]uint32
		var sTable [maxSTableSize]uint32

		// We stop so any entry of length 8 can always be read.
		for i := 0; i < len(d.dict)-8; i++ {
			cv := load64(d.dict, i)
			hashL := hash8(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL := lTable[hashL]
			candidateS := sTable[hashS]
			lTable[hashL] = uint32(i) | candidateL<<16
			sTable[hashS] = uint32(i) | candidateS<<16
		}
		d.bestTableShort = &sTable
		d.bestTableLong = &lTable
	})
}
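// A round-trip sketch tying together the dictionary encoders and Decode above:
// encode a block with the dictionary and decode it again, verifying the result
// with the already imported bytes package. The helper is illustrative and not
// part of the original file.
func exampleDictRoundTrip(d *Dict, src []byte) bool {
	// A block encoded with a dictionary generally cannot be decoded correctly
	// without the same dictionary on the decoding side.
	encoded := d.EncodeBest(nil, src)
	decoded, err := d.Decode(nil, encoded)
	if err != nil {
		return false
	}
	return bytes.Equal(decoded, src)
}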