gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

dict.go (9096B)


      1 // Copyright (c) 2022+ Klaus Post. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package s2
      6 
      7 import (
      8 	"bytes"
      9 	"encoding/binary"
     10 	"sync"
     11 )
     12 
     13 const (
     14 	// MinDictSize is the minimum dictionary size when repeat has been read.
     15 	MinDictSize = 16
     16 
     17 	// MaxDictSize is the maximum dictionary size when repeat has been read.
     18 	MaxDictSize = 65536
     19 
     20 	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
     21 	MaxDictSrcOffset = 65535
     22 )
     23 
     24 // Dict contains a dictionary that can be used for encoding and decoding s2
     25 type Dict struct {
     26 	dict   []byte
     27 	repeat int // Repeat as index of dict
     28 
     29 	fast, better, best sync.Once
     30 	fastTable          *[1 << 14]uint16
     31 
     32 	betterTableShort *[1 << 14]uint16
     33 	betterTableLong  *[1 << 17]uint16
     34 
     35 	bestTableShort *[1 << 16]uint32
     36 	bestTableLong  *[1 << 19]uint32
     37 }
     38 
     39 // NewDict will read a dictionary.
     40 // It will return nil if the dictionary is invalid.
     41 func NewDict(dict []byte) *Dict {
     42 	if len(dict) == 0 {
     43 		return nil
     44 	}
     45 	var d Dict
     46 	// Repeat is the first value of the dict
     47 	r, n := binary.Uvarint(dict)
     48 	if n <= 0 {
     49 		return nil
     50 	}
     51 	dict = dict[n:]
     52 	d.dict = dict
     53 	if cap(d.dict) < len(d.dict)+16 {
     54 		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
     55 	}
     56 	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
     57 		return nil
     58 	}
     59 	d.repeat = int(r)
     60 	if d.repeat > len(dict) {
     61 		return nil
     62 	}
     63 	return &d
     64 }
     65 
     66 // Bytes will return a serialized version of the dictionary.
     67 // The output can be sent to NewDict.
     68 func (d *Dict) Bytes() []byte {
     69 	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
     70 	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
     71 }
     72 
     73 // MakeDict will create a dictionary.
     74 // 'data' must be at least MinDictSize.
     75 // If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
     76 // If searchStart is set the start repeat value will be set to the last
     77 // match of this content.
     78 // If no matches are found, it will attempt to find shorter matches.
     79 // This content should match the typical start of a block.
     80 // If at least 4 bytes cannot be matched, repeat is set to start of block.
     81 func MakeDict(data []byte, searchStart []byte) *Dict {
     82 	if len(data) == 0 {
     83 		return nil
     84 	}
     85 	if len(data) > MaxDictSize {
     86 		data = data[len(data)-MaxDictSize:]
     87 	}
     88 	var d Dict
     89 	dict := data
     90 	d.dict = dict
     91 	if cap(d.dict) < len(d.dict)+16 {
     92 		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
     93 	}
     94 	if len(dict) < MinDictSize {
     95 		return nil
     96 	}
     97 
     98 	// Find the longest match possible, last entry if multiple.
     99 	for s := len(searchStart); s > 4; s-- {
    100 		if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
    101 			d.repeat = idx
    102 			break
    103 		}
    104 	}
    105 
    106 	return &d
    107 }
    108 
    109 // Encode returns the encoded form of src. The returned slice may be a sub-
    110 // slice of dst if dst was large enough to hold the entire encoded block.
    111 // Otherwise, a newly allocated slice will be returned.
    112 //
    113 // The dst and src must not overlap. It is valid to pass a nil dst.
    114 //
    115 // The blocks will require the same amount of memory to decode as encoding,
    116 // and does not make for concurrent decoding.
    117 // Also note that blocks do not contain CRC information, so corruption may be undetected.
    118 //
    119 // If you need to encode larger amounts of data, consider using
    120 // the streaming interface which gives all of these features.
    121 func (d *Dict) Encode(dst, src []byte) []byte {
    122 	if n := MaxEncodedLen(len(src)); n < 0 {
    123 		panic(ErrTooLarge)
    124 	} else if cap(dst) < n {
    125 		dst = make([]byte, n)
    126 	} else {
    127 		dst = dst[:n]
    128 	}
    129 
    130 	// The block starts with the varint-encoded length of the decompressed bytes.
    131 	dstP := binary.PutUvarint(dst, uint64(len(src)))
    132 
    133 	if len(src) == 0 {
    134 		return dst[:dstP]
    135 	}
    136 	if len(src) < minNonLiteralBlockSize {
    137 		dstP += emitLiteral(dst[dstP:], src)
    138 		return dst[:dstP]
    139 	}
    140 	n := encodeBlockDictGo(dst[dstP:], src, d)
    141 	if n > 0 {
    142 		dstP += n
    143 		return dst[:dstP]
    144 	}
    145 	// Not compressible
    146 	dstP += emitLiteral(dst[dstP:], src)
    147 	return dst[:dstP]
    148 }
    149 
    150 // EncodeBetter returns the encoded form of src. The returned slice may be a sub-
    151 // slice of dst if dst was large enough to hold the entire encoded block.
    152 // Otherwise, a newly allocated slice will be returned.
    153 //
    154 // EncodeBetter compresses better than Encode but typically with a
    155 // 10-40% speed decrease on both compression and decompression.
    156 //
    157 // The dst and src must not overlap. It is valid to pass a nil dst.
    158 //
    159 // The blocks will require the same amount of memory to decode as encoding,
    160 // and does not make for concurrent decoding.
    161 // Also note that blocks do not contain CRC information, so corruption may be undetected.
    162 //
    163 // If you need to encode larger amounts of data, consider using
    164 // the streaming interface which gives all of these features.
    165 func (d *Dict) EncodeBetter(dst, src []byte) []byte {
    166 	if n := MaxEncodedLen(len(src)); n < 0 {
    167 		panic(ErrTooLarge)
    168 	} else if len(dst) < n {
    169 		dst = make([]byte, n)
    170 	}
    171 
    172 	// The block starts with the varint-encoded length of the decompressed bytes.
    173 	dstP := binary.PutUvarint(dst, uint64(len(src)))
    174 
    175 	if len(src) == 0 {
    176 		return dst[:dstP]
    177 	}
    178 	if len(src) < minNonLiteralBlockSize {
    179 		dstP += emitLiteral(dst[dstP:], src)
    180 		return dst[:dstP]
    181 	}
    182 	n := encodeBlockBetterDict(dst[dstP:], src, d)
    183 	if n > 0 {
    184 		dstP += n
    185 		return dst[:dstP]
    186 	}
    187 	// Not compressible
    188 	dstP += emitLiteral(dst[dstP:], src)
    189 	return dst[:dstP]
    190 }
    191 
    192 // EncodeBest returns the encoded form of src. The returned slice may be a sub-
    193 // slice of dst if dst was large enough to hold the entire encoded block.
    194 // Otherwise, a newly allocated slice will be returned.
    195 //
    196 // EncodeBest compresses as good as reasonably possible but with a
    197 // big speed decrease.
    198 //
    199 // The dst and src must not overlap. It is valid to pass a nil dst.
    200 //
    201 // The blocks will require the same amount of memory to decode as encoding,
    202 // and does not make for concurrent decoding.
    203 // Also note that blocks do not contain CRC information, so corruption may be undetected.
    204 //
    205 // If you need to encode larger amounts of data, consider using
    206 // the streaming interface which gives all of these features.
    207 func (d *Dict) EncodeBest(dst, src []byte) []byte {
    208 	if n := MaxEncodedLen(len(src)); n < 0 {
    209 		panic(ErrTooLarge)
    210 	} else if len(dst) < n {
    211 		dst = make([]byte, n)
    212 	}
    213 
    214 	// The block starts with the varint-encoded length of the decompressed bytes.
    215 	dstP := binary.PutUvarint(dst, uint64(len(src)))
    216 
    217 	if len(src) == 0 {
    218 		return dst[:dstP]
    219 	}
    220 	if len(src) < minNonLiteralBlockSize {
    221 		dstP += emitLiteral(dst[dstP:], src)
    222 		return dst[:dstP]
    223 	}
    224 	n := encodeBlockBest(dst[dstP:], src, d)
    225 	if n > 0 {
    226 		dstP += n
    227 		return dst[:dstP]
    228 	}
    229 	// Not compressible
    230 	dstP += emitLiteral(dst[dstP:], src)
    231 	return dst[:dstP]
    232 }
    233 
    234 // Decode returns the decoded form of src. The returned slice may be a sub-
    235 // slice of dst if dst was large enough to hold the entire decoded block.
    236 // Otherwise, a newly allocated slice will be returned.
    237 //
    238 // The dst and src must not overlap. It is valid to pass a nil dst.
    239 func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
    240 	dLen, s, err := decodedLen(src)
    241 	if err != nil {
    242 		return nil, err
    243 	}
    244 	if dLen <= cap(dst) {
    245 		dst = dst[:dLen]
    246 	} else {
    247 		dst = make([]byte, dLen)
    248 	}
    249 	if s2DecodeDict(dst, src[s:], d) != 0 {
    250 		return nil, ErrCorrupt
    251 	}
    252 	return dst, nil
    253 }
    254 
    255 func (d *Dict) initFast() {
    256 	d.fast.Do(func() {
    257 		const (
    258 			tableBits    = 14
    259 			maxTableSize = 1 << tableBits
    260 		)
    261 
    262 		var table [maxTableSize]uint16
    263 		// We stop so any entry of length 8 can always be read.
    264 		for i := 0; i < len(d.dict)-8-2; i += 3 {
    265 			x0 := load64(d.dict, i)
    266 			h0 := hash6(x0, tableBits)
    267 			h1 := hash6(x0>>8, tableBits)
    268 			h2 := hash6(x0>>16, tableBits)
    269 			table[h0] = uint16(i)
    270 			table[h1] = uint16(i + 1)
    271 			table[h2] = uint16(i + 2)
    272 		}
    273 		d.fastTable = &table
    274 	})
    275 }
    276 
    277 func (d *Dict) initBetter() {
    278 	d.better.Do(func() {
    279 		const (
    280 			// Long hash matches.
    281 			lTableBits    = 17
    282 			maxLTableSize = 1 << lTableBits
    283 
    284 			// Short hash matches.
    285 			sTableBits    = 14
    286 			maxSTableSize = 1 << sTableBits
    287 		)
    288 
    289 		var lTable [maxLTableSize]uint16
    290 		var sTable [maxSTableSize]uint16
    291 
    292 		// We stop so any entry of length 8 can always be read.
    293 		for i := 0; i < len(d.dict)-8; i++ {
    294 			cv := load64(d.dict, i)
    295 			lTable[hash7(cv, lTableBits)] = uint16(i)
    296 			sTable[hash4(cv, sTableBits)] = uint16(i)
    297 		}
    298 		d.betterTableShort = &sTable
    299 		d.betterTableLong = &lTable
    300 	})
    301 }
    302 
    303 func (d *Dict) initBest() {
    304 	d.best.Do(func() {
    305 		const (
    306 			// Long hash matches.
    307 			lTableBits    = 19
    308 			maxLTableSize = 1 << lTableBits
    309 
    310 			// Short hash matches.
    311 			sTableBits    = 16
    312 			maxSTableSize = 1 << sTableBits
    313 		)
    314 
    315 		var lTable [maxLTableSize]uint32
    316 		var sTable [maxSTableSize]uint32
    317 
    318 		// We stop so any entry of length 8 can always be read.
    319 		for i := 0; i < len(d.dict)-8; i++ {
    320 			cv := load64(d.dict, i)
    321 			hashL := hash8(cv, lTableBits)
    322 			hashS := hash4(cv, sTableBits)
    323 			candidateL := lTable[hashL]
    324 			candidateS := sTable[hashS]
    325 			lTable[hashL] = uint32(i) | candidateL<<16
    326 			sTable[hashS] = uint32(i) | candidateS<<16
    327 		}
    328 		d.bestTableShort = &sTable
    329 		d.bestTableLong = &lTable
    330 	})
    331 }