token.go (11051B)
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package flate 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "fmt" 11 "io" 12 "math" 13 ) 14 15 const ( 16 // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits 17 // bits 16-22 offsetcode - 5 bits 18 // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits 19 // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits 20 lengthShift = 22 21 offsetMask = 1<<lengthShift - 1 22 typeMask = 3 << 30 23 literalType = 0 << 30 24 matchType = 1 << 30 25 matchOffsetOnlyMask = 0xffff 26 ) 27 28 // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) 29 // is lengthCodes[length - MIN_MATCH_LENGTH] 30 var lengthCodes = [256]uint8{ 31 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 32 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 33 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 34 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 35 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 36 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 37 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 38 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 39 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 40 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 41 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 42 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 43 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 44 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 45 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 46 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 47 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 48 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 49 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 50 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 51 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 52 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 53 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 54 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 55 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 56 27, 27, 27, 27, 27, 28, 57 } 58 59 // lengthCodes1 is length codes, but starting at 1. 60 var lengthCodes1 = [256]uint8{ 61 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 62 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, 63 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 64 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 65 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 66 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 67 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 68 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 69 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 70 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 71 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 72 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 73 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 74 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 75 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 76 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 77 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 78 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 79 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 80 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 81 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 82 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 83 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 84 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 85 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 86 28, 28, 28, 28, 28, 29, 87 } 88 89 var offsetCodes = [256]uint32{ 90 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 91 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 92 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 93 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 94 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 95 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 96 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 97 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 98 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 99 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 100 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 101 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 102 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 103 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 104 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 105 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 106 } 107 108 // offsetCodes14 are offsetCodes, but with 14 added. 109 var offsetCodes14 = [256]uint32{ 110 14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 111 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 112 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 113 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 114 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 115 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 116 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 117 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 118 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 119 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 120 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 121 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 122 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 123 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 124 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 125 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 126 } 127 128 type token uint32 129 130 type tokens struct { 131 extraHist [32]uint16 // codes 256->maxnumlit 132 offHist [32]uint16 // offset codes 133 litHist [256]uint16 // codes 0->255 134 nFilled int 135 n uint16 // Must be able to contain maxStoreBlockSize 136 tokens [maxStoreBlockSize + 1]token 137 } 138 139 func (t *tokens) Reset() { 140 if t.n == 0 { 141 return 142 } 143 t.n = 0 144 t.nFilled = 0 145 for i := range t.litHist[:] { 146 t.litHist[i] = 0 147 } 148 for i := range t.extraHist[:] { 149 t.extraHist[i] = 0 150 } 151 for i := range t.offHist[:] { 152 t.offHist[i] = 0 153 } 154 } 155 156 func (t *tokens) Fill() { 157 if t.n == 0 { 158 return 159 } 160 for i, v := range t.litHist[:] { 161 if v == 0 { 162 t.litHist[i] = 1 163 t.nFilled++ 164 } 165 } 166 for i, v := range t.extraHist[:literalCount-256] { 167 if v == 0 { 168 t.nFilled++ 169 t.extraHist[i] = 1 170 } 171 } 172 for i, v := range t.offHist[:offsetCodeCount] { 173 if v == 0 { 174 t.offHist[i] = 1 175 } 176 } 177 } 178 179 func indexTokens(in []token) tokens { 180 var t tokens 181 t.indexTokens(in) 182 return t 183 } 184 185 func (t *tokens) indexTokens(in []token) { 186 t.Reset() 187 for _, tok := range in { 188 if tok < matchType { 189 t.AddLiteral(tok.literal()) 190 continue 191 } 192 t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) 193 } 194 } 195 196 // emitLiteral writes a literal chunk and returns the number of bytes written. 197 func emitLiteral(dst *tokens, lit []byte) { 198 for _, v := range lit { 199 dst.tokens[dst.n] = token(v) 200 dst.litHist[v]++ 201 dst.n++ 202 } 203 } 204 205 func (t *tokens) AddLiteral(lit byte) { 206 t.tokens[t.n] = token(lit) 207 t.litHist[lit]++ 208 t.n++ 209 } 210 211 // from https://stackoverflow.com/a/28730362 212 func mFastLog2(val float32) float32 { 213 ux := int32(math.Float32bits(val)) 214 log2 := (float32)(((ux >> 23) & 255) - 128) 215 ux &= -0x7f800001 216 ux += 127 << 23 217 uval := math.Float32frombits(uint32(ux)) 218 log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 219 return log2 220 } 221 222 // EstimatedBits will return an minimum size estimated by an *optimal* 223 // compression of the block. 224 // The size of the block 225 func (t *tokens) EstimatedBits() int { 226 shannon := float32(0) 227 bits := int(0) 228 nMatches := 0 229 total := int(t.n) + t.nFilled 230 if total > 0 { 231 invTotal := 1.0 / float32(total) 232 for _, v := range t.litHist[:] { 233 if v > 0 { 234 n := float32(v) 235 shannon += atLeastOne(-mFastLog2(n*invTotal)) * n 236 } 237 } 238 // Just add 15 for EOB 239 shannon += 15 240 for i, v := range t.extraHist[1 : literalCount-256] { 241 if v > 0 { 242 n := float32(v) 243 shannon += atLeastOne(-mFastLog2(n*invTotal)) * n 244 bits += int(lengthExtraBits[i&31]) * int(v) 245 nMatches += int(v) 246 } 247 } 248 } 249 if nMatches > 0 { 250 invTotal := 1.0 / float32(nMatches) 251 for i, v := range t.offHist[:offsetCodeCount] { 252 if v > 0 { 253 n := float32(v) 254 shannon += atLeastOne(-mFastLog2(n*invTotal)) * n 255 bits += int(offsetExtraBits[i&31]) * int(v) 256 } 257 } 258 } 259 return int(shannon) + bits 260 } 261 262 // AddMatch adds a match to the tokens. 263 // This function is very sensitive to inlining and right on the border. 264 func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { 265 if debugDeflate { 266 if xlength >= maxMatchLength+baseMatchLength { 267 panic(fmt.Errorf("invalid length: %v", xlength)) 268 } 269 if xoffset >= maxMatchOffset+baseMatchOffset { 270 panic(fmt.Errorf("invalid offset: %v", xoffset)) 271 } 272 } 273 oCode := offsetCode(xoffset) 274 xoffset |= oCode << 16 275 276 t.extraHist[lengthCodes1[uint8(xlength)]]++ 277 t.offHist[oCode&31]++ 278 t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) 279 t.n++ 280 } 281 282 // AddMatchLong adds a match to the tokens, potentially longer than max match length. 283 // Length should NOT have the base subtracted, only offset should. 284 func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { 285 if debugDeflate { 286 if xoffset >= maxMatchOffset+baseMatchOffset { 287 panic(fmt.Errorf("invalid offset: %v", xoffset)) 288 } 289 } 290 oc := offsetCode(xoffset) 291 xoffset |= oc << 16 292 for xlength > 0 { 293 xl := xlength 294 if xl > 258 { 295 // We need to have at least baseMatchLength left over for next loop. 296 if xl > 258+baseMatchLength { 297 xl = 258 298 } else { 299 xl = 258 - baseMatchLength 300 } 301 } 302 xlength -= xl 303 xl -= baseMatchLength 304 t.extraHist[lengthCodes1[uint8(xl)]]++ 305 t.offHist[oc&31]++ 306 t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) 307 t.n++ 308 } 309 } 310 311 func (t *tokens) AddEOB() { 312 t.tokens[t.n] = token(endBlockMarker) 313 t.extraHist[0]++ 314 t.n++ 315 } 316 317 func (t *tokens) Slice() []token { 318 return t.tokens[:t.n] 319 } 320 321 // VarInt returns the tokens as varint encoded bytes. 322 func (t *tokens) VarInt() []byte { 323 var b = make([]byte, binary.MaxVarintLen32*int(t.n)) 324 var off int 325 for _, v := range t.tokens[:t.n] { 326 off += binary.PutUvarint(b[off:], uint64(v)) 327 } 328 return b[:off] 329 } 330 331 // FromVarInt restores t to the varint encoded tokens provided. 332 // Any data in t is removed. 333 func (t *tokens) FromVarInt(b []byte) error { 334 var buf = bytes.NewReader(b) 335 var toks []token 336 for { 337 r, err := binary.ReadUvarint(buf) 338 if err == io.EOF { 339 break 340 } 341 if err != nil { 342 return err 343 } 344 toks = append(toks, token(r)) 345 } 346 t.indexTokens(toks) 347 return nil 348 } 349 350 // Returns the type of a token 351 func (t token) typ() uint32 { return uint32(t) & typeMask } 352 353 // Returns the literal of a literal token 354 func (t token) literal() uint8 { return uint8(t) } 355 356 // Returns the extra offset of a match token 357 func (t token) offset() uint32 { return uint32(t) & offsetMask } 358 359 func (t token) length() uint8 { return uint8(t >> lengthShift) } 360 361 // Convert length to code. 362 func lengthCode(len uint8) uint8 { return lengthCodes[len] } 363 364 // Returns the offset code corresponding to a specific offset 365 func offsetCode(off uint32) uint32 { 366 if false { 367 if off < uint32(len(offsetCodes)) { 368 return offsetCodes[off&255] 369 } else if off>>7 < uint32(len(offsetCodes)) { 370 return offsetCodes[(off>>7)&255] + 14 371 } else { 372 return offsetCodes[(off>>14)&255] + 28 373 } 374 } 375 if off < uint32(len(offsetCodes)) { 376 return offsetCodes[uint8(off)] 377 } 378 return offsetCodes14[uint8(off>>7)] 379 }