gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

lexicon.go (5371B)


      1 // Copyright 2020 Google Inc. All rights reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 package s2
     16 
     17 import (
     18 	"encoding/binary"
     19 	"hash/adler32"
     20 	"math"
     21 	"sort"
     22 )
     23 
     24 // TODO(roberts): If any of these are worth making public, change the
     25 // method signatures and type names.
     26 
     27 // emptySetID represents the last ID that will ever be generated.
     28 // (Non-negative IDs are reserved for singleton sets.)
     29 var emptySetID = int32(math.MinInt32)
     30 
     31 // idSetLexicon compactly represents a set of non-negative
     32 // integers such as array indices ("ID sets"). It is especially suitable when
     33 // either (1) there are many duplicate sets, or (2) there are many singleton
     34 // or empty sets. See also sequenceLexicon.
     35 //
     36 // Each distinct ID set is mapped to a 32-bit integer. Empty and singleton
     37 // sets take up no additional space; the set itself is represented
     38 // by the unique ID assigned to the set. Duplicate sets are automatically
     39 // eliminated. Note also that ID sets are referred to using 32-bit integers
     40 // rather than pointers.
     41 type idSetLexicon struct {
     42 	idSets *sequenceLexicon
     43 }
     44 
     45 func newIDSetLexicon() *idSetLexicon {
     46 	return &idSetLexicon{
     47 		idSets: newSequenceLexicon(),
     48 	}
     49 }
     50 
     51 // add adds the given set of integers to the lexicon if it is not already
     52 // present, and return the unique ID for this set. The values are automatically
     53 // sorted and duplicates are removed.
     54 //
     55 // The primary difference between this and sequenceLexicon are:
     56 // 1. Empty and singleton sets are represented implicitly; they use no space.
     57 // 2. Sets are represented rather than sequences; the ordering of values is
     58 //    not important and duplicates are removed.
     59 // 3. The values must be 32-bit non-negative integers only.
     60 func (l *idSetLexicon) add(ids ...int32) int32 {
     61 	// Empty sets have a special ID chosen not to conflict with other IDs.
     62 	if len(ids) == 0 {
     63 		return emptySetID
     64 	}
     65 
     66 	// Singleton sets are represented by their element.
     67 	if len(ids) == 1 {
     68 		return ids[0]
     69 	}
     70 
     71 	// Canonicalize the set by sorting and removing duplicates.
     72 	//
     73 	// Creates a new slice in order to not alter the supplied values.
     74 	set := uniqueInt32s(ids)
     75 
     76 	// Non-singleton sets are represented by the bitwise complement of the ID
     77 	// returned by the sequenceLexicon
     78 	return ^l.idSets.add(set)
     79 }
     80 
     81 // idSet returns the set of integers corresponding to an ID returned by add.
     82 func (l *idSetLexicon) idSet(setID int32) []int32 {
     83 	if setID >= 0 {
     84 		return []int32{setID}
     85 	}
     86 	if setID == emptySetID {
     87 		return []int32{}
     88 	}
     89 
     90 	return l.idSets.sequence(^setID)
     91 }
     92 
     93 func (l *idSetLexicon) clear() {
     94 	l.idSets.clear()
     95 }
     96 
     97 // sequenceLexicon compactly represents a sequence of values (e.g., tuples).
     98 // It automatically eliminates duplicates slices, and maps the remaining
     99 // sequences to sequentially increasing integer IDs. See also idSetLexicon.
    100 //
    101 // Each distinct sequence is mapped to a 32-bit integer.
    102 type sequenceLexicon struct {
    103 	values []int32
    104 	begins []uint32
    105 
    106 	// idSet is a mapping of a sequence hash to sequence index in the lexicon.
    107 	idSet map[uint32]int32
    108 }
    109 
    110 func newSequenceLexicon() *sequenceLexicon {
    111 	return &sequenceLexicon{
    112 		begins: []uint32{0},
    113 		idSet:  make(map[uint32]int32),
    114 	}
    115 }
    116 
    117 // clears all data from the lexicon.
    118 func (l *sequenceLexicon) clear() {
    119 	l.values = nil
    120 	l.begins = []uint32{0}
    121 	l.idSet = make(map[uint32]int32)
    122 }
    123 
    124 // add adds the given value to the lexicon if it is not already present, and
    125 // returns its ID. IDs are assigned sequentially starting from zero.
    126 func (l *sequenceLexicon) add(ids []int32) int32 {
    127 	if id, ok := l.idSet[hashSet(ids)]; ok {
    128 		return id
    129 	}
    130 	for _, v := range ids {
    131 		l.values = append(l.values, v)
    132 	}
    133 	l.begins = append(l.begins, uint32(len(l.values)))
    134 
    135 	id := int32(len(l.begins)) - 2
    136 	l.idSet[hashSet(ids)] = id
    137 
    138 	return id
    139 }
    140 
    141 // sequence returns the original sequence of values for the given ID.
    142 func (l *sequenceLexicon) sequence(id int32) []int32 {
    143 	return l.values[l.begins[id]:l.begins[id+1]]
    144 }
    145 
    146 // size reports the number of value sequences in the lexicon.
    147 func (l *sequenceLexicon) size() int {
    148 	// Subtract one because the list of begins starts out with the first element set to 0.
    149 	return len(l.begins) - 1
    150 }
    151 
    152 // hash returns a hash of this sequence of int32s.
    153 func hashSet(s []int32) uint32 {
    154 	// TODO(roberts): We just need a way to nicely hash all the values down to
    155 	// a 32-bit value. To ensure no unnecessary dependencies we use the core
    156 	// library types available to do this. Is there a better option?
    157 	a := adler32.New()
    158 	binary.Write(a, binary.LittleEndian, s)
    159 	return a.Sum32()
    160 }
    161 
    162 // uniqueInt32s returns the sorted and uniqued set of int32s from the input.
    163 func uniqueInt32s(in []int32) []int32 {
    164 	var vals []int32
    165 	m := make(map[int32]bool)
    166 	for _, i := range in {
    167 		if m[i] {
    168 			continue
    169 		}
    170 		m[i] = true
    171 		vals = append(vals, i)
    172 	}
    173 	sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] })
    174 	return vals
    175 }