lexicon.go (5371B)
1 // Copyright 2020 Google Inc. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package s2 16 17 import ( 18 "encoding/binary" 19 "hash/adler32" 20 "math" 21 "sort" 22 ) 23 24 // TODO(roberts): If any of these are worth making public, change the 25 // method signatures and type names. 26 27 // emptySetID represents the last ID that will ever be generated. 28 // (Non-negative IDs are reserved for singleton sets.) 29 var emptySetID = int32(math.MinInt32) 30 31 // idSetLexicon compactly represents a set of non-negative 32 // integers such as array indices ("ID sets"). It is especially suitable when 33 // either (1) there are many duplicate sets, or (2) there are many singleton 34 // or empty sets. See also sequenceLexicon. 35 // 36 // Each distinct ID set is mapped to a 32-bit integer. Empty and singleton 37 // sets take up no additional space; the set itself is represented 38 // by the unique ID assigned to the set. Duplicate sets are automatically 39 // eliminated. Note also that ID sets are referred to using 32-bit integers 40 // rather than pointers. 41 type idSetLexicon struct { 42 idSets *sequenceLexicon 43 } 44 45 func newIDSetLexicon() *idSetLexicon { 46 return &idSetLexicon{ 47 idSets: newSequenceLexicon(), 48 } 49 } 50 51 // add adds the given set of integers to the lexicon if it is not already 52 // present, and return the unique ID for this set. The values are automatically 53 // sorted and duplicates are removed. 54 // 55 // The primary difference between this and sequenceLexicon are: 56 // 1. Empty and singleton sets are represented implicitly; they use no space. 57 // 2. Sets are represented rather than sequences; the ordering of values is 58 // not important and duplicates are removed. 59 // 3. The values must be 32-bit non-negative integers only. 60 func (l *idSetLexicon) add(ids ...int32) int32 { 61 // Empty sets have a special ID chosen not to conflict with other IDs. 62 if len(ids) == 0 { 63 return emptySetID 64 } 65 66 // Singleton sets are represented by their element. 67 if len(ids) == 1 { 68 return ids[0] 69 } 70 71 // Canonicalize the set by sorting and removing duplicates. 72 // 73 // Creates a new slice in order to not alter the supplied values. 74 set := uniqueInt32s(ids) 75 76 // Non-singleton sets are represented by the bitwise complement of the ID 77 // returned by the sequenceLexicon 78 return ^l.idSets.add(set) 79 } 80 81 // idSet returns the set of integers corresponding to an ID returned by add. 82 func (l *idSetLexicon) idSet(setID int32) []int32 { 83 if setID >= 0 { 84 return []int32{setID} 85 } 86 if setID == emptySetID { 87 return []int32{} 88 } 89 90 return l.idSets.sequence(^setID) 91 } 92 93 func (l *idSetLexicon) clear() { 94 l.idSets.clear() 95 } 96 97 // sequenceLexicon compactly represents a sequence of values (e.g., tuples). 98 // It automatically eliminates duplicates slices, and maps the remaining 99 // sequences to sequentially increasing integer IDs. See also idSetLexicon. 100 // 101 // Each distinct sequence is mapped to a 32-bit integer. 102 type sequenceLexicon struct { 103 values []int32 104 begins []uint32 105 106 // idSet is a mapping of a sequence hash to sequence index in the lexicon. 107 idSet map[uint32]int32 108 } 109 110 func newSequenceLexicon() *sequenceLexicon { 111 return &sequenceLexicon{ 112 begins: []uint32{0}, 113 idSet: make(map[uint32]int32), 114 } 115 } 116 117 // clears all data from the lexicon. 118 func (l *sequenceLexicon) clear() { 119 l.values = nil 120 l.begins = []uint32{0} 121 l.idSet = make(map[uint32]int32) 122 } 123 124 // add adds the given value to the lexicon if it is not already present, and 125 // returns its ID. IDs are assigned sequentially starting from zero. 126 func (l *sequenceLexicon) add(ids []int32) int32 { 127 if id, ok := l.idSet[hashSet(ids)]; ok { 128 return id 129 } 130 for _, v := range ids { 131 l.values = append(l.values, v) 132 } 133 l.begins = append(l.begins, uint32(len(l.values))) 134 135 id := int32(len(l.begins)) - 2 136 l.idSet[hashSet(ids)] = id 137 138 return id 139 } 140 141 // sequence returns the original sequence of values for the given ID. 142 func (l *sequenceLexicon) sequence(id int32) []int32 { 143 return l.values[l.begins[id]:l.begins[id+1]] 144 } 145 146 // size reports the number of value sequences in the lexicon. 147 func (l *sequenceLexicon) size() int { 148 // Subtract one because the list of begins starts out with the first element set to 0. 149 return len(l.begins) - 1 150 } 151 152 // hash returns a hash of this sequence of int32s. 153 func hashSet(s []int32) uint32 { 154 // TODO(roberts): We just need a way to nicely hash all the values down to 155 // a 32-bit value. To ensure no unnecessary dependencies we use the core 156 // library types available to do this. Is there a better option? 157 a := adler32.New() 158 binary.Write(a, binary.LittleEndian, s) 159 return a.Sum32() 160 } 161 162 // uniqueInt32s returns the sorted and uniqued set of int32s from the input. 163 func uniqueInt32s(in []int32) []int32 { 164 var vals []int32 165 m := make(map[int32]bool) 166 for _, i := range in { 167 if m[i] { 168 continue 169 } 170 m[i] = true 171 vals = append(vals, i) 172 } 173 sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] }) 174 return vals 175 }