prop.go (5862B)
1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bidi 6 7 import "unicode/utf8" 8 9 // Properties provides access to BiDi properties of runes. 10 type Properties struct { 11 entry uint8 12 last uint8 13 } 14 15 var trie = newBidiTrie(0) 16 17 // TODO: using this for bidirule reduces the running time by about 5%. Consider 18 // if this is worth exposing or if we can find a way to speed up the Class 19 // method. 20 // 21 // // CompactClass is like Class, but maps all of the BiDi control classes 22 // // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control. 23 // func (p Properties) CompactClass() Class { 24 // return Class(p.entry & 0x0F) 25 // } 26 27 // Class returns the Bidi class for p. 28 func (p Properties) Class() Class { 29 c := Class(p.entry & 0x0F) 30 if c == Control { 31 c = controlByteToClass[p.last&0xF] 32 } 33 return c 34 } 35 36 // IsBracket reports whether the rune is a bracket. 37 func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 } 38 39 // IsOpeningBracket reports whether the rune is an opening bracket. 40 // IsBracket must return true. 41 func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 } 42 43 // TODO: find a better API and expose. 44 func (p Properties) reverseBracket(r rune) rune { 45 return xorMasks[p.entry>>xorMaskShift] ^ r 46 } 47 48 var controlByteToClass = [16]Class{ 49 0xD: LRO, // U+202D LeftToRightOverride, 50 0xE: RLO, // U+202E RightToLeftOverride, 51 0xA: LRE, // U+202A LeftToRightEmbedding, 52 0xB: RLE, // U+202B RightToLeftEmbedding, 53 0xC: PDF, // U+202C PopDirectionalFormat, 54 0x6: LRI, // U+2066 LeftToRightIsolate, 55 0x7: RLI, // U+2067 RightToLeftIsolate, 56 0x8: FSI, // U+2068 FirstStrongIsolate, 57 0x9: PDI, // U+2069 PopDirectionalIsolate, 58 } 59 60 // LookupRune returns properties for r. 61 func LookupRune(r rune) (p Properties, size int) { 62 var buf [4]byte 63 n := utf8.EncodeRune(buf[:], r) 64 return Lookup(buf[:n]) 65 } 66 67 // TODO: these lookup methods are based on the generated trie code. The returned 68 // sizes have slightly different semantics from the generated code, in that it 69 // always returns size==1 for an illegal UTF-8 byte (instead of the length 70 // of the maximum invalid subsequence). Most Transformers, like unicode/norm, 71 // leave invalid UTF-8 untouched, in which case it has performance benefits to 72 // do so (without changing the semantics). Bidi requires the semantics used here 73 // for the bidirule implementation to be compatible with the Go semantics. 74 // They ultimately should perhaps be adopted by all trie implementations, for 75 // convenience sake. 76 // This unrolled code also boosts performance of the secure/bidirule package by 77 // about 30%. 78 // So, to remove this code: 79 // - add option to trie generator to define return type. 80 // - always return 1 byte size for ill-formed UTF-8 runes. 81 82 // Lookup returns properties for the first rune in s and the width in bytes of 83 // its encoding. The size will be 0 if s does not hold enough bytes to complete 84 // the encoding. 85 func Lookup(s []byte) (p Properties, sz int) { 86 c0 := s[0] 87 switch { 88 case c0 < 0x80: // is ASCII 89 return Properties{entry: bidiValues[c0]}, 1 90 case c0 < 0xC2: 91 return Properties{}, 1 92 case c0 < 0xE0: // 2-byte UTF-8 93 if len(s) < 2 { 94 return Properties{}, 0 95 } 96 i := bidiIndex[c0] 97 c1 := s[1] 98 if c1 < 0x80 || 0xC0 <= c1 { 99 return Properties{}, 1 100 } 101 return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2 102 case c0 < 0xF0: // 3-byte UTF-8 103 if len(s) < 3 { 104 return Properties{}, 0 105 } 106 i := bidiIndex[c0] 107 c1 := s[1] 108 if c1 < 0x80 || 0xC0 <= c1 { 109 return Properties{}, 1 110 } 111 o := uint32(i)<<6 + uint32(c1) 112 i = bidiIndex[o] 113 c2 := s[2] 114 if c2 < 0x80 || 0xC0 <= c2 { 115 return Properties{}, 1 116 } 117 return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3 118 case c0 < 0xF8: // 4-byte UTF-8 119 if len(s) < 4 { 120 return Properties{}, 0 121 } 122 i := bidiIndex[c0] 123 c1 := s[1] 124 if c1 < 0x80 || 0xC0 <= c1 { 125 return Properties{}, 1 126 } 127 o := uint32(i)<<6 + uint32(c1) 128 i = bidiIndex[o] 129 c2 := s[2] 130 if c2 < 0x80 || 0xC0 <= c2 { 131 return Properties{}, 1 132 } 133 o = uint32(i)<<6 + uint32(c2) 134 i = bidiIndex[o] 135 c3 := s[3] 136 if c3 < 0x80 || 0xC0 <= c3 { 137 return Properties{}, 1 138 } 139 return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4 140 } 141 // Illegal rune 142 return Properties{}, 1 143 } 144 145 // LookupString returns properties for the first rune in s and the width in 146 // bytes of its encoding. The size will be 0 if s does not hold enough bytes to 147 // complete the encoding. 148 func LookupString(s string) (p Properties, sz int) { 149 c0 := s[0] 150 switch { 151 case c0 < 0x80: // is ASCII 152 return Properties{entry: bidiValues[c0]}, 1 153 case c0 < 0xC2: 154 return Properties{}, 1 155 case c0 < 0xE0: // 2-byte UTF-8 156 if len(s) < 2 { 157 return Properties{}, 0 158 } 159 i := bidiIndex[c0] 160 c1 := s[1] 161 if c1 < 0x80 || 0xC0 <= c1 { 162 return Properties{}, 1 163 } 164 return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2 165 case c0 < 0xF0: // 3-byte UTF-8 166 if len(s) < 3 { 167 return Properties{}, 0 168 } 169 i := bidiIndex[c0] 170 c1 := s[1] 171 if c1 < 0x80 || 0xC0 <= c1 { 172 return Properties{}, 1 173 } 174 o := uint32(i)<<6 + uint32(c1) 175 i = bidiIndex[o] 176 c2 := s[2] 177 if c2 < 0x80 || 0xC0 <= c2 { 178 return Properties{}, 1 179 } 180 return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3 181 case c0 < 0xF8: // 4-byte UTF-8 182 if len(s) < 4 { 183 return Properties{}, 0 184 } 185 i := bidiIndex[c0] 186 c1 := s[1] 187 if c1 < 0x80 || 0xC0 <= c1 { 188 return Properties{}, 1 189 } 190 o := uint32(i)<<6 + uint32(c1) 191 i = bidiIndex[o] 192 c2 := s[2] 193 if c2 < 0x80 || 0xC0 <= c2 { 194 return Properties{}, 1 195 } 196 o = uint32(i)<<6 + uint32(c2) 197 i = bidiIndex[o] 198 c3 := s[3] 199 if c3 < 0x80 || 0xC0 <= c3 { 200 return Properties{}, 1 201 } 202 return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4 203 } 204 // Illegal rune 205 return Properties{}, 1 206 }