bidirule.go (9558B)
1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bidirule implements the Bidi Rule defined by RFC 5893. 6 // 7 // This package is under development. The API may change without notice and 8 // without preserving backward compatibility. 9 package bidirule 10 11 import ( 12 "errors" 13 "unicode/utf8" 14 15 "golang.org/x/text/transform" 16 "golang.org/x/text/unicode/bidi" 17 ) 18 19 // This file contains an implementation of RFC 5893: Right-to-Left Scripts for 20 // Internationalized Domain Names for Applications (IDNA) 21 // 22 // A label is an individual component of a domain name. Labels are usually 23 // shown separated by dots; for example, the domain name "www.example.com" is 24 // composed of three labels: "www", "example", and "com". 25 // 26 // An RTL label is a label that contains at least one character of class R, AL, 27 // or AN. An LTR label is any label that is not an RTL label. 28 // 29 // A "Bidi domain name" is a domain name that contains at least one RTL label. 30 // 31 // The following guarantees can be made based on the above: 32 // 33 // o In a domain name consisting of only labels that satisfy the rule, 34 // the requirements of Section 3 are satisfied. Note that even LTR 35 // labels and pure ASCII labels have to be tested. 36 // 37 // o In a domain name consisting of only LDH labels (as defined in the 38 // Definitions document [RFC5890]) and labels that satisfy the rule, 39 // the requirements of Section 3 are satisfied as long as a label 40 // that starts with an ASCII digit does not come after a 41 // right-to-left label. 42 // 43 // No guarantee is given for other combinations. 44 45 // ErrInvalid indicates a label is invalid according to the Bidi Rule. 46 var ErrInvalid = errors.New("bidirule: failed Bidi Rule") 47 48 type ruleState uint8 49 50 const ( 51 ruleInitial ruleState = iota 52 ruleLTR 53 ruleLTRFinal 54 ruleRTL 55 ruleRTLFinal 56 ruleInvalid 57 ) 58 59 type ruleTransition struct { 60 next ruleState 61 mask uint16 62 } 63 64 var transitions = [...][2]ruleTransition{ 65 // [2.1] The first character must be a character with Bidi property L, R, or 66 // AL. If it has the R or AL property, it is an RTL label; if it has the L 67 // property, it is an LTR label. 68 ruleInitial: { 69 {ruleLTRFinal, 1 << bidi.L}, 70 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL}, 71 }, 72 ruleRTL: { 73 // [2.3] In an RTL label, the end of the label must be a character with 74 // Bidi property R, AL, EN, or AN, followed by zero or more characters 75 // with Bidi property NSM. 76 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN}, 77 78 // [2.2] In an RTL label, only characters with the Bidi properties R, 79 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. 80 // We exclude the entries from [2.3] 81 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, 82 }, 83 ruleRTLFinal: { 84 // [2.3] In an RTL label, the end of the label must be a character with 85 // Bidi property R, AL, EN, or AN, followed by zero or more characters 86 // with Bidi property NSM. 87 {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM}, 88 89 // [2.2] In an RTL label, only characters with the Bidi properties R, 90 // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. 91 // We exclude the entries from [2.3] and NSM. 92 {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, 93 }, 94 ruleLTR: { 95 // [2.6] In an LTR label, the end of the label must be a character with 96 // Bidi property L or EN, followed by zero or more characters with Bidi 97 // property NSM. 98 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN}, 99 100 // [2.5] In an LTR label, only characters with the Bidi properties L, 101 // EN, ES, CS, ET, ON, BN, or NSM are allowed. 102 // We exclude the entries from [2.6]. 103 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, 104 }, 105 ruleLTRFinal: { 106 // [2.6] In an LTR label, the end of the label must be a character with 107 // Bidi property L or EN, followed by zero or more characters with Bidi 108 // property NSM. 109 {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM}, 110 111 // [2.5] In an LTR label, only characters with the Bidi properties L, 112 // EN, ES, CS, ET, ON, BN, or NSM are allowed. 113 // We exclude the entries from [2.6]. 114 {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, 115 }, 116 ruleInvalid: { 117 {ruleInvalid, 0}, 118 {ruleInvalid, 0}, 119 }, 120 } 121 122 // [2.4] In an RTL label, if an EN is present, no AN may be present, and 123 // vice versa. 124 const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN) 125 126 // From RFC 5893 127 // An RTL label is a label that contains at least one character of type 128 // R, AL, or AN. 129 // 130 // An LTR label is any label that is not an RTL label. 131 132 // Direction reports the direction of the given label as defined by RFC 5893. 133 // The Bidi Rule does not have to be applied to labels of the category 134 // LeftToRight. 135 func Direction(b []byte) bidi.Direction { 136 for i := 0; i < len(b); { 137 e, sz := bidi.Lookup(b[i:]) 138 if sz == 0 { 139 i++ 140 } 141 c := e.Class() 142 if c == bidi.R || c == bidi.AL || c == bidi.AN { 143 return bidi.RightToLeft 144 } 145 i += sz 146 } 147 return bidi.LeftToRight 148 } 149 150 // DirectionString reports the direction of the given label as defined by RFC 151 // 5893. The Bidi Rule does not have to be applied to labels of the category 152 // LeftToRight. 153 func DirectionString(s string) bidi.Direction { 154 for i := 0; i < len(s); { 155 e, sz := bidi.LookupString(s[i:]) 156 if sz == 0 { 157 i++ 158 continue 159 } 160 c := e.Class() 161 if c == bidi.R || c == bidi.AL || c == bidi.AN { 162 return bidi.RightToLeft 163 } 164 i += sz 165 } 166 return bidi.LeftToRight 167 } 168 169 // Valid reports whether b conforms to the BiDi rule. 170 func Valid(b []byte) bool { 171 var t Transformer 172 if n, ok := t.advance(b); !ok || n < len(b) { 173 return false 174 } 175 return t.isFinal() 176 } 177 178 // ValidString reports whether s conforms to the BiDi rule. 179 func ValidString(s string) bool { 180 var t Transformer 181 if n, ok := t.advanceString(s); !ok || n < len(s) { 182 return false 183 } 184 return t.isFinal() 185 } 186 187 // New returns a Transformer that verifies that input adheres to the Bidi Rule. 188 func New() *Transformer { 189 return &Transformer{} 190 } 191 192 // Transformer implements transform.Transform. 193 type Transformer struct { 194 state ruleState 195 hasRTL bool 196 seen uint16 197 } 198 199 // A rule can only be violated for "Bidi Domain names", meaning if one of the 200 // following categories has been observed. 201 func (t *Transformer) isRTL() bool { 202 const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN 203 return t.seen&isRTL != 0 204 } 205 206 // Reset implements transform.Transformer. 207 func (t *Transformer) Reset() { *t = Transformer{} } 208 209 // Transform implements transform.Transformer. This Transformer has state and 210 // needs to be reset between uses. 211 func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 212 if len(dst) < len(src) { 213 src = src[:len(dst)] 214 atEOF = false 215 err = transform.ErrShortDst 216 } 217 n, err1 := t.Span(src, atEOF) 218 copy(dst, src[:n]) 219 if err == nil || err1 != nil && err1 != transform.ErrShortSrc { 220 err = err1 221 } 222 return n, n, err 223 } 224 225 // Span returns the first n bytes of src that conform to the Bidi rule. 226 func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) { 227 if t.state == ruleInvalid && t.isRTL() { 228 return 0, ErrInvalid 229 } 230 n, ok := t.advance(src) 231 switch { 232 case !ok: 233 err = ErrInvalid 234 case n < len(src): 235 if !atEOF { 236 err = transform.ErrShortSrc 237 break 238 } 239 err = ErrInvalid 240 case !t.isFinal(): 241 err = ErrInvalid 242 } 243 return n, err 244 } 245 246 // Precomputing the ASCII values decreases running time for the ASCII fast path 247 // by about 30%. 248 var asciiTable [128]bidi.Properties 249 250 func init() { 251 for i := range asciiTable { 252 p, _ := bidi.LookupRune(rune(i)) 253 asciiTable[i] = p 254 } 255 } 256 257 func (t *Transformer) advance(s []byte) (n int, ok bool) { 258 var e bidi.Properties 259 var sz int 260 for n < len(s) { 261 if s[n] < utf8.RuneSelf { 262 e, sz = asciiTable[s[n]], 1 263 } else { 264 e, sz = bidi.Lookup(s[n:]) 265 if sz <= 1 { 266 if sz == 1 { 267 // We always consider invalid UTF-8 to be invalid, even if 268 // the string has not yet been determined to be RTL. 269 // TODO: is this correct? 270 return n, false 271 } 272 return n, true // incomplete UTF-8 encoding 273 } 274 } 275 // TODO: using CompactClass would result in noticeable speedup. 276 // See unicode/bidi/prop.go:Properties.CompactClass. 277 c := uint16(1 << e.Class()) 278 t.seen |= c 279 if t.seen&exclusiveRTL == exclusiveRTL { 280 t.state = ruleInvalid 281 return n, false 282 } 283 switch tr := transitions[t.state]; { 284 case tr[0].mask&c != 0: 285 t.state = tr[0].next 286 case tr[1].mask&c != 0: 287 t.state = tr[1].next 288 default: 289 t.state = ruleInvalid 290 if t.isRTL() { 291 return n, false 292 } 293 } 294 n += sz 295 } 296 return n, true 297 } 298 299 func (t *Transformer) advanceString(s string) (n int, ok bool) { 300 var e bidi.Properties 301 var sz int 302 for n < len(s) { 303 if s[n] < utf8.RuneSelf { 304 e, sz = asciiTable[s[n]], 1 305 } else { 306 e, sz = bidi.LookupString(s[n:]) 307 if sz <= 1 { 308 if sz == 1 { 309 return n, false // invalid UTF-8 310 } 311 return n, true // incomplete UTF-8 encoding 312 } 313 } 314 // TODO: using CompactClass results in noticeable speedup. 315 // See unicode/bidi/prop.go:Properties.CompactClass. 316 c := uint16(1 << e.Class()) 317 t.seen |= c 318 if t.seen&exclusiveRTL == exclusiveRTL { 319 t.state = ruleInvalid 320 return n, false 321 } 322 switch tr := transitions[t.state]; { 323 case tr[0].mask&c != 0: 324 t.state = tr[0].next 325 case tr[1].mask&c != 0: 326 t.state = tr[1].next 327 default: 328 t.state = ruleInvalid 329 if t.isRTL() { 330 return n, false 331 } 332 } 333 n += sz 334 } 335 return n, true 336 }