parse.go (15241B)
1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package language 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "sort" 12 13 "golang.org/x/text/internal/tag" 14 ) 15 16 // isAlpha returns true if the byte is not a digit. 17 // b must be an ASCII letter or digit. 18 func isAlpha(b byte) bool { 19 return b > '9' 20 } 21 22 // isAlphaNum returns true if the string contains only ASCII letters or digits. 23 func isAlphaNum(s []byte) bool { 24 for _, c := range s { 25 if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { 26 return false 27 } 28 } 29 return true 30 } 31 32 // ErrSyntax is returned by any of the parsing functions when the 33 // input is not well-formed, according to BCP 47. 34 // TODO: return the position at which the syntax error occurred? 35 var ErrSyntax = errors.New("language: tag is not well-formed") 36 37 // ErrDuplicateKey is returned when a tag contains the same key twice with 38 // different values in the -u section. 39 var ErrDuplicateKey = errors.New("language: different values for same key in -u extension") 40 41 // ValueError is returned by any of the parsing functions when the 42 // input is well-formed but the respective subtag is not recognized 43 // as a valid value. 44 type ValueError struct { 45 v [8]byte 46 } 47 48 // NewValueError creates a new ValueError. 49 func NewValueError(tag []byte) ValueError { 50 var e ValueError 51 copy(e.v[:], tag) 52 return e 53 } 54 55 func (e ValueError) tag() []byte { 56 n := bytes.IndexByte(e.v[:], 0) 57 if n == -1 { 58 n = 8 59 } 60 return e.v[:n] 61 } 62 63 // Error implements the error interface. 64 func (e ValueError) Error() string { 65 return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag()) 66 } 67 68 // Subtag returns the subtag for which the error occurred. 69 func (e ValueError) Subtag() string { 70 return string(e.tag()) 71 } 72 73 // scanner is used to scan BCP 47 tokens, which are separated by _ or -. 74 type scanner struct { 75 b []byte 76 bytes [max99thPercentileSize]byte 77 token []byte 78 start int // start position of the current token 79 end int // end position of the current token 80 next int // next point for scan 81 err error 82 done bool 83 } 84 85 func makeScannerString(s string) scanner { 86 scan := scanner{} 87 if len(s) <= len(scan.bytes) { 88 scan.b = scan.bytes[:copy(scan.bytes[:], s)] 89 } else { 90 scan.b = []byte(s) 91 } 92 scan.init() 93 return scan 94 } 95 96 // makeScanner returns a scanner using b as the input buffer. 97 // b is not copied and may be modified by the scanner routines. 98 func makeScanner(b []byte) scanner { 99 scan := scanner{b: b} 100 scan.init() 101 return scan 102 } 103 104 func (s *scanner) init() { 105 for i, c := range s.b { 106 if c == '_' { 107 s.b[i] = '-' 108 } 109 } 110 s.scan() 111 } 112 113 // restToLower converts the string between start and end to lower case. 114 func (s *scanner) toLower(start, end int) { 115 for i := start; i < end; i++ { 116 c := s.b[i] 117 if 'A' <= c && c <= 'Z' { 118 s.b[i] += 'a' - 'A' 119 } 120 } 121 } 122 123 func (s *scanner) setError(e error) { 124 if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) { 125 s.err = e 126 } 127 } 128 129 // resizeRange shrinks or grows the array at position oldStart such that 130 // a new string of size newSize can fit between oldStart and oldEnd. 131 // Sets the scan point to after the resized range. 132 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { 133 s.start = oldStart 134 if end := oldStart + newSize; end != oldEnd { 135 diff := end - oldEnd 136 var b []byte 137 if n := len(s.b) + diff; n > cap(s.b) { 138 b = make([]byte, n) 139 copy(b, s.b[:oldStart]) 140 } else { 141 b = s.b[:n] 142 } 143 copy(b[end:], s.b[oldEnd:]) 144 s.b = b 145 s.next = end + (s.next - s.end) 146 s.end = end 147 } 148 } 149 150 // replace replaces the current token with repl. 151 func (s *scanner) replace(repl string) { 152 s.resizeRange(s.start, s.end, len(repl)) 153 copy(s.b[s.start:], repl) 154 } 155 156 // gobble removes the current token from the input. 157 // Caller must call scan after calling gobble. 158 func (s *scanner) gobble(e error) { 159 s.setError(e) 160 if s.start == 0 { 161 s.b = s.b[:+copy(s.b, s.b[s.next:])] 162 s.end = 0 163 } else { 164 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] 165 s.end = s.start - 1 166 } 167 s.next = s.start 168 } 169 170 // deleteRange removes the given range from s.b before the current token. 171 func (s *scanner) deleteRange(start, end int) { 172 s.b = s.b[:start+copy(s.b[start:], s.b[end:])] 173 diff := end - start 174 s.next -= diff 175 s.start -= diff 176 s.end -= diff 177 } 178 179 // scan parses the next token of a BCP 47 string. Tokens that are larger 180 // than 8 characters or include non-alphanumeric characters result in an error 181 // and are gobbled and removed from the output. 182 // It returns the end position of the last token consumed. 183 func (s *scanner) scan() (end int) { 184 end = s.end 185 s.token = nil 186 for s.start = s.next; s.next < len(s.b); { 187 i := bytes.IndexByte(s.b[s.next:], '-') 188 if i == -1 { 189 s.end = len(s.b) 190 s.next = len(s.b) 191 i = s.end - s.start 192 } else { 193 s.end = s.next + i 194 s.next = s.end + 1 195 } 196 token := s.b[s.start:s.end] 197 if i < 1 || i > 8 || !isAlphaNum(token) { 198 s.gobble(ErrSyntax) 199 continue 200 } 201 s.token = token 202 return end 203 } 204 if n := len(s.b); n > 0 && s.b[n-1] == '-' { 205 s.setError(ErrSyntax) 206 s.b = s.b[:len(s.b)-1] 207 } 208 s.done = true 209 return end 210 } 211 212 // acceptMinSize parses multiple tokens of the given size or greater. 213 // It returns the end position of the last token consumed. 214 func (s *scanner) acceptMinSize(min int) (end int) { 215 end = s.end 216 s.scan() 217 for ; len(s.token) >= min; s.scan() { 218 end = s.end 219 } 220 return end 221 } 222 223 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing 224 // failed it returns an error and any part of the tag that could be parsed. 225 // If parsing succeeded but an unknown value was found, it returns 226 // ValueError. The Tag returned in this case is just stripped of the unknown 227 // value. All other values are preserved. It accepts tags in the BCP 47 format 228 // and extensions to this standard defined in 229 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. 230 func Parse(s string) (t Tag, err error) { 231 // TODO: consider supporting old-style locale key-value pairs. 232 if s == "" { 233 return Und, ErrSyntax 234 } 235 defer func() { 236 if recover() != nil { 237 t = Und 238 err = ErrSyntax 239 return 240 } 241 }() 242 if len(s) <= maxAltTaglen { 243 b := [maxAltTaglen]byte{} 244 for i, c := range s { 245 // Generating invalid UTF-8 is okay as it won't match. 246 if 'A' <= c && c <= 'Z' { 247 c += 'a' - 'A' 248 } else if c == '_' { 249 c = '-' 250 } 251 b[i] = byte(c) 252 } 253 if t, ok := grandfathered(b); ok { 254 return t, nil 255 } 256 } 257 scan := makeScannerString(s) 258 return parse(&scan, s) 259 } 260 261 func parse(scan *scanner, s string) (t Tag, err error) { 262 t = Und 263 var end int 264 if n := len(scan.token); n <= 1 { 265 scan.toLower(0, len(scan.b)) 266 if n == 0 || scan.token[0] != 'x' { 267 return t, ErrSyntax 268 } 269 end = parseExtensions(scan) 270 } else if n >= 4 { 271 return Und, ErrSyntax 272 } else { // the usual case 273 t, end = parseTag(scan, true) 274 if n := len(scan.token); n == 1 { 275 t.pExt = uint16(end) 276 end = parseExtensions(scan) 277 } else if end < len(scan.b) { 278 scan.setError(ErrSyntax) 279 scan.b = scan.b[:end] 280 } 281 } 282 if int(t.pVariant) < len(scan.b) { 283 if end < len(s) { 284 s = s[:end] 285 } 286 if len(s) > 0 && tag.Compare(s, scan.b) == 0 { 287 t.str = s 288 } else { 289 t.str = string(scan.b) 290 } 291 } else { 292 t.pVariant, t.pExt = 0, 0 293 } 294 return t, scan.err 295 } 296 297 // parseTag parses language, script, region and variants. 298 // It returns a Tag and the end position in the input that was parsed. 299 // If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>. 300 func parseTag(scan *scanner, doNorm bool) (t Tag, end int) { 301 var e error 302 // TODO: set an error if an unknown lang, script or region is encountered. 303 t.LangID, e = getLangID(scan.token) 304 scan.setError(e) 305 scan.replace(t.LangID.String()) 306 langStart := scan.start 307 end = scan.scan() 308 for len(scan.token) == 3 && isAlpha(scan.token[0]) { 309 // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent 310 // to a tag of the form <extlang>. 311 if doNorm { 312 lang, e := getLangID(scan.token) 313 if lang != 0 { 314 t.LangID = lang 315 langStr := lang.String() 316 copy(scan.b[langStart:], langStr) 317 scan.b[langStart+len(langStr)] = '-' 318 scan.start = langStart + len(langStr) + 1 319 } 320 scan.gobble(e) 321 } 322 end = scan.scan() 323 } 324 if len(scan.token) == 4 && isAlpha(scan.token[0]) { 325 t.ScriptID, e = getScriptID(script, scan.token) 326 if t.ScriptID == 0 { 327 scan.gobble(e) 328 } 329 end = scan.scan() 330 } 331 if n := len(scan.token); n >= 2 && n <= 3 { 332 t.RegionID, e = getRegionID(scan.token) 333 if t.RegionID == 0 { 334 scan.gobble(e) 335 } else { 336 scan.replace(t.RegionID.String()) 337 } 338 end = scan.scan() 339 } 340 scan.toLower(scan.start, len(scan.b)) 341 t.pVariant = byte(end) 342 end = parseVariants(scan, end, t) 343 t.pExt = uint16(end) 344 return t, end 345 } 346 347 var separator = []byte{'-'} 348 349 // parseVariants scans tokens as long as each token is a valid variant string. 350 // Duplicate variants are removed. 351 func parseVariants(scan *scanner, end int, t Tag) int { 352 start := scan.start 353 varIDBuf := [4]uint8{} 354 variantBuf := [4][]byte{} 355 varID := varIDBuf[:0] 356 variant := variantBuf[:0] 357 last := -1 358 needSort := false 359 for ; len(scan.token) >= 4; scan.scan() { 360 // TODO: measure the impact of needing this conversion and redesign 361 // the data structure if there is an issue. 362 v, ok := variantIndex[string(scan.token)] 363 if !ok { 364 // unknown variant 365 // TODO: allow user-defined variants? 366 scan.gobble(NewValueError(scan.token)) 367 continue 368 } 369 varID = append(varID, v) 370 variant = append(variant, scan.token) 371 if !needSort { 372 if last < int(v) { 373 last = int(v) 374 } else { 375 needSort = true 376 // There is no legal combinations of more than 7 variants 377 // (and this is by no means a useful sequence). 378 const maxVariants = 8 379 if len(varID) > maxVariants { 380 break 381 } 382 } 383 } 384 end = scan.end 385 } 386 if needSort { 387 sort.Sort(variantsSort{varID, variant}) 388 k, l := 0, -1 389 for i, v := range varID { 390 w := int(v) 391 if l == w { 392 // Remove duplicates. 393 continue 394 } 395 varID[k] = varID[i] 396 variant[k] = variant[i] 397 k++ 398 l = w 399 } 400 if str := bytes.Join(variant[:k], separator); len(str) == 0 { 401 end = start - 1 402 } else { 403 scan.resizeRange(start, end, len(str)) 404 copy(scan.b[scan.start:], str) 405 end = scan.end 406 } 407 } 408 return end 409 } 410 411 type variantsSort struct { 412 i []uint8 413 v [][]byte 414 } 415 416 func (s variantsSort) Len() int { 417 return len(s.i) 418 } 419 420 func (s variantsSort) Swap(i, j int) { 421 s.i[i], s.i[j] = s.i[j], s.i[i] 422 s.v[i], s.v[j] = s.v[j], s.v[i] 423 } 424 425 func (s variantsSort) Less(i, j int) bool { 426 return s.i[i] < s.i[j] 427 } 428 429 type bytesSort struct { 430 b [][]byte 431 n int // first n bytes to compare 432 } 433 434 func (b bytesSort) Len() int { 435 return len(b.b) 436 } 437 438 func (b bytesSort) Swap(i, j int) { 439 b.b[i], b.b[j] = b.b[j], b.b[i] 440 } 441 442 func (b bytesSort) Less(i, j int) bool { 443 for k := 0; k < b.n; k++ { 444 if b.b[i][k] == b.b[j][k] { 445 continue 446 } 447 return b.b[i][k] < b.b[j][k] 448 } 449 return false 450 } 451 452 // parseExtensions parses and normalizes the extensions in the buffer. 453 // It returns the last position of scan.b that is part of any extension. 454 // It also trims scan.b to remove excess parts accordingly. 455 func parseExtensions(scan *scanner) int { 456 start := scan.start 457 exts := [][]byte{} 458 private := []byte{} 459 end := scan.end 460 for len(scan.token) == 1 { 461 extStart := scan.start 462 ext := scan.token[0] 463 end = parseExtension(scan) 464 extension := scan.b[extStart:end] 465 if len(extension) < 3 || (ext != 'x' && len(extension) < 4) { 466 scan.setError(ErrSyntax) 467 end = extStart 468 continue 469 } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) { 470 scan.b = scan.b[:end] 471 return end 472 } else if ext == 'x' { 473 private = extension 474 break 475 } 476 exts = append(exts, extension) 477 } 478 sort.Sort(bytesSort{exts, 1}) 479 if len(private) > 0 { 480 exts = append(exts, private) 481 } 482 scan.b = scan.b[:start] 483 if len(exts) > 0 { 484 scan.b = append(scan.b, bytes.Join(exts, separator)...) 485 } else if start > 0 { 486 // Strip trailing '-'. 487 scan.b = scan.b[:start-1] 488 } 489 return end 490 } 491 492 // parseExtension parses a single extension and returns the position of 493 // the extension end. 494 func parseExtension(scan *scanner) int { 495 start, end := scan.start, scan.end 496 switch scan.token[0] { 497 case 'u': // https://www.ietf.org/rfc/rfc6067.txt 498 attrStart := end 499 scan.scan() 500 for last := []byte{}; len(scan.token) > 2; scan.scan() { 501 if bytes.Compare(scan.token, last) != -1 { 502 // Attributes are unsorted. Start over from scratch. 503 p := attrStart + 1 504 scan.next = p 505 attrs := [][]byte{} 506 for scan.scan(); len(scan.token) > 2; scan.scan() { 507 attrs = append(attrs, scan.token) 508 end = scan.end 509 } 510 sort.Sort(bytesSort{attrs, 3}) 511 copy(scan.b[p:], bytes.Join(attrs, separator)) 512 break 513 } 514 last = scan.token 515 end = scan.end 516 } 517 // Scan key-type sequences. A key is of length 2 and may be followed 518 // by 0 or more "type" subtags from 3 to the maximum of 8 letters. 519 var last, key []byte 520 for attrEnd := end; len(scan.token) == 2; last = key { 521 key = scan.token 522 end = scan.end 523 for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() { 524 end = scan.end 525 } 526 // TODO: check key value validity 527 if bytes.Compare(key, last) != 1 || scan.err != nil { 528 // We have an invalid key or the keys are not sorted. 529 // Start scanning keys from scratch and reorder. 530 p := attrEnd + 1 531 scan.next = p 532 keys := [][]byte{} 533 for scan.scan(); len(scan.token) == 2; { 534 keyStart := scan.start 535 end = scan.end 536 for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() { 537 end = scan.end 538 } 539 keys = append(keys, scan.b[keyStart:end]) 540 } 541 sort.Stable(bytesSort{keys, 2}) 542 if n := len(keys); n > 0 { 543 k := 0 544 for i := 1; i < n; i++ { 545 if !bytes.Equal(keys[k][:2], keys[i][:2]) { 546 k++ 547 keys[k] = keys[i] 548 } else if !bytes.Equal(keys[k], keys[i]) { 549 scan.setError(ErrDuplicateKey) 550 } 551 } 552 keys = keys[:k+1] 553 } 554 reordered := bytes.Join(keys, separator) 555 if e := p + len(reordered); e < end { 556 scan.deleteRange(e, end) 557 end = e 558 } 559 copy(scan.b[p:], reordered) 560 break 561 } 562 } 563 case 't': // https://www.ietf.org/rfc/rfc6497.txt 564 scan.scan() 565 if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) { 566 _, end = parseTag(scan, false) 567 scan.toLower(start, end) 568 } 569 for len(scan.token) == 2 && !isAlpha(scan.token[1]) { 570 end = scan.acceptMinSize(3) 571 } 572 case 'x': 573 end = scan.acceptMinSize(1) 574 default: 575 end = scan.acceptMinSize(2) 576 } 577 return end 578 } 579 580 // getExtension returns the name, body and end position of the extension. 581 func getExtension(s string, p int) (end int, ext string) { 582 if s[p] == '-' { 583 p++ 584 } 585 if s[p] == 'x' { 586 return len(s), s[p:] 587 } 588 end = nextExtension(s, p) 589 return end, s[p:end] 590 } 591 592 // nextExtension finds the next extension within the string, searching 593 // for the -<char>- pattern from position p. 594 // In the fast majority of cases, language tags will have at most 595 // one extension and extensions tend to be small. 596 func nextExtension(s string, p int) int { 597 for n := len(s) - 3; p < n; { 598 if s[p] == '-' { 599 if s[p+2] == '-' { 600 return p 601 } 602 p += 3 603 } else { 604 p++ 605 } 606 } 607 return len(s) 608 }