gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

parse.go (15241B)


      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package language
      6 
      7 import (
      8 	"bytes"
      9 	"errors"
     10 	"fmt"
     11 	"sort"
     12 
     13 	"golang.org/x/text/internal/tag"
     14 )
     15 
     16 // isAlpha returns true if the byte is not a digit.
     17 // b must be an ASCII letter or digit.
     18 func isAlpha(b byte) bool {
     19 	return b > '9'
     20 }
     21 
     22 // isAlphaNum returns true if the string contains only ASCII letters or digits.
     23 func isAlphaNum(s []byte) bool {
     24 	for _, c := range s {
     25 		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
     26 			return false
     27 		}
     28 	}
     29 	return true
     30 }
     31 
     32 // ErrSyntax is returned by any of the parsing functions when the
     33 // input is not well-formed, according to BCP 47.
     34 // TODO: return the position at which the syntax error occurred?
     35 var ErrSyntax = errors.New("language: tag is not well-formed")
     36 
     37 // ErrDuplicateKey is returned when a tag contains the same key twice with
     38 // different values in the -u section.
     39 var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
     40 
     41 // ValueError is returned by any of the parsing functions when the
     42 // input is well-formed but the respective subtag is not recognized
     43 // as a valid value.
     44 type ValueError struct {
     45 	v [8]byte
     46 }
     47 
     48 // NewValueError creates a new ValueError.
     49 func NewValueError(tag []byte) ValueError {
     50 	var e ValueError
     51 	copy(e.v[:], tag)
     52 	return e
     53 }
     54 
     55 func (e ValueError) tag() []byte {
     56 	n := bytes.IndexByte(e.v[:], 0)
     57 	if n == -1 {
     58 		n = 8
     59 	}
     60 	return e.v[:n]
     61 }
     62 
     63 // Error implements the error interface.
     64 func (e ValueError) Error() string {
     65 	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
     66 }
     67 
     68 // Subtag returns the subtag for which the error occurred.
     69 func (e ValueError) Subtag() string {
     70 	return string(e.tag())
     71 }
     72 
     73 // scanner is used to scan BCP 47 tokens, which are separated by _ or -.
     74 type scanner struct {
     75 	b     []byte
     76 	bytes [max99thPercentileSize]byte
     77 	token []byte
     78 	start int // start position of the current token
     79 	end   int // end position of the current token
     80 	next  int // next point for scan
     81 	err   error
     82 	done  bool
     83 }
     84 
     85 func makeScannerString(s string) scanner {
     86 	scan := scanner{}
     87 	if len(s) <= len(scan.bytes) {
     88 		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
     89 	} else {
     90 		scan.b = []byte(s)
     91 	}
     92 	scan.init()
     93 	return scan
     94 }
     95 
     96 // makeScanner returns a scanner using b as the input buffer.
     97 // b is not copied and may be modified by the scanner routines.
     98 func makeScanner(b []byte) scanner {
     99 	scan := scanner{b: b}
    100 	scan.init()
    101 	return scan
    102 }
    103 
    104 func (s *scanner) init() {
    105 	for i, c := range s.b {
    106 		if c == '_' {
    107 			s.b[i] = '-'
    108 		}
    109 	}
    110 	s.scan()
    111 }
    112 
    113 // restToLower converts the string between start and end to lower case.
    114 func (s *scanner) toLower(start, end int) {
    115 	for i := start; i < end; i++ {
    116 		c := s.b[i]
    117 		if 'A' <= c && c <= 'Z' {
    118 			s.b[i] += 'a' - 'A'
    119 		}
    120 	}
    121 }
    122 
    123 func (s *scanner) setError(e error) {
    124 	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
    125 		s.err = e
    126 	}
    127 }
    128 
    129 // resizeRange shrinks or grows the array at position oldStart such that
    130 // a new string of size newSize can fit between oldStart and oldEnd.
    131 // Sets the scan point to after the resized range.
    132 func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
    133 	s.start = oldStart
    134 	if end := oldStart + newSize; end != oldEnd {
    135 		diff := end - oldEnd
    136 		var b []byte
    137 		if n := len(s.b) + diff; n > cap(s.b) {
    138 			b = make([]byte, n)
    139 			copy(b, s.b[:oldStart])
    140 		} else {
    141 			b = s.b[:n]
    142 		}
    143 		copy(b[end:], s.b[oldEnd:])
    144 		s.b = b
    145 		s.next = end + (s.next - s.end)
    146 		s.end = end
    147 	}
    148 }
    149 
    150 // replace replaces the current token with repl.
    151 func (s *scanner) replace(repl string) {
    152 	s.resizeRange(s.start, s.end, len(repl))
    153 	copy(s.b[s.start:], repl)
    154 }
    155 
    156 // gobble removes the current token from the input.
    157 // Caller must call scan after calling gobble.
    158 func (s *scanner) gobble(e error) {
    159 	s.setError(e)
    160 	if s.start == 0 {
    161 		s.b = s.b[:+copy(s.b, s.b[s.next:])]
    162 		s.end = 0
    163 	} else {
    164 		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
    165 		s.end = s.start - 1
    166 	}
    167 	s.next = s.start
    168 }
    169 
    170 // deleteRange removes the given range from s.b before the current token.
    171 func (s *scanner) deleteRange(start, end int) {
    172 	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
    173 	diff := end - start
    174 	s.next -= diff
    175 	s.start -= diff
    176 	s.end -= diff
    177 }
    178 
    179 // scan parses the next token of a BCP 47 string.  Tokens that are larger
    180 // than 8 characters or include non-alphanumeric characters result in an error
    181 // and are gobbled and removed from the output.
    182 // It returns the end position of the last token consumed.
    183 func (s *scanner) scan() (end int) {
    184 	end = s.end
    185 	s.token = nil
    186 	for s.start = s.next; s.next < len(s.b); {
    187 		i := bytes.IndexByte(s.b[s.next:], '-')
    188 		if i == -1 {
    189 			s.end = len(s.b)
    190 			s.next = len(s.b)
    191 			i = s.end - s.start
    192 		} else {
    193 			s.end = s.next + i
    194 			s.next = s.end + 1
    195 		}
    196 		token := s.b[s.start:s.end]
    197 		if i < 1 || i > 8 || !isAlphaNum(token) {
    198 			s.gobble(ErrSyntax)
    199 			continue
    200 		}
    201 		s.token = token
    202 		return end
    203 	}
    204 	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
    205 		s.setError(ErrSyntax)
    206 		s.b = s.b[:len(s.b)-1]
    207 	}
    208 	s.done = true
    209 	return end
    210 }
    211 
    212 // acceptMinSize parses multiple tokens of the given size or greater.
    213 // It returns the end position of the last token consumed.
    214 func (s *scanner) acceptMinSize(min int) (end int) {
    215 	end = s.end
    216 	s.scan()
    217 	for ; len(s.token) >= min; s.scan() {
    218 		end = s.end
    219 	}
    220 	return end
    221 }
    222 
    223 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
    224 // failed it returns an error and any part of the tag that could be parsed.
    225 // If parsing succeeded but an unknown value was found, it returns
    226 // ValueError. The Tag returned in this case is just stripped of the unknown
    227 // value. All other values are preserved. It accepts tags in the BCP 47 format
    228 // and extensions to this standard defined in
    229 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
    230 func Parse(s string) (t Tag, err error) {
    231 	// TODO: consider supporting old-style locale key-value pairs.
    232 	if s == "" {
    233 		return Und, ErrSyntax
    234 	}
    235 	defer func() {
    236 		if recover() != nil {
    237 			t = Und
    238 			err = ErrSyntax
    239 			return
    240 		}
    241 	}()
    242 	if len(s) <= maxAltTaglen {
    243 		b := [maxAltTaglen]byte{}
    244 		for i, c := range s {
    245 			// Generating invalid UTF-8 is okay as it won't match.
    246 			if 'A' <= c && c <= 'Z' {
    247 				c += 'a' - 'A'
    248 			} else if c == '_' {
    249 				c = '-'
    250 			}
    251 			b[i] = byte(c)
    252 		}
    253 		if t, ok := grandfathered(b); ok {
    254 			return t, nil
    255 		}
    256 	}
    257 	scan := makeScannerString(s)
    258 	return parse(&scan, s)
    259 }
    260 
    261 func parse(scan *scanner, s string) (t Tag, err error) {
    262 	t = Und
    263 	var end int
    264 	if n := len(scan.token); n <= 1 {
    265 		scan.toLower(0, len(scan.b))
    266 		if n == 0 || scan.token[0] != 'x' {
    267 			return t, ErrSyntax
    268 		}
    269 		end = parseExtensions(scan)
    270 	} else if n >= 4 {
    271 		return Und, ErrSyntax
    272 	} else { // the usual case
    273 		t, end = parseTag(scan, true)
    274 		if n := len(scan.token); n == 1 {
    275 			t.pExt = uint16(end)
    276 			end = parseExtensions(scan)
    277 		} else if end < len(scan.b) {
    278 			scan.setError(ErrSyntax)
    279 			scan.b = scan.b[:end]
    280 		}
    281 	}
    282 	if int(t.pVariant) < len(scan.b) {
    283 		if end < len(s) {
    284 			s = s[:end]
    285 		}
    286 		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
    287 			t.str = s
    288 		} else {
    289 			t.str = string(scan.b)
    290 		}
    291 	} else {
    292 		t.pVariant, t.pExt = 0, 0
    293 	}
    294 	return t, scan.err
    295 }
    296 
    297 // parseTag parses language, script, region and variants.
    298 // It returns a Tag and the end position in the input that was parsed.
    299 // If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
    300 func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
    301 	var e error
    302 	// TODO: set an error if an unknown lang, script or region is encountered.
    303 	t.LangID, e = getLangID(scan.token)
    304 	scan.setError(e)
    305 	scan.replace(t.LangID.String())
    306 	langStart := scan.start
    307 	end = scan.scan()
    308 	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
    309 		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
    310 		// to a tag of the form <extlang>.
    311 		if doNorm {
    312 			lang, e := getLangID(scan.token)
    313 			if lang != 0 {
    314 				t.LangID = lang
    315 				langStr := lang.String()
    316 				copy(scan.b[langStart:], langStr)
    317 				scan.b[langStart+len(langStr)] = '-'
    318 				scan.start = langStart + len(langStr) + 1
    319 			}
    320 			scan.gobble(e)
    321 		}
    322 		end = scan.scan()
    323 	}
    324 	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
    325 		t.ScriptID, e = getScriptID(script, scan.token)
    326 		if t.ScriptID == 0 {
    327 			scan.gobble(e)
    328 		}
    329 		end = scan.scan()
    330 	}
    331 	if n := len(scan.token); n >= 2 && n <= 3 {
    332 		t.RegionID, e = getRegionID(scan.token)
    333 		if t.RegionID == 0 {
    334 			scan.gobble(e)
    335 		} else {
    336 			scan.replace(t.RegionID.String())
    337 		}
    338 		end = scan.scan()
    339 	}
    340 	scan.toLower(scan.start, len(scan.b))
    341 	t.pVariant = byte(end)
    342 	end = parseVariants(scan, end, t)
    343 	t.pExt = uint16(end)
    344 	return t, end
    345 }
    346 
    347 var separator = []byte{'-'}
    348 
    349 // parseVariants scans tokens as long as each token is a valid variant string.
    350 // Duplicate variants are removed.
    351 func parseVariants(scan *scanner, end int, t Tag) int {
    352 	start := scan.start
    353 	varIDBuf := [4]uint8{}
    354 	variantBuf := [4][]byte{}
    355 	varID := varIDBuf[:0]
    356 	variant := variantBuf[:0]
    357 	last := -1
    358 	needSort := false
    359 	for ; len(scan.token) >= 4; scan.scan() {
    360 		// TODO: measure the impact of needing this conversion and redesign
    361 		// the data structure if there is an issue.
    362 		v, ok := variantIndex[string(scan.token)]
    363 		if !ok {
    364 			// unknown variant
    365 			// TODO: allow user-defined variants?
    366 			scan.gobble(NewValueError(scan.token))
    367 			continue
    368 		}
    369 		varID = append(varID, v)
    370 		variant = append(variant, scan.token)
    371 		if !needSort {
    372 			if last < int(v) {
    373 				last = int(v)
    374 			} else {
    375 				needSort = true
    376 				// There is no legal combinations of more than 7 variants
    377 				// (and this is by no means a useful sequence).
    378 				const maxVariants = 8
    379 				if len(varID) > maxVariants {
    380 					break
    381 				}
    382 			}
    383 		}
    384 		end = scan.end
    385 	}
    386 	if needSort {
    387 		sort.Sort(variantsSort{varID, variant})
    388 		k, l := 0, -1
    389 		for i, v := range varID {
    390 			w := int(v)
    391 			if l == w {
    392 				// Remove duplicates.
    393 				continue
    394 			}
    395 			varID[k] = varID[i]
    396 			variant[k] = variant[i]
    397 			k++
    398 			l = w
    399 		}
    400 		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
    401 			end = start - 1
    402 		} else {
    403 			scan.resizeRange(start, end, len(str))
    404 			copy(scan.b[scan.start:], str)
    405 			end = scan.end
    406 		}
    407 	}
    408 	return end
    409 }
    410 
    411 type variantsSort struct {
    412 	i []uint8
    413 	v [][]byte
    414 }
    415 
    416 func (s variantsSort) Len() int {
    417 	return len(s.i)
    418 }
    419 
    420 func (s variantsSort) Swap(i, j int) {
    421 	s.i[i], s.i[j] = s.i[j], s.i[i]
    422 	s.v[i], s.v[j] = s.v[j], s.v[i]
    423 }
    424 
    425 func (s variantsSort) Less(i, j int) bool {
    426 	return s.i[i] < s.i[j]
    427 }
    428 
    429 type bytesSort struct {
    430 	b [][]byte
    431 	n int // first n bytes to compare
    432 }
    433 
    434 func (b bytesSort) Len() int {
    435 	return len(b.b)
    436 }
    437 
    438 func (b bytesSort) Swap(i, j int) {
    439 	b.b[i], b.b[j] = b.b[j], b.b[i]
    440 }
    441 
    442 func (b bytesSort) Less(i, j int) bool {
    443 	for k := 0; k < b.n; k++ {
    444 		if b.b[i][k] == b.b[j][k] {
    445 			continue
    446 		}
    447 		return b.b[i][k] < b.b[j][k]
    448 	}
    449 	return false
    450 }
    451 
    452 // parseExtensions parses and normalizes the extensions in the buffer.
    453 // It returns the last position of scan.b that is part of any extension.
    454 // It also trims scan.b to remove excess parts accordingly.
    455 func parseExtensions(scan *scanner) int {
    456 	start := scan.start
    457 	exts := [][]byte{}
    458 	private := []byte{}
    459 	end := scan.end
    460 	for len(scan.token) == 1 {
    461 		extStart := scan.start
    462 		ext := scan.token[0]
    463 		end = parseExtension(scan)
    464 		extension := scan.b[extStart:end]
    465 		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
    466 			scan.setError(ErrSyntax)
    467 			end = extStart
    468 			continue
    469 		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
    470 			scan.b = scan.b[:end]
    471 			return end
    472 		} else if ext == 'x' {
    473 			private = extension
    474 			break
    475 		}
    476 		exts = append(exts, extension)
    477 	}
    478 	sort.Sort(bytesSort{exts, 1})
    479 	if len(private) > 0 {
    480 		exts = append(exts, private)
    481 	}
    482 	scan.b = scan.b[:start]
    483 	if len(exts) > 0 {
    484 		scan.b = append(scan.b, bytes.Join(exts, separator)...)
    485 	} else if start > 0 {
    486 		// Strip trailing '-'.
    487 		scan.b = scan.b[:start-1]
    488 	}
    489 	return end
    490 }
    491 
    492 // parseExtension parses a single extension and returns the position of
    493 // the extension end.
    494 func parseExtension(scan *scanner) int {
    495 	start, end := scan.start, scan.end
    496 	switch scan.token[0] {
    497 	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
    498 		attrStart := end
    499 		scan.scan()
    500 		for last := []byte{}; len(scan.token) > 2; scan.scan() {
    501 			if bytes.Compare(scan.token, last) != -1 {
    502 				// Attributes are unsorted. Start over from scratch.
    503 				p := attrStart + 1
    504 				scan.next = p
    505 				attrs := [][]byte{}
    506 				for scan.scan(); len(scan.token) > 2; scan.scan() {
    507 					attrs = append(attrs, scan.token)
    508 					end = scan.end
    509 				}
    510 				sort.Sort(bytesSort{attrs, 3})
    511 				copy(scan.b[p:], bytes.Join(attrs, separator))
    512 				break
    513 			}
    514 			last = scan.token
    515 			end = scan.end
    516 		}
    517 		// Scan key-type sequences. A key is of length 2 and may be followed
    518 		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
    519 		var last, key []byte
    520 		for attrEnd := end; len(scan.token) == 2; last = key {
    521 			key = scan.token
    522 			end = scan.end
    523 			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
    524 				end = scan.end
    525 			}
    526 			// TODO: check key value validity
    527 			if bytes.Compare(key, last) != 1 || scan.err != nil {
    528 				// We have an invalid key or the keys are not sorted.
    529 				// Start scanning keys from scratch and reorder.
    530 				p := attrEnd + 1
    531 				scan.next = p
    532 				keys := [][]byte{}
    533 				for scan.scan(); len(scan.token) == 2; {
    534 					keyStart := scan.start
    535 					end = scan.end
    536 					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
    537 						end = scan.end
    538 					}
    539 					keys = append(keys, scan.b[keyStart:end])
    540 				}
    541 				sort.Stable(bytesSort{keys, 2})
    542 				if n := len(keys); n > 0 {
    543 					k := 0
    544 					for i := 1; i < n; i++ {
    545 						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
    546 							k++
    547 							keys[k] = keys[i]
    548 						} else if !bytes.Equal(keys[k], keys[i]) {
    549 							scan.setError(ErrDuplicateKey)
    550 						}
    551 					}
    552 					keys = keys[:k+1]
    553 				}
    554 				reordered := bytes.Join(keys, separator)
    555 				if e := p + len(reordered); e < end {
    556 					scan.deleteRange(e, end)
    557 					end = e
    558 				}
    559 				copy(scan.b[p:], reordered)
    560 				break
    561 			}
    562 		}
    563 	case 't': // https://www.ietf.org/rfc/rfc6497.txt
    564 		scan.scan()
    565 		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
    566 			_, end = parseTag(scan, false)
    567 			scan.toLower(start, end)
    568 		}
    569 		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
    570 			end = scan.acceptMinSize(3)
    571 		}
    572 	case 'x':
    573 		end = scan.acceptMinSize(1)
    574 	default:
    575 		end = scan.acceptMinSize(2)
    576 	}
    577 	return end
    578 }
    579 
    580 // getExtension returns the name, body and end position of the extension.
    581 func getExtension(s string, p int) (end int, ext string) {
    582 	if s[p] == '-' {
    583 		p++
    584 	}
    585 	if s[p] == 'x' {
    586 		return len(s), s[p:]
    587 	}
    588 	end = nextExtension(s, p)
    589 	return end, s[p:end]
    590 }
    591 
    592 // nextExtension finds the next extension within the string, searching
    593 // for the -<char>- pattern from position p.
    594 // In the fast majority of cases, language tags will have at most
    595 // one extension and extensions tend to be small.
    596 func nextExtension(s string, p int) int {
    597 	for n := len(s) - 3; p < n; {
    598 		if s[p] == '-' {
    599 			if s[p+2] == '-' {
    600 				return p
    601 			}
    602 			p += 3
    603 		} else {
    604 			p++
    605 		}
    606 	}
    607 	return len(s)
    608 }