gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

language.go (17174B)


      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 //go:generate go run gen.go gen_common.go -output tables.go
      6 
      7 package language // import "golang.org/x/text/internal/language"
      8 
      9 // TODO: Remove above NOTE after:
     10 // - verifying that tables are dropped correctly (most notably matcher tables).
     11 
     12 import (
     13 	"errors"
     14 	"fmt"
     15 	"strings"
     16 )
     17 
const (
	// maxCoreSize is the maximum size of a BCP 47 tag without variants and
	// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
	// It sizes the stack buffers used by String and MarshalText.
	maxCoreSize = 12

	// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
	// is large enough to hold at least 99% of the BCP 47 tags.
	// It sizes the scratch buffer used by RemakeString.
	max99thPercentileSize = 32

	// maxSimpleUExtensionSize is the maximum size of a -u extension with one
	// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
	// Together with maxCoreSize it sizes the buffer used by SetTypeForKey.
	maxSimpleUExtensionSize = 14
)
     31 
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed. The zero value of Tag is Und.
type Tag struct {
	// TODO: the following fields have the form TagTypeID. This name is chosen
	// to allow refactoring the public package without conflicting with its
	// Base, Script, and Region methods. Once the transition is fully completed
	// the ID can be stripped from the name.

	LangID   Language
	RegionID Region
	// TODO: we will soon run out of positions for ScriptID. Idea: instead of
	// storing lang, region, and ScriptID codes, store only the compact index and
	// have a lookup table from this code to its expansion. This greatly speeds
	// up table lookup and speeds up common variant cases.
	// This will also immediately free up 3 extra bytes. Also, the pVariant
	// field can then be moved to the lookup table, as the compact index uniquely
	// determines the offset of a possible variant.
	ScriptID Script
	pVariant byte   // offset in str, includes preceding '-'
	pExt     uint16 // offset of first extension, includes preceding '-'

	// str is the string representation of the Tag. It will only be used if the
	// tag has variants or extensions; otherwise it is empty and the string
	// form is generated on demand from LangID, ScriptID and RegionID.
	str string
}
     58 
     59 // Make is a convenience wrapper for Parse that omits the error.
     60 // In case of an error, a sensible default is returned.
     61 func Make(s string) Tag {
     62 	t, _ := Parse(s)
     63 	return t
     64 }
     65 
     66 // Raw returns the raw base language, script and region, without making an
     67 // attempt to infer their values.
     68 // TODO: consider removing
     69 func (t Tag) Raw() (b Language, s Script, r Region) {
     70 	return t.LangID, t.ScriptID, t.RegionID
     71 }
     72 
     73 // equalTags compares language, script and region subtags only.
     74 func (t Tag) equalTags(a Tag) bool {
     75 	return t.LangID == a.LangID && t.ScriptID == a.ScriptID && t.RegionID == a.RegionID
     76 }
     77 
     78 // IsRoot returns true if t is equal to language "und".
     79 func (t Tag) IsRoot() bool {
     80 	if int(t.pVariant) < len(t.str) {
     81 		return false
     82 	}
     83 	return t.equalTags(Und)
     84 }
     85 
     86 // IsPrivateUse reports whether the Tag consists solely of an IsPrivateUse use
     87 // tag.
     88 func (t Tag) IsPrivateUse() bool {
     89 	return t.str != "" && t.pVariant == 0
     90 }
     91 
     92 // RemakeString is used to update t.str in case lang, script or region changed.
     93 // It is assumed that pExt and pVariant still point to the start of the
     94 // respective parts.
     95 func (t *Tag) RemakeString() {
     96 	if t.str == "" {
     97 		return
     98 	}
     99 	extra := t.str[t.pVariant:]
    100 	if t.pVariant > 0 {
    101 		extra = extra[1:]
    102 	}
    103 	if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {
    104 		t.str = extra
    105 		t.pVariant = 0
    106 		t.pExt = 0
    107 		return
    108 	}
    109 	var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
    110 	b := buf[:t.genCoreBytes(buf[:])]
    111 	if extra != "" {
    112 		diff := len(b) - int(t.pVariant)
    113 		b = append(b, '-')
    114 		b = append(b, extra...)
    115 		t.pVariant = uint8(int(t.pVariant) + diff)
    116 		t.pExt = uint16(int(t.pExt) + diff)
    117 	} else {
    118 		t.pVariant = uint8(len(b))
    119 		t.pExt = uint16(len(b))
    120 	}
    121 	t.str = string(b)
    122 }
    123 
    124 // genCoreBytes writes a string for the base languages, script and region tags
    125 // to the given buffer and returns the number of bytes written. It will never
    126 // write more than maxCoreSize bytes.
    127 func (t *Tag) genCoreBytes(buf []byte) int {
    128 	n := t.LangID.StringToBuf(buf[:])
    129 	if t.ScriptID != 0 {
    130 		n += copy(buf[n:], "-")
    131 		n += copy(buf[n:], t.ScriptID.String())
    132 	}
    133 	if t.RegionID != 0 {
    134 		n += copy(buf[n:], "-")
    135 		n += copy(buf[n:], t.RegionID.String())
    136 	}
    137 	return n
    138 }
    139 
    140 // String returns the canonical string representation of the language tag.
    141 func (t Tag) String() string {
    142 	if t.str != "" {
    143 		return t.str
    144 	}
    145 	if t.ScriptID == 0 && t.RegionID == 0 {
    146 		return t.LangID.String()
    147 	}
    148 	buf := [maxCoreSize]byte{}
    149 	return string(buf[:t.genCoreBytes(buf[:])])
    150 }
    151 
    152 // MarshalText implements encoding.TextMarshaler.
    153 func (t Tag) MarshalText() (text []byte, err error) {
    154 	if t.str != "" {
    155 		text = append(text, t.str...)
    156 	} else if t.ScriptID == 0 && t.RegionID == 0 {
    157 		text = append(text, t.LangID.String()...)
    158 	} else {
    159 		buf := [maxCoreSize]byte{}
    160 		text = buf[:t.genCoreBytes(buf[:])]
    161 	}
    162 	return text, nil
    163 }
    164 
    165 // UnmarshalText implements encoding.TextUnmarshaler.
    166 func (t *Tag) UnmarshalText(text []byte) error {
    167 	tag, err := Parse(string(text))
    168 	*t = tag
    169 	return err
    170 }
    171 
    172 // Variants returns the part of the tag holding all variants or the empty string
    173 // if there are no variants defined.
    174 func (t Tag) Variants() string {
    175 	if t.pVariant == 0 {
    176 		return ""
    177 	}
    178 	return t.str[t.pVariant:t.pExt]
    179 }
    180 
    181 // VariantOrPrivateUseTags returns variants or private use tags.
    182 func (t Tag) VariantOrPrivateUseTags() string {
    183 	if t.pExt > 0 {
    184 		return t.str[t.pVariant:t.pExt]
    185 	}
    186 	return t.str[t.pVariant:]
    187 }
    188 
    189 // HasString reports whether this tag defines more than just the raw
    190 // components.
    191 func (t Tag) HasString() bool {
    192 	return t.str != ""
    193 }
    194 
    195 // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
    196 // specific language are substituted with fields from the parent language.
    197 // The parent for a language may change for newer versions of CLDR.
    198 func (t Tag) Parent() Tag {
    199 	if t.str != "" {
    200 		// Strip the variants and extensions.
    201 		b, s, r := t.Raw()
    202 		t = Tag{LangID: b, ScriptID: s, RegionID: r}
    203 		if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {
    204 			base, _ := addTags(Tag{LangID: t.LangID})
    205 			if base.ScriptID == t.ScriptID {
    206 				return Tag{LangID: t.LangID}
    207 			}
    208 		}
    209 		return t
    210 	}
    211 	if t.LangID != 0 {
    212 		if t.RegionID != 0 {
    213 			maxScript := t.ScriptID
    214 			if maxScript == 0 {
    215 				max, _ := addTags(t)
    216 				maxScript = max.ScriptID
    217 			}
    218 
    219 			for i := range parents {
    220 				if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {
    221 					for _, r := range parents[i].fromRegion {
    222 						if Region(r) == t.RegionID {
    223 							return Tag{
    224 								LangID:   t.LangID,
    225 								ScriptID: Script(parents[i].script),
    226 								RegionID: Region(parents[i].toRegion),
    227 							}
    228 						}
    229 					}
    230 				}
    231 			}
    232 
    233 			// Strip the script if it is the default one.
    234 			base, _ := addTags(Tag{LangID: t.LangID})
    235 			if base.ScriptID != maxScript {
    236 				return Tag{LangID: t.LangID, ScriptID: maxScript}
    237 			}
    238 			return Tag{LangID: t.LangID}
    239 		} else if t.ScriptID != 0 {
    240 			// The parent for an base-script pair with a non-default script is
    241 			// "und" instead of the base language.
    242 			base, _ := addTags(Tag{LangID: t.LangID})
    243 			if base.ScriptID != t.ScriptID {
    244 				return Und
    245 			}
    246 			return Tag{LangID: t.LangID}
    247 		}
    248 	}
    249 	return Und
    250 }
    251 
    252 // ParseExtension parses s as an extension and returns it on success.
    253 func ParseExtension(s string) (ext string, err error) {
    254 	defer func() {
    255 		if recover() != nil {
    256 			ext = ""
    257 			err = ErrSyntax
    258 		}
    259 	}()
    260 
    261 	scan := makeScannerString(s)
    262 	var end int
    263 	if n := len(scan.token); n != 1 {
    264 		return "", ErrSyntax
    265 	}
    266 	scan.toLower(0, len(scan.b))
    267 	end = parseExtension(&scan)
    268 	if end != len(s) {
    269 		return "", ErrSyntax
    270 	}
    271 	return string(scan.b), nil
    272 }
    273 
    274 // HasVariants reports whether t has variants.
    275 func (t Tag) HasVariants() bool {
    276 	return uint16(t.pVariant) < t.pExt
    277 }
    278 
    279 // HasExtensions reports whether t has extensions.
    280 func (t Tag) HasExtensions() bool {
    281 	return int(t.pExt) < len(t.str)
    282 }
    283 
    284 // Extension returns the extension of type x for tag t. It will return
    285 // false for ok if t does not have the requested extension. The returned
    286 // extension will be invalid in this case.
    287 func (t Tag) Extension(x byte) (ext string, ok bool) {
    288 	for i := int(t.pExt); i < len(t.str)-1; {
    289 		var ext string
    290 		i, ext = getExtension(t.str, i)
    291 		if ext[0] == x {
    292 			return ext, true
    293 		}
    294 	}
    295 	return "", false
    296 }
    297 
    298 // Extensions returns all extensions of t.
    299 func (t Tag) Extensions() []string {
    300 	e := []string{}
    301 	for i := int(t.pExt); i < len(t.str)-1; {
    302 		var ext string
    303 		i, ext = getExtension(t.str, i)
    304 		e = append(e, ext)
    305 	}
    306 	return e
    307 }
    308 
    309 // TypeForKey returns the type associated with the given key, where key and type
    310 // are of the allowed values defined for the Unicode locale extension ('u') in
    311 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
    312 // TypeForKey will traverse the inheritance chain to get the correct value.
    313 //
    314 // If there are multiple types associated with a key, only the first will be
    315 // returned. If there is no type associated with a key, it returns the empty
    316 // string.
    317 func (t Tag) TypeForKey(key string) string {
    318 	if _, start, end, _ := t.findTypeForKey(key); end != start {
    319 		s := t.str[start:end]
    320 		if p := strings.IndexByte(s, '-'); p >= 0 {
    321 			s = s[:p]
    322 		}
    323 		return s
    324 	}
    325 	return ""
    326 }
    327 
// Errors returned by SetTypeForKey.
var (
	// errPrivateUse is returned when the receiver is a private use tag.
	errPrivateUse = errors.New("cannot set a key on a private use tag")
	// errInvalidArguments is returned when the key is not 2 bytes long or a
	// non-empty value is not between 3 and 8 bytes long.
	errInvalidArguments = errors.New("invalid key or type")
)
    332 
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse if t is a private use tag, errInvalidArguments if
// key is not 2 bytes long or a non-empty value is not 3 to 8 bytes long, and
// the scanner's error if the generated key-type pair does not parse as an
// extension. In every error case t is returned unchanged.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.IsPrivateUse() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}

	// Remove the setting if value is "".
	if value == "" {
		start, sep, end, _ := t.findTypeForKey(key)
		// start != sep means the key was found; otherwise there is nothing
		// to remove.
		if start != sep {
			// Remove a possible empty extension.
			switch {
			case t.str[start-2] != '-': // has previous elements.
			case end == len(t.str), // end of string
				end+2 < len(t.str) && t.str[end+2] == '-': // end of extension
				// The key was the only one in the -u extension, so also
				// drop the two bytes preceding it (presumably "-u").
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing but the core subtags remains; drop the string form.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}

	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}

	var (
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)

	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}

	// Create new key-type pair and parse it to verify.
	// b is laid out as "u-" + key + "-" + value.
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}

	// Assemble the replacement string.
	if t.str == "" {
		// buf already holds core + "-" + b; both offsets point at that '-'.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, sep, end, hasExt := t.findTypeForKey(key)
		if start == sep {
			// Key not present: insert at the reported position. If a -u
			// extension already exists, only "key-value" is inserted.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:sep], b, s[end:])
		} else {
			// Key present: keep everything through the 2-letter key
			// (start is the '-' before it) and replace its type.
			t.str = fmt.Sprintf("%s-%s%s", s[:start+3], value, s[end:])
		}
	}
	return t, nil
}
    411 
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
//
// When the key is found, start is the offset of the '-' preceding the key,
// sep is the offset of the first byte of its type, and end is the offset just
// past its type(s). When the key is not found, start, sep and end are all
// equal to the insertion point.
func (t Tag) findTypeForKey(key string) (start, sep, end int, hasExt bool) {
	p := int(t.pExt)
	// No well-formed key, or no extensions at all.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, p, false
	}
	s := t.str

	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		// A singleton greater than 'u' means no -u extension follows
		// (presumably extensions are stored in sorted order); insert before it.
		if s[p] > 'u' {
			p--
			return p, p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++

	// curKey is the key currently being processed.
	curKey := ""

	// Iterate over keys until we get the end of a section.
	for {
		// end marks the '-' (or end of string) before the subtag scanned next.
		end = p
		for p++; p < len(s) && s[p] != '-'; p++ {
		}
		// n is the length of the subtag between end and p.
		n := p - end - 1
		// A subtag of at most 2 bytes ends the types of the current key.
		if n <= 2 && curKey == key {
			if sep < end {
				// Step over the '-' so that sep points at the type itself.
				sep++
			}
			return start, sep, end, true
		}
		switch n {
		case 0, // invalid string
			1: // next extension
			return end, end, end, true
		case 2:
			// next key
			curKey = s[end+1 : p]
			// Keys beyond the requested one: key is absent, insert here.
			if curKey > key {
				return end, end, end, true
			}
			start = end
			sep = p
		}
	}
}
    467 
    468 // ParseBase parses a 2- or 3-letter ISO 639 code.
    469 // It returns a ValueError if s is a well-formed but unknown language identifier
    470 // or another error if another error occurred.
    471 func ParseBase(s string) (l Language, err error) {
    472 	defer func() {
    473 		if recover() != nil {
    474 			l = 0
    475 			err = ErrSyntax
    476 		}
    477 	}()
    478 
    479 	if n := len(s); n < 2 || 3 < n {
    480 		return 0, ErrSyntax
    481 	}
    482 	var buf [3]byte
    483 	return getLangID(buf[:copy(buf[:], s)])
    484 }
    485 
    486 // ParseScript parses a 4-letter ISO 15924 code.
    487 // It returns a ValueError if s is a well-formed but unknown script identifier
    488 // or another error if another error occurred.
    489 func ParseScript(s string) (scr Script, err error) {
    490 	defer func() {
    491 		if recover() != nil {
    492 			scr = 0
    493 			err = ErrSyntax
    494 		}
    495 	}()
    496 
    497 	if len(s) != 4 {
    498 		return 0, ErrSyntax
    499 	}
    500 	var buf [4]byte
    501 	return getScriptID(script, buf[:copy(buf[:], s)])
    502 }
    503 
    504 // EncodeM49 returns the Region for the given UN M.49 code.
    505 // It returns an error if r is not a valid code.
    506 func EncodeM49(r int) (Region, error) {
    507 	return getRegionM49(r)
    508 }
    509 
    510 // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
    511 // It returns a ValueError if s is a well-formed but unknown region identifier
    512 // or another error if another error occurred.
    513 func ParseRegion(s string) (r Region, err error) {
    514 	defer func() {
    515 		if recover() != nil {
    516 			r = 0
    517 			err = ErrSyntax
    518 		}
    519 	}()
    520 
    521 	if n := len(s); n < 2 || 3 < n {
    522 		return 0, ErrSyntax
    523 	}
    524 	var buf [3]byte
    525 	return getRegionID(buf[:copy(buf[:], s)])
    526 }
    527 
    528 // IsCountry returns whether this region is a country or autonomous area. This
    529 // includes non-standard definitions from CLDR.
    530 func (r Region) IsCountry() bool {
    531 	if r == 0 || r.IsGroup() || r.IsPrivateUse() && r != _XK {
    532 		return false
    533 	}
    534 	return true
    535 }
    536 
    537 // IsGroup returns whether this region defines a collection of regions. This
    538 // includes non-standard definitions from CLDR.
    539 func (r Region) IsGroup() bool {
    540 	if r == 0 {
    541 		return false
    542 	}
    543 	return int(regionInclusion[r]) < len(regionContainment)
    544 }
    545 
    546 // Contains returns whether Region c is contained by Region r. It returns true
    547 // if c == r.
    548 func (r Region) Contains(c Region) bool {
    549 	if r == c {
    550 		return true
    551 	}
    552 	g := regionInclusion[r]
    553 	if g >= nRegionGroups {
    554 		return false
    555 	}
    556 	m := regionContainment[g]
    557 
    558 	d := regionInclusion[c]
    559 	b := regionInclusionBits[d]
    560 
    561 	// A contained country may belong to multiple disjoint groups. Matching any
    562 	// of these indicates containment. If the contained region is a group, it
    563 	// must strictly be a subset.
    564 	if d >= nRegionGroups {
    565 		return b&m != 0
    566 	}
    567 	return b&^m == 0
    568 }
    569 
    570 var errNoTLD = errors.New("language: region is not a valid ccTLD")
    571 
    572 // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
    573 // In all other cases it returns either the region itself or an error.
    574 //
    575 // This method may return an error for a region for which there exists a
    576 // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
    577 // region will already be canonicalized it was obtained from a Tag that was
    578 // obtained using any of the default methods.
    579 func (r Region) TLD() (Region, error) {
    580 	// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
    581 	// difference between ISO 3166-1 and IANA ccTLD.
    582 	if r == _GB {
    583 		r = _UK
    584 	}
    585 	if (r.typ() & ccTLD) == 0 {
    586 		return 0, errNoTLD
    587 	}
    588 	return r, nil
    589 }
    590 
    591 // Canonicalize returns the region or a possible replacement if the region is
    592 // deprecated. It will not return a replacement for deprecated regions that
    593 // are split into multiple regions.
    594 func (r Region) Canonicalize() Region {
    595 	if cr := normRegion(r); cr != 0 {
    596 		return cr
    597 	}
    598 	return r
    599 }
    600 
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
	// ID is the value stored for the variant in variantIndex.
	ID uint8
	// str is the lower-cased variant string; it is what String returns.
	str string
}
    606 
    607 // ParseVariant parses and returns a Variant. An error is returned if s is not
    608 // a valid variant.
    609 func ParseVariant(s string) (v Variant, err error) {
    610 	defer func() {
    611 		if recover() != nil {
    612 			v = Variant{}
    613 			err = ErrSyntax
    614 		}
    615 	}()
    616 
    617 	s = strings.ToLower(s)
    618 	if id, ok := variantIndex[s]; ok {
    619 		return Variant{id, s}, nil
    620 	}
    621 	return Variant{}, NewValueError([]byte(s))
    622 }
    623 
    624 // String returns the string representation of the variant.
    625 func (v Variant) String() string {
    626 	return v.str
    627 }