gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

lookup.go (12387B)


      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package language
      6 
      7 import (
      8 	"bytes"
      9 	"fmt"
     10 	"sort"
     11 	"strconv"
     12 
     13 	"golang.org/x/text/internal/tag"
     14 )
     15 
     16 // findIndex tries to find the given tag in idx and returns a standardized error
     17 // if it could not be found.
     18 func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
     19 	if !tag.FixCase(form, key) {
     20 		return 0, ErrSyntax
     21 	}
     22 	i := idx.Index(key)
     23 	if i == -1 {
     24 		return 0, NewValueError(key)
     25 	}
     26 	return i, nil
     27 }
     28 
     29 func searchUint(imap []uint16, key uint16) int {
     30 	return sort.Search(len(imap), func(i int) bool {
     31 		return imap[i] >= key
     32 	})
     33 }
     34 
// Language is a compact numeric identifier for a language subtag.
// The zero value stands for the undetermined language "und"; values at or
// above langNoIndexOffset encode a 3-letter code directly (see String).
type Language uint16
     36 
     37 // getLangID returns the langID of s if s is a canonical subtag
     38 // or langUnknown if s is not a canonical subtag.
     39 func getLangID(s []byte) (Language, error) {
     40 	if len(s) == 2 {
     41 		return getLangISO2(s)
     42 	}
     43 	return getLangISO3(s)
     44 }
     45 
     46 // TODO language normalization as well as the AliasMaps could be moved to the
     47 // higher level package, but it is a bit tricky to separate the generation.
     48 
     49 func (id Language) Canonicalize() (Language, AliasType) {
     50 	return normLang(id)
     51 }
     52 
     53 // normLang returns the mapped langID of id according to mapping m.
     54 func normLang(id Language) (Language, AliasType) {
     55 	k := sort.Search(len(AliasMap), func(i int) bool {
     56 		return AliasMap[i].From >= uint16(id)
     57 	})
     58 	if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
     59 		return Language(AliasMap[k].To), AliasTypes[k]
     60 	}
     61 	return id, AliasTypeUnknown
     62 }
     63 
     64 // getLangISO2 returns the langID for the given 2-letter ISO language code
     65 // or unknownLang if this does not exist.
     66 func getLangISO2(s []byte) (Language, error) {
     67 	if !tag.FixCase("zz", s) {
     68 		return 0, ErrSyntax
     69 	}
     70 	if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
     71 		return Language(i), nil
     72 	}
     73 	return 0, NewValueError(s)
     74 }
     75 
     76 const base = 'z' - 'a' + 1
     77 
     78 func strToInt(s []byte) uint {
     79 	v := uint(0)
     80 	for i := 0; i < len(s); i++ {
     81 		v *= base
     82 		v += uint(s[i] - 'a')
     83 	}
     84 	return v
     85 }
     86 
     87 // converts the given integer to the original ASCII string passed to strToInt.
     88 // len(s) must match the number of characters obtained.
     89 func intToStr(v uint, s []byte) {
     90 	for i := len(s) - 1; i >= 0; i-- {
     91 		s[i] = byte(v%base) + 'a'
     92 		v /= base
     93 	}
     94 }
     95 
     96 // getLangISO3 returns the langID for the given 3-letter ISO language code
     97 // or unknownLang if this does not exist.
     98 func getLangISO3(s []byte) (Language, error) {
     99 	if tag.FixCase("und", s) {
    100 		// first try to match canonical 3-letter entries
    101 		for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
    102 			if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
    103 				// We treat "und" as special and always translate it to "unspecified".
    104 				// Note that ZZ and Zzzz are private use and are not treated as
    105 				// unspecified by default.
    106 				id := Language(i)
    107 				if id == nonCanonicalUnd {
    108 					return 0, nil
    109 				}
    110 				return id, nil
    111 			}
    112 		}
    113 		if i := altLangISO3.Index(s); i != -1 {
    114 			return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
    115 		}
    116 		n := strToInt(s)
    117 		if langNoIndex[n/8]&(1<<(n%8)) != 0 {
    118 			return Language(n) + langNoIndexOffset, nil
    119 		}
    120 		// Check for non-canonical uses of ISO3.
    121 		for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
    122 			if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
    123 				return Language(i), nil
    124 			}
    125 		}
    126 		return 0, NewValueError(s)
    127 	}
    128 	return 0, ErrSyntax
    129 }
    130 
    131 // StringToBuf writes the string to b and returns the number of bytes
    132 // written.  cap(b) must be >= 3.
    133 func (id Language) StringToBuf(b []byte) int {
    134 	if id >= langNoIndexOffset {
    135 		intToStr(uint(id)-langNoIndexOffset, b[:3])
    136 		return 3
    137 	} else if id == 0 {
    138 		return copy(b, "und")
    139 	}
    140 	l := lang[id<<2:]
    141 	if l[3] == 0 {
    142 		return copy(b, l[:3])
    143 	}
    144 	return copy(b, l[:2])
    145 }
    146 
    147 // String returns the BCP 47 representation of the langID.
    148 // Use b as variable name, instead of id, to ensure the variable
    149 // used is consistent with that of Base in which this type is embedded.
    150 func (b Language) String() string {
    151 	if b == 0 {
    152 		return "und"
    153 	} else if b >= langNoIndexOffset {
    154 		b -= langNoIndexOffset
    155 		buf := [3]byte{}
    156 		intToStr(uint(b), buf[:])
    157 		return string(buf[:])
    158 	}
    159 	l := lang.Elem(int(b))
    160 	if l[3] == 0 {
    161 		return l[:3]
    162 	}
    163 	return l[:2]
    164 }
    165 
    166 // ISO3 returns the ISO 639-3 language code.
    167 func (b Language) ISO3() string {
    168 	if b == 0 || b >= langNoIndexOffset {
    169 		return b.String()
    170 	}
    171 	l := lang.Elem(int(b))
    172 	if l[3] == 0 {
    173 		return l[:3]
    174 	} else if l[2] == 0 {
    175 		return altLangISO3.Elem(int(l[3]))[:3]
    176 	}
    177 	// This allocation will only happen for 3-letter ISO codes
    178 	// that are non-canonical BCP 47 language identifiers.
    179 	return l[0:1] + l[2:4]
    180 }
    181 
    182 // IsPrivateUse reports whether this language code is reserved for private use.
    183 func (b Language) IsPrivateUse() bool {
    184 	return langPrivateStart <= b && b <= langPrivateEnd
    185 }
    186 
    187 // SuppressScript returns the script marked as SuppressScript in the IANA
    188 // language tag repository, or 0 if there is no such script.
    189 func (b Language) SuppressScript() Script {
    190 	if b < langNoIndexOffset {
    191 		return Script(suppressScript[b])
    192 	}
    193 	return 0
    194 }
    195 
// Region is a compact numeric identifier for a region subtag.
// The zero value stands for an unspecified region ("ZZ"); values at or above
// isoRegionOffset index the regionISO table (see String).
type Region uint16
    197 
    198 // getRegionID returns the region id for s if s is a valid 2-letter region code
    199 // or unknownRegion.
    200 func getRegionID(s []byte) (Region, error) {
    201 	if len(s) == 3 {
    202 		if isAlpha(s[0]) {
    203 			return getRegionISO3(s)
    204 		}
    205 		if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
    206 			return getRegionM49(int(i))
    207 		}
    208 	}
    209 	return getRegionISO2(s)
    210 }
    211 
    212 // getRegionISO2 returns the regionID for the given 2-letter ISO country code
    213 // or unknownRegion if this does not exist.
    214 func getRegionISO2(s []byte) (Region, error) {
    215 	i, err := findIndex(regionISO, s, "ZZ")
    216 	if err != nil {
    217 		return 0, err
    218 	}
    219 	return Region(i) + isoRegionOffset, nil
    220 }
    221 
    222 // getRegionISO3 returns the regionID for the given 3-letter ISO country code
    223 // or unknownRegion if this does not exist.
    224 func getRegionISO3(s []byte) (Region, error) {
    225 	if tag.FixCase("ZZZ", s) {
    226 		for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
    227 			if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
    228 				return Region(i) + isoRegionOffset, nil
    229 			}
    230 		}
    231 		for i := 0; i < len(altRegionISO3); i += 3 {
    232 			if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
    233 				return Region(altRegionIDs[i/3]), nil
    234 			}
    235 		}
    236 		return 0, NewValueError(s)
    237 	}
    238 	return 0, ErrSyntax
    239 }
    240 
    241 func getRegionM49(n int) (Region, error) {
    242 	if 0 < n && n <= 999 {
    243 		const (
    244 			searchBits = 7
    245 			regionBits = 9
    246 			regionMask = 1<<regionBits - 1
    247 		)
    248 		idx := n >> searchBits
    249 		buf := fromM49[m49Index[idx]:m49Index[idx+1]]
    250 		val := uint16(n) << regionBits // we rely on bits shifting out
    251 		i := sort.Search(len(buf), func(i int) bool {
    252 			return buf[i] >= val
    253 		})
    254 		if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
    255 			return Region(r & regionMask), nil
    256 		}
    257 	}
    258 	var e ValueError
    259 	fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
    260 	return 0, e
    261 }
    262 
    263 // normRegion returns a region if r is deprecated or 0 otherwise.
    264 // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
    265 // TODO: consider mapping split up regions to new most populous one (like CLDR).
    266 func normRegion(r Region) Region {
    267 	m := regionOldMap
    268 	k := sort.Search(len(m), func(i int) bool {
    269 		return m[i].From >= uint16(r)
    270 	})
    271 	if k < len(m) && m[k].From == uint16(r) {
    272 		return Region(m[k].To)
    273 	}
    274 	return 0
    275 }
    276 
// Bit flags for the per-region type byte returned by Region.typ.
const (
	// iso3166UserAssigned (value 1) is the bit tested by Region.IsPrivateUse:
	// the region has the ISO 3166 User-assigned status.
	iso3166UserAssigned = 1 << iota
	// ccTLD (value 2). NOTE(review): presumably marks regions that have a
	// country-code top-level domain; not used in this part of the file.
	ccTLD
	// bcp47Region (value 4). NOTE(review): presumably marks regions that are
	// valid BCP 47 region subtags; not used in this part of the file.
	bcp47Region
)
    282 
    283 func (r Region) typ() byte {
    284 	return regionTypes[r]
    285 }
    286 
    287 // String returns the BCP 47 representation for the region.
    288 // It returns "ZZ" for an unspecified region.
    289 func (r Region) String() string {
    290 	if r < isoRegionOffset {
    291 		if r == 0 {
    292 			return "ZZ"
    293 		}
    294 		return fmt.Sprintf("%03d", r.M49())
    295 	}
    296 	r -= isoRegionOffset
    297 	return regionISO.Elem(int(r))[:2]
    298 }
    299 
    300 // ISO3 returns the 3-letter ISO code of r.
    301 // Note that not all regions have a 3-letter ISO code.
    302 // In such cases this method returns "ZZZ".
    303 func (r Region) ISO3() string {
    304 	if r < isoRegionOffset {
    305 		return "ZZZ"
    306 	}
    307 	r -= isoRegionOffset
    308 	reg := regionISO.Elem(int(r))
    309 	switch reg[2] {
    310 	case 0:
    311 		return altRegionISO3[reg[3]:][:3]
    312 	case ' ':
    313 		return "ZZZ"
    314 	}
    315 	return reg[0:1] + reg[2:4]
    316 }
    317 
    318 // M49 returns the UN M.49 encoding of r, or 0 if this encoding
    319 // is not defined for r.
    320 func (r Region) M49() int {
    321 	return int(m49[r])
    322 }
    323 
    324 // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
    325 // may include private-use tags that are assigned by CLDR and used in this
    326 // implementation. So IsPrivateUse and IsCountry can be simultaneously true.
    327 func (r Region) IsPrivateUse() bool {
    328 	return r.typ()&iso3166UserAssigned != 0
    329 }
    330 
// Script is a compact numeric identifier for a script subtag.
// The zero value stands for an unspecified script ("Zzzz"); other values
// index the script table (see String).
type Script uint16
    332 
    333 // getScriptID returns the script id for string s. It assumes that s
    334 // is of the format [A-Z][a-z]{3}.
    335 func getScriptID(idx tag.Index, s []byte) (Script, error) {
    336 	i, err := findIndex(idx, s, "Zzzz")
    337 	return Script(i), err
    338 }
    339 
    340 // String returns the script code in title case.
    341 // It returns "Zzzz" for an unspecified script.
    342 func (s Script) String() string {
    343 	if s == 0 {
    344 		return "Zzzz"
    345 	}
    346 	return script.Elem(int(s))
    347 }
    348 
    349 // IsPrivateUse reports whether this script code is reserved for private use.
    350 func (s Script) IsPrivateUse() bool {
    351 	return _Qaaa <= s && s <= _Qabx
    352 }
    353 
const (
	// maxAltTaglen is the length in bytes of "en-US-POSIX" (11). It is the
	// size of the fixed-width array accepted by grandfathered.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the size of the fixed-width array keys of grandfatheredMap;
	// shorter tags are padded with zero bytes.
	maxLen       = maxAltTaglen
)
    358 
var (
	// grandfatheredMap holds a mapping from legacy and grandfathered tags to
	// their base language or index to more elaborate tag.
	//
	// Keys are the lower-cased tag, zero-padded to maxLen bytes. A
	// non-negative value is a language id. A negative value -k selects the
	// k-th replacement tag in altTags via altTagIndex (see grandfathered).
	grandfatheredMap = map[[maxLen]byte]int16{
		[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
		[maxLen]byte{'i', '-', 'a', 'm', 'i'}:                          _ami, // i-ami
		[maxLen]byte{'i', '-', 'b', 'n', 'n'}:                          _bnn, // i-bnn
		[maxLen]byte{'i', '-', 'h', 'a', 'k'}:                          _hak, // i-hak
		[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}:      _tlh, // i-klingon
		[maxLen]byte{'i', '-', 'l', 'u', 'x'}:                          _lb,  // i-lux
		[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}:           _nv,  // i-navajo
		[maxLen]byte{'i', '-', 'p', 'w', 'n'}:                          _pwn, // i-pwn
		[maxLen]byte{'i', '-', 't', 'a', 'o'}:                          _tao, // i-tao
		[maxLen]byte{'i', '-', 't', 'a', 'y'}:                          _tay, // i-tay
		[maxLen]byte{'i', '-', 't', 's', 'u'}:                          _tsu, // i-tsu
		[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}:                     _nb,  // no-bok
		[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}:                     _nn,  // no-nyn
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}:      _sfb, // sgn-BE-FR
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}:      _vgt, // sgn-BE-NL
		[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}:      _sgg, // sgn-CH-DE
		[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}:           _cmn, // zh-guoyu
		[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}:           _hak, // zh-hakka
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
		[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}:           _hsn, // zh-xiang

		// Grandfathered tags with no modern replacement will be converted as
		// follows:
		[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
		[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}:           -2, // en-GB-oed
		[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}:           -3, // i-default
		[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}:      -4, // i-enochian
		[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}:                     -5, // i-mingo
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}:                          -6, // zh-min

		// CLDR-specific tag.
		[maxLen]byte{'r', 'o', 'o', 't'}:                                    0,  // root
		[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX
	}

	// altTagIndex holds the boundaries of the replacement tags inside
	// altTags: replacement k (1-based) is altTags[altTagIndex[k-1]:altTagIndex[k]].
	altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}

	// altTags is the concatenation of the replacement tags selected by the
	// negative values of grandfatheredMap, in order -1, -2, ..., -7.
	altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
)
    402 
    403 func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
    404 	if v, ok := grandfatheredMap[s]; ok {
    405 		if v < 0 {
    406 			return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
    407 		}
    408 		t.LangID = Language(v)
    409 		return t, true
    410 	}
    411 	return t, false
    412 }