lookup.go (12387B)
1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package language 6 7 import ( 8 "bytes" 9 "fmt" 10 "sort" 11 "strconv" 12 13 "golang.org/x/text/internal/tag" 14 ) 15 16 // findIndex tries to find the given tag in idx and returns a standardized error 17 // if it could not be found. 18 func findIndex(idx tag.Index, key []byte, form string) (index int, err error) { 19 if !tag.FixCase(form, key) { 20 return 0, ErrSyntax 21 } 22 i := idx.Index(key) 23 if i == -1 { 24 return 0, NewValueError(key) 25 } 26 return i, nil 27 } 28 29 func searchUint(imap []uint16, key uint16) int { 30 return sort.Search(len(imap), func(i int) bool { 31 return imap[i] >= key 32 }) 33 } 34 35 type Language uint16 36 37 // getLangID returns the langID of s if s is a canonical subtag 38 // or langUnknown if s is not a canonical subtag. 39 func getLangID(s []byte) (Language, error) { 40 if len(s) == 2 { 41 return getLangISO2(s) 42 } 43 return getLangISO3(s) 44 } 45 46 // TODO language normalization as well as the AliasMaps could be moved to the 47 // higher level package, but it is a bit tricky to separate the generation. 48 49 func (id Language) Canonicalize() (Language, AliasType) { 50 return normLang(id) 51 } 52 53 // normLang returns the mapped langID of id according to mapping m. 54 func normLang(id Language) (Language, AliasType) { 55 k := sort.Search(len(AliasMap), func(i int) bool { 56 return AliasMap[i].From >= uint16(id) 57 }) 58 if k < len(AliasMap) && AliasMap[k].From == uint16(id) { 59 return Language(AliasMap[k].To), AliasTypes[k] 60 } 61 return id, AliasTypeUnknown 62 } 63 64 // getLangISO2 returns the langID for the given 2-letter ISO language code 65 // or unknownLang if this does not exist. 66 func getLangISO2(s []byte) (Language, error) { 67 if !tag.FixCase("zz", s) { 68 return 0, ErrSyntax 69 } 70 if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 { 71 return Language(i), nil 72 } 73 return 0, NewValueError(s) 74 } 75 76 const base = 'z' - 'a' + 1 77 78 func strToInt(s []byte) uint { 79 v := uint(0) 80 for i := 0; i < len(s); i++ { 81 v *= base 82 v += uint(s[i] - 'a') 83 } 84 return v 85 } 86 87 // converts the given integer to the original ASCII string passed to strToInt. 88 // len(s) must match the number of characters obtained. 89 func intToStr(v uint, s []byte) { 90 for i := len(s) - 1; i >= 0; i-- { 91 s[i] = byte(v%base) + 'a' 92 v /= base 93 } 94 } 95 96 // getLangISO3 returns the langID for the given 3-letter ISO language code 97 // or unknownLang if this does not exist. 98 func getLangISO3(s []byte) (Language, error) { 99 if tag.FixCase("und", s) { 100 // first try to match canonical 3-letter entries 101 for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) { 102 if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] { 103 // We treat "und" as special and always translate it to "unspecified". 104 // Note that ZZ and Zzzz are private use and are not treated as 105 // unspecified by default. 106 id := Language(i) 107 if id == nonCanonicalUnd { 108 return 0, nil 109 } 110 return id, nil 111 } 112 } 113 if i := altLangISO3.Index(s); i != -1 { 114 return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil 115 } 116 n := strToInt(s) 117 if langNoIndex[n/8]&(1<<(n%8)) != 0 { 118 return Language(n) + langNoIndexOffset, nil 119 } 120 // Check for non-canonical uses of ISO3. 121 for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) { 122 if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] { 123 return Language(i), nil 124 } 125 } 126 return 0, NewValueError(s) 127 } 128 return 0, ErrSyntax 129 } 130 131 // StringToBuf writes the string to b and returns the number of bytes 132 // written. cap(b) must be >= 3. 133 func (id Language) StringToBuf(b []byte) int { 134 if id >= langNoIndexOffset { 135 intToStr(uint(id)-langNoIndexOffset, b[:3]) 136 return 3 137 } else if id == 0 { 138 return copy(b, "und") 139 } 140 l := lang[id<<2:] 141 if l[3] == 0 { 142 return copy(b, l[:3]) 143 } 144 return copy(b, l[:2]) 145 } 146 147 // String returns the BCP 47 representation of the langID. 148 // Use b as variable name, instead of id, to ensure the variable 149 // used is consistent with that of Base in which this type is embedded. 150 func (b Language) String() string { 151 if b == 0 { 152 return "und" 153 } else if b >= langNoIndexOffset { 154 b -= langNoIndexOffset 155 buf := [3]byte{} 156 intToStr(uint(b), buf[:]) 157 return string(buf[:]) 158 } 159 l := lang.Elem(int(b)) 160 if l[3] == 0 { 161 return l[:3] 162 } 163 return l[:2] 164 } 165 166 // ISO3 returns the ISO 639-3 language code. 167 func (b Language) ISO3() string { 168 if b == 0 || b >= langNoIndexOffset { 169 return b.String() 170 } 171 l := lang.Elem(int(b)) 172 if l[3] == 0 { 173 return l[:3] 174 } else if l[2] == 0 { 175 return altLangISO3.Elem(int(l[3]))[:3] 176 } 177 // This allocation will only happen for 3-letter ISO codes 178 // that are non-canonical BCP 47 language identifiers. 179 return l[0:1] + l[2:4] 180 } 181 182 // IsPrivateUse reports whether this language code is reserved for private use. 183 func (b Language) IsPrivateUse() bool { 184 return langPrivateStart <= b && b <= langPrivateEnd 185 } 186 187 // SuppressScript returns the script marked as SuppressScript in the IANA 188 // language tag repository, or 0 if there is no such script. 189 func (b Language) SuppressScript() Script { 190 if b < langNoIndexOffset { 191 return Script(suppressScript[b]) 192 } 193 return 0 194 } 195 196 type Region uint16 197 198 // getRegionID returns the region id for s if s is a valid 2-letter region code 199 // or unknownRegion. 200 func getRegionID(s []byte) (Region, error) { 201 if len(s) == 3 { 202 if isAlpha(s[0]) { 203 return getRegionISO3(s) 204 } 205 if i, err := strconv.ParseUint(string(s), 10, 10); err == nil { 206 return getRegionM49(int(i)) 207 } 208 } 209 return getRegionISO2(s) 210 } 211 212 // getRegionISO2 returns the regionID for the given 2-letter ISO country code 213 // or unknownRegion if this does not exist. 214 func getRegionISO2(s []byte) (Region, error) { 215 i, err := findIndex(regionISO, s, "ZZ") 216 if err != nil { 217 return 0, err 218 } 219 return Region(i) + isoRegionOffset, nil 220 } 221 222 // getRegionISO3 returns the regionID for the given 3-letter ISO country code 223 // or unknownRegion if this does not exist. 224 func getRegionISO3(s []byte) (Region, error) { 225 if tag.FixCase("ZZZ", s) { 226 for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) { 227 if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] { 228 return Region(i) + isoRegionOffset, nil 229 } 230 } 231 for i := 0; i < len(altRegionISO3); i += 3 { 232 if tag.Compare(altRegionISO3[i:i+3], s) == 0 { 233 return Region(altRegionIDs[i/3]), nil 234 } 235 } 236 return 0, NewValueError(s) 237 } 238 return 0, ErrSyntax 239 } 240 241 func getRegionM49(n int) (Region, error) { 242 if 0 < n && n <= 999 { 243 const ( 244 searchBits = 7 245 regionBits = 9 246 regionMask = 1<<regionBits - 1 247 ) 248 idx := n >> searchBits 249 buf := fromM49[m49Index[idx]:m49Index[idx+1]] 250 val := uint16(n) << regionBits // we rely on bits shifting out 251 i := sort.Search(len(buf), func(i int) bool { 252 return buf[i] >= val 253 }) 254 if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val { 255 return Region(r & regionMask), nil 256 } 257 } 258 var e ValueError 259 fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n) 260 return 0, e 261 } 262 263 // normRegion returns a region if r is deprecated or 0 otherwise. 264 // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ). 265 // TODO: consider mapping split up regions to new most populous one (like CLDR). 266 func normRegion(r Region) Region { 267 m := regionOldMap 268 k := sort.Search(len(m), func(i int) bool { 269 return m[i].From >= uint16(r) 270 }) 271 if k < len(m) && m[k].From == uint16(r) { 272 return Region(m[k].To) 273 } 274 return 0 275 } 276 277 const ( 278 iso3166UserAssigned = 1 << iota 279 ccTLD 280 bcp47Region 281 ) 282 283 func (r Region) typ() byte { 284 return regionTypes[r] 285 } 286 287 // String returns the BCP 47 representation for the region. 288 // It returns "ZZ" for an unspecified region. 289 func (r Region) String() string { 290 if r < isoRegionOffset { 291 if r == 0 { 292 return "ZZ" 293 } 294 return fmt.Sprintf("%03d", r.M49()) 295 } 296 r -= isoRegionOffset 297 return regionISO.Elem(int(r))[:2] 298 } 299 300 // ISO3 returns the 3-letter ISO code of r. 301 // Note that not all regions have a 3-letter ISO code. 302 // In such cases this method returns "ZZZ". 303 func (r Region) ISO3() string { 304 if r < isoRegionOffset { 305 return "ZZZ" 306 } 307 r -= isoRegionOffset 308 reg := regionISO.Elem(int(r)) 309 switch reg[2] { 310 case 0: 311 return altRegionISO3[reg[3]:][:3] 312 case ' ': 313 return "ZZZ" 314 } 315 return reg[0:1] + reg[2:4] 316 } 317 318 // M49 returns the UN M.49 encoding of r, or 0 if this encoding 319 // is not defined for r. 320 func (r Region) M49() int { 321 return int(m49[r]) 322 } 323 324 // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This 325 // may include private-use tags that are assigned by CLDR and used in this 326 // implementation. So IsPrivateUse and IsCountry can be simultaneously true. 327 func (r Region) IsPrivateUse() bool { 328 return r.typ()&iso3166UserAssigned != 0 329 } 330 331 type Script uint16 332 333 // getScriptID returns the script id for string s. It assumes that s 334 // is of the format [A-Z][a-z]{3}. 335 func getScriptID(idx tag.Index, s []byte) (Script, error) { 336 i, err := findIndex(idx, s, "Zzzz") 337 return Script(i), err 338 } 339 340 // String returns the script code in title case. 341 // It returns "Zzzz" for an unspecified script. 342 func (s Script) String() string { 343 if s == 0 { 344 return "Zzzz" 345 } 346 return script.Elem(int(s)) 347 } 348 349 // IsPrivateUse reports whether this script code is reserved for private use. 350 func (s Script) IsPrivateUse() bool { 351 return _Qaaa <= s && s <= _Qabx 352 } 353 354 const ( 355 maxAltTaglen = len("en-US-POSIX") 356 maxLen = maxAltTaglen 357 ) 358 359 var ( 360 // grandfatheredMap holds a mapping from legacy and grandfathered tags to 361 // their base language or index to more elaborate tag. 362 grandfatheredMap = map[[maxLen]byte]int16{ 363 [maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban 364 [maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami 365 [maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn 366 [maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak 367 [maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon 368 [maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux 369 [maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo 370 [maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn 371 [maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao 372 [maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay 373 [maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu 374 [maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok 375 [maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn 376 [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR 377 [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL 378 [maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE 379 [maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu 380 [maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka 381 [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan 382 [maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang 383 384 // Grandfathered tags with no modern replacement will be converted as 385 // follows: 386 [maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish 387 [maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed 388 [maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default 389 [maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian 390 [maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo 391 [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min 392 393 // CLDR-specific tag. 394 [maxLen]byte{'r', 'o', 'o', 't'}: 0, // root 395 [maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX" 396 } 397 398 altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102} 399 400 altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix" 401 ) 402 403 func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) { 404 if v, ok := grandfatheredMap[s]; ok { 405 if v < 0 { 406 return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true 407 } 408 t.LangID = Language(v) 409 return t, true 410 } 411 return t, false 412 }