parse.go (7695B)
1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package language 6 7 import ( 8 "errors" 9 "sort" 10 "strconv" 11 "strings" 12 13 "golang.org/x/text/internal/language" 14 ) 15 16 // ValueError is returned by any of the parsing functions when the 17 // input is well-formed but the respective subtag is not recognized 18 // as a valid value. 19 type ValueError interface { 20 error 21 22 // Subtag returns the subtag for which the error occurred. 23 Subtag() string 24 } 25 26 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing 27 // failed it returns an error and any part of the tag that could be parsed. 28 // If parsing succeeded but an unknown value was found, it returns 29 // ValueError. The Tag returned in this case is just stripped of the unknown 30 // value. All other values are preserved. It accepts tags in the BCP 47 format 31 // and extensions to this standard defined in 32 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. 33 // The resulting tag is canonicalized using the default canonicalization type. 34 func Parse(s string) (t Tag, err error) { 35 return Default.Parse(s) 36 } 37 38 // Parse parses the given BCP 47 string and returns a valid Tag. If parsing 39 // failed it returns an error and any part of the tag that could be parsed. 40 // If parsing succeeded but an unknown value was found, it returns 41 // ValueError. The Tag returned in this case is just stripped of the unknown 42 // value. All other values are preserved. It accepts tags in the BCP 47 format 43 // and extensions to this standard defined in 44 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. 45 // The resulting tag is canonicalized using the canonicalization type c. 46 func (c CanonType) Parse(s string) (t Tag, err error) { 47 defer func() { 48 if recover() != nil { 49 t = Tag{} 50 err = language.ErrSyntax 51 } 52 }() 53 54 tt, err := language.Parse(s) 55 if err != nil { 56 return makeTag(tt), err 57 } 58 tt, changed := canonicalize(c, tt) 59 if changed { 60 tt.RemakeString() 61 } 62 return makeTag(tt), err 63 } 64 65 // Compose creates a Tag from individual parts, which may be of type Tag, Base, 66 // Script, Region, Variant, []Variant, Extension, []Extension or error. If a 67 // Base, Script or Region or slice of type Variant or Extension is passed more 68 // than once, the latter will overwrite the former. Variants and Extensions are 69 // accumulated, but if two extensions of the same type are passed, the latter 70 // will replace the former. For -u extensions, though, the key-type pairs are 71 // added, where later values overwrite older ones. A Tag overwrites all former 72 // values and typically only makes sense as the first argument. The resulting 73 // tag is returned after canonicalizing using the Default CanonType. If one or 74 // more errors are encountered, one of the errors is returned. 75 func Compose(part ...interface{}) (t Tag, err error) { 76 return Default.Compose(part...) 77 } 78 79 // Compose creates a Tag from individual parts, which may be of type Tag, Base, 80 // Script, Region, Variant, []Variant, Extension, []Extension or error. If a 81 // Base, Script or Region or slice of type Variant or Extension is passed more 82 // than once, the latter will overwrite the former. Variants and Extensions are 83 // accumulated, but if two extensions of the same type are passed, the latter 84 // will replace the former. For -u extensions, though, the key-type pairs are 85 // added, where later values overwrite older ones. A Tag overwrites all former 86 // values and typically only makes sense as the first argument. The resulting 87 // tag is returned after canonicalizing using CanonType c. If one or more errors 88 // are encountered, one of the errors is returned. 89 func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { 90 defer func() { 91 if recover() != nil { 92 t = Tag{} 93 err = language.ErrSyntax 94 } 95 }() 96 97 var b language.Builder 98 if err = update(&b, part...); err != nil { 99 return und, err 100 } 101 b.Tag, _ = canonicalize(c, b.Tag) 102 return makeTag(b.Make()), err 103 } 104 105 var errInvalidArgument = errors.New("invalid Extension or Variant") 106 107 func update(b *language.Builder, part ...interface{}) (err error) { 108 for _, x := range part { 109 switch v := x.(type) { 110 case Tag: 111 b.SetTag(v.tag()) 112 case Base: 113 b.Tag.LangID = v.langID 114 case Script: 115 b.Tag.ScriptID = v.scriptID 116 case Region: 117 b.Tag.RegionID = v.regionID 118 case Variant: 119 if v.variant == "" { 120 err = errInvalidArgument 121 break 122 } 123 b.AddVariant(v.variant) 124 case Extension: 125 if v.s == "" { 126 err = errInvalidArgument 127 break 128 } 129 b.SetExt(v.s) 130 case []Variant: 131 b.ClearVariants() 132 for _, v := range v { 133 b.AddVariant(v.variant) 134 } 135 case []Extension: 136 b.ClearExtensions() 137 for _, e := range v { 138 b.SetExt(e.s) 139 } 140 // TODO: support parsing of raw strings based on morphology or just extensions? 141 case error: 142 if v != nil { 143 err = v 144 } 145 } 146 } 147 return 148 } 149 150 var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") 151 var errTagListTooLarge = errors.New("tag list exceeds max length") 152 153 // ParseAcceptLanguage parses the contents of an Accept-Language header as 154 // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and 155 // a list of corresponding quality weights. It is more permissive than RFC 2616 156 // and may return non-nil slices even if the input is not valid. 157 // The Tags will be sorted by highest weight first and then by first occurrence. 158 // Tags with a weight of zero will be dropped. An error will be returned if the 159 // input could not be parsed. 160 func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { 161 defer func() { 162 if recover() != nil { 163 tag = nil 164 q = nil 165 err = language.ErrSyntax 166 } 167 }() 168 169 if strings.Count(s, "-") > 1000 { 170 return nil, nil, errTagListTooLarge 171 } 172 173 var entry string 174 for s != "" { 175 if entry, s = split(s, ','); entry == "" { 176 continue 177 } 178 179 entry, weight := split(entry, ';') 180 181 // Scan the language. 182 t, err := Parse(entry) 183 if err != nil { 184 id, ok := acceptFallback[entry] 185 if !ok { 186 return nil, nil, err 187 } 188 t = makeTag(language.Tag{LangID: id}) 189 } 190 191 // Scan the optional weight. 192 w := 1.0 193 if weight != "" { 194 weight = consume(weight, 'q') 195 weight = consume(weight, '=') 196 // consume returns the empty string when a token could not be 197 // consumed, resulting in an error for ParseFloat. 198 if w, err = strconv.ParseFloat(weight, 32); err != nil { 199 return nil, nil, errInvalidWeight 200 } 201 // Drop tags with a quality weight of 0. 202 if w <= 0 { 203 continue 204 } 205 } 206 207 tag = append(tag, t) 208 q = append(q, float32(w)) 209 } 210 sort.Stable(&tagSort{tag, q}) 211 return tag, q, nil 212 } 213 214 // consume removes a leading token c from s and returns the result or the empty 215 // string if there is no such token. 216 func consume(s string, c byte) string { 217 if s == "" || s[0] != c { 218 return "" 219 } 220 return strings.TrimSpace(s[1:]) 221 } 222 223 func split(s string, c byte) (head, tail string) { 224 if i := strings.IndexByte(s, c); i >= 0 { 225 return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:]) 226 } 227 return strings.TrimSpace(s), "" 228 } 229 230 // Add hack mapping to deal with a small number of cases that occur 231 // in Accept-Language (with reasonable frequency). 232 var acceptFallback = map[string]language.Language{ 233 "english": _en, 234 "deutsch": _de, 235 "italian": _it, 236 "french": _fr, 237 "*": _mul, // defined in the spec to match all languages. 238 } 239 240 type tagSort struct { 241 tag []Tag 242 q []float32 243 } 244 245 func (s *tagSort) Len() int { 246 return len(s.q) 247 } 248 249 func (s *tagSort) Less(i, j int) bool { 250 return s.q[i] > s.q[j] 251 } 252 253 func (s *tagSort) Swap(i, j int) { 254 s.tag[i], s.tag[j] = s.tag[j], s.tag[i] 255 s.q[i], s.q[j] = s.q[j], s.q[i] 256 }