map.go (23278B)
1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 // This file contains the definitions of case mappings for all supported 8 // languages. The rules for the language-specific tailorings were taken and 9 // modified from the CLDR transform definitions in common/transforms. 10 11 import ( 12 "strings" 13 "unicode" 14 "unicode/utf8" 15 16 "golang.org/x/text/internal" 17 "golang.org/x/text/language" 18 "golang.org/x/text/transform" 19 "golang.org/x/text/unicode/norm" 20 ) 21 22 // A mapFunc takes a context set to the current rune and writes the mapped 23 // version to the same context. It may advance the context to the next rune. It 24 // returns whether a checkpoint is possible: whether the pDst bytes written to 25 // dst so far won't need changing as we see more source bytes. 26 type mapFunc func(*context) bool 27 28 // A spanFunc takes a context set to the current rune and returns whether this 29 // rune would be altered when written to the output. It may advance the context 30 // to the next rune. It returns whether a checkpoint is possible. 31 type spanFunc func(*context) bool 32 33 // maxIgnorable defines the maximum number of ignorables to consider for 34 // lookahead operations. 35 const maxIgnorable = 30 36 37 // supported lists the language tags for which we have tailorings. 38 const supported = "und af az el lt nl tr" 39 40 func init() { 41 tags := []language.Tag{} 42 for _, s := range strings.Split(supported, " ") { 43 tags = append(tags, language.MustParse(s)) 44 } 45 matcher = internal.NewInheritanceMatcher(tags) 46 Supported = language.NewCoverage(tags) 47 } 48 49 var ( 50 matcher *internal.InheritanceMatcher 51 52 Supported language.Coverage 53 54 // We keep the following lists separate, instead of having a single per- 55 // language struct, to give the compiler a chance to remove unused code. 
56 57 // Some uppercase mappers are stateless, so we can precompute the 58 // Transformers and save a bit on runtime allocations. 59 upperFunc = []struct { 60 upper mapFunc 61 span spanFunc 62 }{ 63 {nil, nil}, // und 64 {nil, nil}, // af 65 {aztrUpper(upper), isUpper}, // az 66 {elUpper, noSpan}, // el 67 {ltUpper(upper), noSpan}, // lt 68 {nil, nil}, // nl 69 {aztrUpper(upper), isUpper}, // tr 70 } 71 72 undUpper transform.SpanningTransformer = &undUpperCaser{} 73 undLower transform.SpanningTransformer = &undLowerCaser{} 74 undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{} 75 76 lowerFunc = []mapFunc{ 77 nil, // und 78 nil, // af 79 aztrLower, // az 80 nil, // el 81 ltLower, // lt 82 nil, // nl 83 aztrLower, // tr 84 } 85 86 titleInfos = []struct { 87 title mapFunc 88 lower mapFunc 89 titleSpan spanFunc 90 rewrite func(*context) 91 }{ 92 {title, lower, isTitle, nil}, // und 93 {title, lower, isTitle, afnlRewrite}, // af 94 {aztrUpper(title), aztrLower, isTitle, nil}, // az 95 {title, lower, isTitle, nil}, // el 96 {ltUpper(title), ltLower, noSpan, nil}, // lt 97 {nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl 98 {aztrUpper(title), aztrLower, isTitle, nil}, // tr 99 } 100 ) 101 102 func makeUpper(t language.Tag, o options) transform.SpanningTransformer { 103 _, i, _ := matcher.Match(t) 104 f := upperFunc[i].upper 105 if f == nil { 106 return undUpper 107 } 108 return &simpleCaser{f: f, span: upperFunc[i].span} 109 } 110 111 func makeLower(t language.Tag, o options) transform.SpanningTransformer { 112 _, i, _ := matcher.Match(t) 113 f := lowerFunc[i] 114 if f == nil { 115 if o.ignoreFinalSigma { 116 return undLowerIgnoreSigma 117 } 118 return undLower 119 } 120 if o.ignoreFinalSigma { 121 return &simpleCaser{f: f, span: isLower} 122 } 123 return &lowerCaser{ 124 first: f, 125 midWord: finalSigma(f), 126 } 127 } 128 129 func makeTitle(t language.Tag, o options) transform.SpanningTransformer { 130 _, i, _ := matcher.Match(t) 131 x := 
&titleInfos[i] 132 lower := x.lower 133 if o.noLower { 134 lower = (*context).copy 135 } else if !o.ignoreFinalSigma { 136 lower = finalSigma(lower) 137 } 138 return &titleCaser{ 139 title: x.title, 140 lower: lower, 141 titleSpan: x.titleSpan, 142 rewrite: x.rewrite, 143 } 144 } 145 146 func noSpan(c *context) bool { 147 c.err = transform.ErrEndOfSpan 148 return false 149 } 150 151 // TODO: consider a similar special case for the fast majority lower case. This 152 // is a bit more involved so will require some more precise benchmarking to 153 // justify it. 154 155 type undUpperCaser struct{ transform.NopResetter } 156 157 // undUpperCaser implements the Transformer interface for doing an upper case 158 // mapping for the root locale (und). It eliminates the need for an allocation 159 // as it prevents escaping by not using function pointers. 160 func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 161 c := context{dst: dst, src: src, atEOF: atEOF} 162 for c.next() { 163 upper(&c) 164 c.checkpoint() 165 } 166 return c.ret() 167 } 168 169 func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) { 170 c := context{src: src, atEOF: atEOF} 171 for c.next() && isUpper(&c) { 172 c.checkpoint() 173 } 174 return c.retSpan() 175 } 176 177 // undLowerIgnoreSigmaCaser implements the Transformer interface for doing 178 // a lower case mapping for the root locale (und) ignoring final sigma 179 // handling. This casing algorithm is used in some performance-critical packages 180 // like secure/precis and x/net/http/idna, which warrants its special-casing. 181 type undLowerIgnoreSigmaCaser struct{ transform.NopResetter } 182 183 func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 184 c := context{dst: dst, src: src, atEOF: atEOF} 185 for c.next() && lower(&c) { 186 c.checkpoint() 187 } 188 return c.ret() 189 190 } 191 192 // Span implements a generic lower-casing. 
This is possible as isLower works 193 // for all lowercasing variants. All lowercase variants only vary in how they 194 // transform a non-lowercase letter. They will never change an already lowercase 195 // letter. In addition, there is no state. 196 func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) { 197 c := context{src: src, atEOF: atEOF} 198 for c.next() && isLower(&c) { 199 c.checkpoint() 200 } 201 return c.retSpan() 202 } 203 204 type simpleCaser struct { 205 context 206 f mapFunc 207 span spanFunc 208 } 209 210 // simpleCaser implements the Transformer interface for doing a case operation 211 // on a rune-by-rune basis. 212 func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 213 c := context{dst: dst, src: src, atEOF: atEOF} 214 for c.next() && t.f(&c) { 215 c.checkpoint() 216 } 217 return c.ret() 218 } 219 220 func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) { 221 c := context{src: src, atEOF: atEOF} 222 for c.next() && t.span(&c) { 223 c.checkpoint() 224 } 225 return c.retSpan() 226 } 227 228 // undLowerCaser implements the Transformer interface for doing a lower case 229 // mapping for the root locale (und) ignoring final sigma handling. This casing 230 // algorithm is used in some performance-critical packages like secure/precis 231 // and x/net/http/idna, which warrants its special-casing. 
232 type undLowerCaser struct{ transform.NopResetter } 233 234 func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 235 c := context{dst: dst, src: src, atEOF: atEOF} 236 237 for isInterWord := true; c.next(); { 238 if isInterWord { 239 if c.info.isCased() { 240 if !lower(&c) { 241 break 242 } 243 isInterWord = false 244 } else if !c.copy() { 245 break 246 } 247 } else { 248 if c.info.isNotCasedAndNotCaseIgnorable() { 249 if !c.copy() { 250 break 251 } 252 isInterWord = true 253 } else if !c.hasPrefix("Σ") { 254 if !lower(&c) { 255 break 256 } 257 } else if !finalSigmaBody(&c) { 258 break 259 } 260 } 261 c.checkpoint() 262 } 263 return c.ret() 264 } 265 266 func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) { 267 c := context{src: src, atEOF: atEOF} 268 for c.next() && isLower(&c) { 269 c.checkpoint() 270 } 271 return c.retSpan() 272 } 273 274 // lowerCaser implements the Transformer interface. The default Unicode lower 275 // casing requires different treatment for the first and subsequent characters 276 // of a word, most notably to handle the Greek final Sigma. 277 type lowerCaser struct { 278 undLowerIgnoreSigmaCaser 279 280 context 281 282 first, midWord mapFunc 283 } 284 285 func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 286 t.context = context{dst: dst, src: src, atEOF: atEOF} 287 c := &t.context 288 289 for isInterWord := true; c.next(); { 290 if isInterWord { 291 if c.info.isCased() { 292 if !t.first(c) { 293 break 294 } 295 isInterWord = false 296 } else if !c.copy() { 297 break 298 } 299 } else { 300 if c.info.isNotCasedAndNotCaseIgnorable() { 301 if !c.copy() { 302 break 303 } 304 isInterWord = true 305 } else if !t.midWord(c) { 306 break 307 } 308 } 309 c.checkpoint() 310 } 311 return c.ret() 312 } 313 314 // titleCaser implements the Transformer interface. 
// Title casing algorithms distinguish between the first letter of a word and
// subsequent letters of the same word. It uses state to avoid requiring a
// potentially infinite lookahead.
type titleCaser struct {
	// context is embedded so that its isMidWord field carries over from one
	// Transform or Span call to the next.
	context

	// rune mappings used by the actual casing algorithms.
	title     mapFunc
	lower     mapFunc
	titleSpan spanFunc

	// rewrite, if non-nil, is called on every rune before the casing decision
	// is made.
	rewrite func(*context)
}

// Transform implements the standard Unicode title case algorithm as defined in
// Chapter 3 of The Unicode Standard:
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// The new context inherits isMidWord from the previous call.
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.ret()
	}

	for {
		// Capture the properties of the current rune first: the mapping
		// functions called below may advance the context.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		} else if p.isBreak() {
			c.isMidWord = false
		}

		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful write. The one exception is a
		// mid-word rune seen while inside a word: whether the word continues
		// depends on the next rune, which is examined below.
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two consecutive mid-word runes end the word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.ret()
}

// Span mirrors the control flow of Transform but writes nothing: the first
// cased rune of a word is checked with t.titleSpan and the following cased
// runes with isLower.
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	// The new context inherits isMidWord from the previous call.
	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.retSpan()
	}

	for {
		// Capture the properties of the current rune before the span functions
		// get a chance to advance the context.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				if !t.titleSpan(c) {
					break
				}
				c.isMidWord = true
			} else if !isLower(c) {
				break
			}
		} else if p.isBreak() {
			c.isMidWord = false
		}
		// As we save the state of the transformer, it is safe to call
		// checkpoint after any rune that passed the checks above, except for
		// a mid-word rune seen while inside a word (see Transform).
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two consecutive mid-word runes end the word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.retSpan()
}

// finalSigma adds Greek final Sigma handling to another casing function. It
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
// case-ignorables and a cased letter.
func finalSigma(f mapFunc) mapFunc {
	return func(c *context) bool {
		// Anything other than a capital sigma is handled by f unchanged.
		if !c.hasPrefix("Σ") {
			return f(c)
		}
		return finalSigmaBody(c)
	}
}

// finalSigmaBody lowercases a capital sigma. It first writes the final form ς
// and then looks ahead over at most maxIgnorable case-ignorable runes, copying
// them as it goes; if the first rune that is not case-ignorable is cased, the
// written ς is changed to σ in place. It returns false if the input ends
// before a rune that is not case-ignorable is found.
func finalSigmaBody(c *context) bool {
	// Current rune must be Σ.

	// ::NFD();
	// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
	// Σ } [:case-ignorable:]* [:cased:] → σ;
	// [:cased:] [:case-ignorable:]* { Σ → ς;
	// ::Any-Lower;
	// ::NFC();

	// Remember where ς starts in dst so that it can be patched below.
	p := c.pDst
	c.writeString("ς")

	// TODO: we should do this here, but right now this will never have an
	// effect as this is called when the prefix is Sigma, whereas Dutch and
	// Afrikaans only test for an apostrophe.
	//
	// if t.rewrite != nil {
	//     t.rewrite(c)
	// }

	// We need to do one more iteration after maxIgnorable, as a cased
	// letter is not an ignorable and may modify the result.
	wasMid := false
	for i := 0; i < maxIgnorable+1; i++ {
		if !c.next() {
			return false
		}
		if !c.info.isCaseIgnorable() {
			// All Midword runes are also case ignorable, so we are
			// guaranteed to have a letter or word break here. As we are
			// unreading the rune, there is no need to unset c.isMidWord;
			// the title caser will handle this.
			if c.info.isCased() {
				// p+1 is guaranteed to be in bounds: if writing ς was
				// successful, p+1 will contain the second byte of ς. If not,
				// this function will have returned after c.next returned false.
				c.dst[p+1]++ // ς → σ
			}
			c.unreadRune()
			return true
		}
		// A case ignorable may also introduce a word break, so we may need
		// to continue searching even after detecting a break.
		isMid := c.info.isMid()
		if (wasMid && isMid) || c.info.isBreak() {
			c.isMidWord = false
		}
		wasMid = isMid
		c.copy()
	}
	return true
}

// finalSigmaSpan would be the same as isLower.

// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
func elUpper(c *context) bool {
	// From CLDR:
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;

	// Decode the source rune and remember the output position before upper
	// runs; both are needed again below.
	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	// Runes outside the Greek script get the plain upper-case mapping only.
	if !unicode.Is(unicode.Greek, r) {
		return true
	}
	i := 0
	// Take the properties of the uppercased rune that is already written to the
	// destination. This saves us the trouble of having to uppercase the
	// decomposed rune again.
	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
		// Restore the destination position and process the decomposed rune.
		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}
		c.pDst = oldPDst
		// Insert the first rune and ignore the modifiers. See A.6.2.
		c.writeBytes(b[:sz])
		// The dropped modifiers count towards the maxIgnorable budget.
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
		// Above and Iota Subscript
		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
			// No-op. Gobble the modifier.

		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				// Not a modifier of the current rune: leave it for the caller.
				c.unreadRune()
				return true

			// We don't need to test for IotaSubscript as the only rune that
			// qualifies (U+0345) was already excluded in the switch statement
			// above. See A.4.

			case cccAbove:
				return c.copy()
			default:
				// Some other modifier. We're still allowed to gobble Greek
				// modifiers after this.
				c.copy()
			}
		}
	}
	// The loop ended either because the lookahead budget was used up or
	// because c.next failed; only the former is reported as success.
	return i == maxIgnorable
}

// TODO: implement elUpperSpan (low-priority: complex and infrequent).

// ltLower implements the Lithuanian lower casing of the current rune. When an
// I or J, or a rune whose canonical decomposition starts with I or J, is
// followed by a modifier with combining class Above, an explicit U+0307
// COMBINING DOT ABOVE is inserted in front of that modifier. All other runes
// get the default lower case mapping.
func ltLower(c *context) bool {
	// From CLDR:
	// # Introduce an explicit dot above when lowercasing capital I's and J's
	// # whenever there are more accents above.
	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
	// ::NFD();
	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
	// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
	// I \u0300 (Ì) → i \u0307 \u0300;
	// I \u0301 (Í) → i \u0307 \u0301;
	// I \u0303 (Ĩ) → i \u0307 \u0303;
	// ::Any-Lower();
	// ::NFC();

	// i counts the modifiers seen so far against the maxIgnorable budget.
	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		// Single-byte rune: only I and J need the lookahead below.
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])
		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
			// UTF-8 optimization: the decomposition will only have an above
			// modifier if the last rune of the decomposition is in [U+300-U+311].
			// In all other cases, a decomposition starting with I is always
			// an I followed by modifiers that are not cased themselves. See A.2.
			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				// Lowercase the letter just written by patching it in place.
				c.dst[c.pDst-1] += 'a' - 'A' // lower

				// Assumption: modifier never changes on lowercase. See A.1.
				// Assumption: all modifiers added have CCC = Above. See A.2.3.
				return c.writeString("\u0307") && c.writeBytes(d[1:])
			}
			// In all other cases the additional modifiers will have a CCC
			// that is less than 230 (Above). We will insert the U+0307, if
			// needed, after these modifiers so that a string in FCD form
			// will remain so. See A.2.2.
			lower(c)
			i = 1
		} else {
			return lower(c)
		}
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			// Not a modifier of the current rune: leave it for the caller.
			c.unreadRune()
			return true
		case cccAbove:
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	// The loop ended either because the lookahead budget was used up or
	// because c.next failed; only the former is reported as success.
	return i == maxIgnorable
}

// ltLowerSpan would be the same as isLower.

// ltUpper wraps f with the Lithuanian tailoring for upper and title casing: a
// U+0307 COMBINING DOT ABOVE that follows a soft-dotted rune, possibly after
// other modifiers that are not of class Above, is dropped from the output.
func ltUpper(f mapFunc) mapFunc {
	return func(c *context) bool {
		// Unicode:
		// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
		//
		// From CLDR:
		// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
		// # intervening non-230 marks.
		// ::NFD();
		// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
		// ::Any-Upper();
		// ::NFC();

		// TODO: See A.5. A soft-dotted rune never has an exception. This would
		// allow us to overload the exception bit and encode this property in
		// info. Need to measure performance impact of this.
		// Decode the source rune and remember the output position before f
		// runs; both are needed again below.
		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
		oldPDst := c.pDst
		if !f(c) {
			return false
		}
		if !unicode.Is(unicode.Soft_Dotted, r) {
			return true
		}

		// We don't need to do an NFD normalization, as a soft-dotted rune never
		// contains U+0307. See A.3.

		i := 0
		for ; i < maxIgnorable && c.next(); i++ {
			switch c.info.cccType() {
			case cccZero:
				// Not a modifier of the current rune: leave it for the caller.
				c.unreadRune()
				return true
			case cccAbove:
				if c.hasPrefix("\u0307") {
					// We don't do a full NFC, but rather combine runes for
					// some of the common cases. (Returning NFC or
					// preserving normal form is neither a requirement nor
					// a possibility anyway).
					// Advancing without copying is what drops the U+0307.
					if !c.next() {
						return false
					}
					// Only compose when the output so far is exactly a single
					// 'I' and the next source rune starts with byte 0xCC.
					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
						s := ""
						switch c.src[c.pSrc+1] {
						case 0x80: // U+0300 COMBINING GRAVE ACCENT
							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
						case 0x81: // U+0301 COMBINING ACUTE ACCENT
							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
						case 0x83: // U+0303 COMBINING TILDE
							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
						case 0x88: // U+0308 COMBINING DIAERESIS
							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
						default:
						}
						if s != "" {
							// Replace the 'I' already written by the composed form.
							c.pDst = oldPDst
							return c.writeString(s)
						}
					}
				}
				return c.copy()
			default:
				c.copy()
			}
		}
		// The loop ended either because the lookahead budget was used up or
		// because c.next failed; only the former is reported as success.
		return i == maxIgnorable
	}
}

// TODO: implement ltUpperSpan (low priority: complex and infrequent).
710 711 func aztrUpper(f mapFunc) mapFunc { 712 return func(c *context) bool { 713 // i→İ; 714 if c.src[c.pSrc] == 'i' { 715 return c.writeString("İ") 716 } 717 return f(c) 718 } 719 } 720 721 func aztrLower(c *context) (done bool) { 722 // From CLDR: 723 // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 724 // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE 725 // İ→i; 726 // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 727 // # This matches the behavior of the canonically equivalent I-dot_above 728 // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 729 // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 730 // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 731 // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; 732 // I→ı ; 733 // ::Any-Lower(); 734 if c.hasPrefix("\u0130") { // İ 735 return c.writeString("i") 736 } 737 if c.src[c.pSrc] != 'I' { 738 return lower(c) 739 } 740 741 // We ignore the lower-case I for now, but insert it later when we know 742 // which form we need. 743 start := c.pSrc + c.sz 744 745 i := 0 746 Loop: 747 // We check for up to n ignorables before \u0307. As \u0307 is an 748 // ignorable as well, n is maxIgnorable-1. 749 for ; i < maxIgnorable && c.next(); i++ { 750 switch c.info.cccType() { 751 case cccAbove: 752 if c.hasPrefix("\u0307") { 753 return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307 754 } 755 done = true 756 break Loop 757 case cccZero: 758 c.unreadRune() 759 done = true 760 break Loop 761 default: 762 // We'll write this rune after we know which starter to use. 763 } 764 } 765 if i == maxIgnorable { 766 done = true 767 } 768 return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done 769 } 770 771 // aztrLowerSpan would be the same as isLower. 
772 773 func nlTitle(c *context) bool { 774 // From CLDR: 775 // # Special titlecasing for Dutch initial "ij". 776 // ::Any-Title(); 777 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) 778 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; 779 if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' { 780 return title(c) 781 } 782 783 if !c.writeString("I") || !c.next() { 784 return false 785 } 786 if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' { 787 return c.writeString("J") 788 } 789 c.unreadRune() 790 return true 791 } 792 793 func nlTitleSpan(c *context) bool { 794 // From CLDR: 795 // # Special titlecasing for Dutch initial "ij". 796 // ::Any-Title(); 797 // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) 798 // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; 799 if c.src[c.pSrc] != 'I' { 800 return isTitle(c) 801 } 802 if !c.next() || c.src[c.pSrc] == 'j' { 803 return false 804 } 805 if c.src[c.pSrc] != 'J' { 806 c.unreadRune() 807 } 808 return true 809 } 810 811 // Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078. 812 func afnlRewrite(c *context) { 813 if c.hasPrefix("'") || c.hasPrefix("’") { 814 c.isMidWord = true 815 } 816 }