info.go (2997B)
1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cases 6 7 func (c info) cccVal() info { 8 if c&exceptionBit != 0 { 9 return info(exceptions[c>>exceptionShift]) & cccMask 10 } 11 return c & cccMask 12 } 13 14 func (c info) cccType() info { 15 ccc := c.cccVal() 16 if ccc <= cccZero { 17 return cccZero 18 } 19 return ccc 20 } 21 22 // TODO: Implement full Unicode breaking algorithm: 23 // 1) Implement breaking in separate package. 24 // 2) Use the breaker here. 25 // 3) Compare table size and performance of using the more generic breaker. 26 // 27 // Note that we can extend the current algorithm to be much more accurate. This 28 // only makes sense, though, if the performance and/or space penalty of using 29 // the generic breaker is big. Extra data will only be needed for non-cased 30 // runes, which means there are sufficient bits left in the caseType. 31 // ICU prohibits breaking in such cases as well. 32 33 // For the purpose of title casing we use an approximation of the Unicode Word 34 // Breaking algorithm defined in Annex #29: 35 // https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table. 36 // 37 // For our approximation, we group the Word Break types into the following 38 // categories, with associated rules: 39 // 40 // 1) Letter: 41 // ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ. 42 // Rule: Never break between consecutive runes of this category. 43 // 44 // 2) Mid: 45 // MidLetter, MidNumLet, Single_Quote. 46 // (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn, 47 // Me, Cf, Lm or Sk). 48 // Rule: Don't break between Letter and Mid, but break between two Mids. 49 // 50 // 3) Break: 51 // Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and 52 // Other. 53 // These categories should always result in a break between two cased letters. 54 // Rule: Always break. 55 // 56 // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in 57 // preventing a break between two cased letters. For now we will ignore this 58 // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and 59 // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].) 60 // 61 // Note 2: the rule for Mid is very approximate, but works in most cases. To 62 // improve, we could store the categories in the trie value and use a FA to 63 // manage breaks. See TODO comment above. 64 // 65 // Note 3: according to the spec, it is possible for the Extend category to 66 // introduce breaks between other categories grouped in Letter. However, this 67 // is undesirable for our purposes. ICU prevents breaks in such cases as well. 68 69 // isBreak returns whether this rune should introduce a break. 70 func (c info) isBreak() bool { 71 return c.cccVal() == cccBreak 72 } 73 74 // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter, 75 // Numeric, ExtendNumLet, or Extend. 76 func (c info) isLetter() bool { 77 ccc := c.cccVal() 78 if ccc == cccZero { 79 return !c.isCaseIgnorable() 80 } 81 return ccc != cccBreak 82 }