info.go - gtsocial-umbx - Unnamed repository; edit this file 'description' to name the repository.

info.go (2997B)
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package cases
      6 
      7 func (c info) cccVal() info {
      8 	if c&exceptionBit != 0 {
      9 		return info(exceptions[c>>exceptionShift]) & cccMask
     10 	}
     11 	return c & cccMask
     12 }
     13 
     14 func (c info) cccType() info {
     15 	ccc := c.cccVal()
     16 	if ccc <= cccZero {
     17 		return cccZero
     18 	}
     19 	return ccc
     20 }
     21 
     22 // TODO: Implement full Unicode breaking algorithm:
     23 // 1) Implement breaking in separate package.
     24 // 2) Use the breaker here.
     25 // 3) Compare table size and performance of using the more generic breaker.
     26 //
     27 // Note that we can extend the current algorithm to be much more accurate. This
     28 // only makes sense, though, if the performance and/or space penalty of using
     29 // the generic breaker is big. Extra data will only be needed for non-cased
     30 // runes, which means there are sufficient bits left in the caseType.
     31 // ICU prohibits breaking in such cases as well.
     32 
     33 // For the purpose of title casing we use an approximation of the Unicode Word
     34 // Breaking algorithm defined in Annex #29:
     35 // https://www.unicode.org/reports/tr29/#Default_Word_Boundaries.
     36 //
     37 // For our approximation, we group the Word Break types into the following
     38 // categories, with associated rules:
     39 //
     40 // 1) Letter:
     41 //    ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ.
     42 //    Rule: Never break between consecutive runes of this category.
     43 //
     44 // 2) Mid:
     45 //    MidLetter, MidNumLet, Single_Quote.
     46 //    (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn,
     47 //    Me, Cf, Lm or Sk).
     48 //    Rule: Don't break between Letter and Mid, but break between two Mids.
     49 //
     50 // 3) Break:
     51 //    Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and
     52 //    Other.
     53 //    These categories should always result in a break between two cased letters.
     54 //    Rule: Always break.
     55 //
     56 // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
     57 // preventing a break between two cased letters. For now we will ignore this
     58 // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
     59 // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
     60 //
     61 // Note 2: the rule for Mid is very approximate, but works in most cases. To
     62 // improve, we could store the categories in the trie value and use a FA to
     63 // manage breaks. See TODO comment above.
     64 //
     65 // Note 3: according to the spec, it is possible for the Extend category to
     66 // introduce breaks between other categories grouped in Letter. However, this
     67 // is undesirable for our purposes. ICU prevents breaks in such cases as well.
     68 
     69 // isBreak returns whether this rune should introduce a break.
     70 func (c info) isBreak() bool {
     71 	return c.cccVal() == cccBreak
     72 }
     73 
     74 // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
     75 // Numeric, ExtendNumLet, or Extend.
     76 func (c info) isLetter() bool {
     77 	ccc := c.cccVal()
     78 	if ccc == cccZero {
     79 		return !c.isCaseIgnorable()
     80 	}
     81 	return ccc != cccBreak
     82 }
	gtsocial-umbx Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE