gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

runes.go (8677B)


      1 // Copyright 2014 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package runes provides transforms for UTF-8 encoded text.
      6 package runes // import "golang.org/x/text/runes"
      7 
      8 import (
      9 	"unicode"
     10 	"unicode/utf8"
     11 
     12 	"golang.org/x/text/transform"
     13 )
     14 
     15 // A Set is a collection of runes.
     16 type Set interface {
     17 	// Contains returns true if r is contained in the set.
     18 	Contains(r rune) bool
     19 }
     20 
     21 type setFunc func(rune) bool
     22 
     23 func (s setFunc) Contains(r rune) bool {
     24 	return s(r)
     25 }
     26 
     27 // Note: using funcs here instead of wrapping types results in cleaner
     28 // documentation and a smaller API.
     29 
     30 // In creates a Set with a Contains method that returns true for all runes in
     31 // the given RangeTable.
     32 func In(rt *unicode.RangeTable) Set {
     33 	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
     34 }
     35 
     36 // NotIn creates a Set with a Contains method that returns true for all runes not
     37 // in the given RangeTable.
     38 func NotIn(rt *unicode.RangeTable) Set {
     39 	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
     40 }
     41 
     42 // Predicate creates a Set with a Contains method that returns f(r).
     43 func Predicate(f func(rune) bool) Set {
     44 	return setFunc(f)
     45 }
     46 
     47 // Transformer implements the transform.Transformer interface.
     48 type Transformer struct {
     49 	t transform.SpanningTransformer
     50 }
     51 
     52 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
     53 	return t.t.Transform(dst, src, atEOF)
     54 }
     55 
     56 func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
     57 	return t.t.Span(b, atEOF)
     58 }
     59 
     60 func (t Transformer) Reset() { t.t.Reset() }
     61 
     62 // Bytes returns a new byte slice with the result of converting b using t.  It
     63 // calls Reset on t. It returns nil if any error was found. This can only happen
     64 // if an error-producing Transformer is passed to If.
     65 func (t Transformer) Bytes(b []byte) []byte {
     66 	b, _, err := transform.Bytes(t, b)
     67 	if err != nil {
     68 		return nil
     69 	}
     70 	return b
     71 }
     72 
     73 // String returns a string with the result of converting s using t. It calls
     74 // Reset on t. It returns the empty string if any error was found. This can only
     75 // happen if an error-producing Transformer is passed to If.
     76 func (t Transformer) String(s string) string {
     77 	s, _, err := transform.String(t, s)
     78 	if err != nil {
     79 		return ""
     80 	}
     81 	return s
     82 }
     83 
     84 // TODO:
     85 // - Copy: copying strings and bytes in whole-rune units.
     86 // - Validation (maybe)
     87 // - Well-formed-ness (maybe)
     88 
     89 const runeErrorString = string(utf8.RuneError)
     90 
     91 // Remove returns a Transformer that removes runes r for which s.Contains(r).
     92 // Illegal input bytes are replaced by RuneError before being passed to f.
     93 func Remove(s Set) Transformer {
     94 	if f, ok := s.(setFunc); ok {
     95 		// This little trick cuts the running time of BenchmarkRemove for sets
     96 		// created by Predicate roughly in half.
     97 		// TODO: special-case RangeTables as well.
     98 		return Transformer{remove(f)}
     99 	}
    100 	return Transformer{remove(s.Contains)}
    101 }
    102 
    103 // TODO: remove transform.RemoveFunc.
    104 
    105 type remove func(r rune) bool
    106 
    107 func (remove) Reset() {}
    108 
    109 // Span implements transform.Spanner.
    110 func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
    111 	for r, size := rune(0), 0; n < len(src); {
    112 		if r = rune(src[n]); r < utf8.RuneSelf {
    113 			size = 1
    114 		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
    115 			// Invalid rune.
    116 			if !atEOF && !utf8.FullRune(src[n:]) {
    117 				err = transform.ErrShortSrc
    118 			} else {
    119 				err = transform.ErrEndOfSpan
    120 			}
    121 			break
    122 		}
    123 		if t(r) {
    124 			err = transform.ErrEndOfSpan
    125 			break
    126 		}
    127 		n += size
    128 	}
    129 	return
    130 }
    131 
    132 // Transform implements transform.Transformer.
    133 func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    134 	for r, size := rune(0), 0; nSrc < len(src); {
    135 		if r = rune(src[nSrc]); r < utf8.RuneSelf {
    136 			size = 1
    137 		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
    138 			// Invalid rune.
    139 			if !atEOF && !utf8.FullRune(src[nSrc:]) {
    140 				err = transform.ErrShortSrc
    141 				break
    142 			}
    143 			// We replace illegal bytes with RuneError. Not doing so might
    144 			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
    145 			// The resulting byte sequence may subsequently contain runes
    146 			// for which t(r) is true that were passed unnoticed.
    147 			if !t(utf8.RuneError) {
    148 				if nDst+3 > len(dst) {
    149 					err = transform.ErrShortDst
    150 					break
    151 				}
    152 				dst[nDst+0] = runeErrorString[0]
    153 				dst[nDst+1] = runeErrorString[1]
    154 				dst[nDst+2] = runeErrorString[2]
    155 				nDst += 3
    156 			}
    157 			nSrc++
    158 			continue
    159 		}
    160 		if t(r) {
    161 			nSrc += size
    162 			continue
    163 		}
    164 		if nDst+size > len(dst) {
    165 			err = transform.ErrShortDst
    166 			break
    167 		}
    168 		for i := 0; i < size; i++ {
    169 			dst[nDst] = src[nSrc]
    170 			nDst++
    171 			nSrc++
    172 		}
    173 	}
    174 	return
    175 }
    176 
    177 // Map returns a Transformer that maps the runes in the input using the given
    178 // mapping. Illegal bytes in the input are converted to utf8.RuneError before
    179 // being passed to the mapping func.
    180 func Map(mapping func(rune) rune) Transformer {
    181 	return Transformer{mapper(mapping)}
    182 }
    183 
    184 type mapper func(rune) rune
    185 
    186 func (mapper) Reset() {}
    187 
    188 // Span implements transform.Spanner.
    189 func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
    190 	for r, size := rune(0), 0; n < len(src); n += size {
    191 		if r = rune(src[n]); r < utf8.RuneSelf {
    192 			size = 1
    193 		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
    194 			// Invalid rune.
    195 			if !atEOF && !utf8.FullRune(src[n:]) {
    196 				err = transform.ErrShortSrc
    197 			} else {
    198 				err = transform.ErrEndOfSpan
    199 			}
    200 			break
    201 		}
    202 		if t(r) != r {
    203 			err = transform.ErrEndOfSpan
    204 			break
    205 		}
    206 	}
    207 	return n, err
    208 }
    209 
    210 // Transform implements transform.Transformer.
    211 func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    212 	var replacement rune
    213 	var b [utf8.UTFMax]byte
    214 
    215 	for r, size := rune(0), 0; nSrc < len(src); {
    216 		if r = rune(src[nSrc]); r < utf8.RuneSelf {
    217 			if replacement = t(r); replacement < utf8.RuneSelf {
    218 				if nDst == len(dst) {
    219 					err = transform.ErrShortDst
    220 					break
    221 				}
    222 				dst[nDst] = byte(replacement)
    223 				nDst++
    224 				nSrc++
    225 				continue
    226 			}
    227 			size = 1
    228 		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
    229 			// Invalid rune.
    230 			if !atEOF && !utf8.FullRune(src[nSrc:]) {
    231 				err = transform.ErrShortSrc
    232 				break
    233 			}
    234 
    235 			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
    236 				if nDst+3 > len(dst) {
    237 					err = transform.ErrShortDst
    238 					break
    239 				}
    240 				dst[nDst+0] = runeErrorString[0]
    241 				dst[nDst+1] = runeErrorString[1]
    242 				dst[nDst+2] = runeErrorString[2]
    243 				nDst += 3
    244 				nSrc++
    245 				continue
    246 			}
    247 		} else if replacement = t(r); replacement == r {
    248 			if nDst+size > len(dst) {
    249 				err = transform.ErrShortDst
    250 				break
    251 			}
    252 			for i := 0; i < size; i++ {
    253 				dst[nDst] = src[nSrc]
    254 				nDst++
    255 				nSrc++
    256 			}
    257 			continue
    258 		}
    259 
    260 		n := utf8.EncodeRune(b[:], replacement)
    261 
    262 		if nDst+n > len(dst) {
    263 			err = transform.ErrShortDst
    264 			break
    265 		}
    266 		for i := 0; i < n; i++ {
    267 			dst[nDst] = b[i]
    268 			nDst++
    269 		}
    270 		nSrc += size
    271 	}
    272 	return
    273 }
    274 
    275 // ReplaceIllFormed returns a transformer that replaces all input bytes that are
    276 // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
    277 func ReplaceIllFormed() Transformer {
    278 	return Transformer{&replaceIllFormed{}}
    279 }
    280 
    281 type replaceIllFormed struct{ transform.NopResetter }
    282 
    283 func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
    284 	for n < len(src) {
    285 		// ASCII fast path.
    286 		if src[n] < utf8.RuneSelf {
    287 			n++
    288 			continue
    289 		}
    290 
    291 		r, size := utf8.DecodeRune(src[n:])
    292 
    293 		// Look for a valid non-ASCII rune.
    294 		if r != utf8.RuneError || size != 1 {
    295 			n += size
    296 			continue
    297 		}
    298 
    299 		// Look for short source data.
    300 		if !atEOF && !utf8.FullRune(src[n:]) {
    301 			err = transform.ErrShortSrc
    302 			break
    303 		}
    304 
    305 		// We have an invalid rune.
    306 		err = transform.ErrEndOfSpan
    307 		break
    308 	}
    309 	return n, err
    310 }
    311 
    312 func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    313 	for nSrc < len(src) {
    314 		// ASCII fast path.
    315 		if r := src[nSrc]; r < utf8.RuneSelf {
    316 			if nDst == len(dst) {
    317 				err = transform.ErrShortDst
    318 				break
    319 			}
    320 			dst[nDst] = r
    321 			nDst++
    322 			nSrc++
    323 			continue
    324 		}
    325 
    326 		// Look for a valid non-ASCII rune.
    327 		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
    328 			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
    329 				err = transform.ErrShortDst
    330 				break
    331 			}
    332 			nDst += size
    333 			nSrc += size
    334 			continue
    335 		}
    336 
    337 		// Look for short source data.
    338 		if !atEOF && !utf8.FullRune(src[nSrc:]) {
    339 			err = transform.ErrShortSrc
    340 			break
    341 		}
    342 
    343 		// We have an invalid rune.
    344 		if nDst+3 > len(dst) {
    345 			err = transform.ErrShortDst
    346 			break
    347 		}
    348 		dst[nDst+0] = runeErrorString[0]
    349 		dst[nDst+1] = runeErrorString[1]
    350 		dst[nDst+2] = runeErrorString[2]
    351 		nDst += 3
    352 		nSrc++
    353 	}
    354 	return nDst, nSrc, err
    355 }