runes.go (8677B)
1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package runes provide transforms for UTF-8 encoded text. 6 package runes // import "golang.org/x/text/runes" 7 8 import ( 9 "unicode" 10 "unicode/utf8" 11 12 "golang.org/x/text/transform" 13 ) 14 15 // A Set is a collection of runes. 16 type Set interface { 17 // Contains returns true if r is contained in the set. 18 Contains(r rune) bool 19 } 20 21 type setFunc func(rune) bool 22 23 func (s setFunc) Contains(r rune) bool { 24 return s(r) 25 } 26 27 // Note: using funcs here instead of wrapping types result in cleaner 28 // documentation and a smaller API. 29 30 // In creates a Set with a Contains method that returns true for all runes in 31 // the given RangeTable. 32 func In(rt *unicode.RangeTable) Set { 33 return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) 34 } 35 36 // NotIn creates a Set with a Contains method that returns true for all runes not 37 // in the given RangeTable. 38 func NotIn(rt *unicode.RangeTable) Set { 39 return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) 40 } 41 42 // Predicate creates a Set with a Contains method that returns f(r). 43 func Predicate(f func(rune) bool) Set { 44 return setFunc(f) 45 } 46 47 // Transformer implements the transform.Transformer interface. 48 type Transformer struct { 49 t transform.SpanningTransformer 50 } 51 52 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 53 return t.t.Transform(dst, src, atEOF) 54 } 55 56 func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { 57 return t.t.Span(b, atEOF) 58 } 59 60 func (t Transformer) Reset() { t.t.Reset() } 61 62 // Bytes returns a new byte slice with the result of converting b using t. It 63 // calls Reset on t. It returns nil if any error was found. This can only happen 64 // if an error-producing Transformer is passed to If. 65 func (t Transformer) Bytes(b []byte) []byte { 66 b, _, err := transform.Bytes(t, b) 67 if err != nil { 68 return nil 69 } 70 return b 71 } 72 73 // String returns a string with the result of converting s using t. It calls 74 // Reset on t. It returns the empty string if any error was found. This can only 75 // happen if an error-producing Transformer is passed to If. 76 func (t Transformer) String(s string) string { 77 s, _, err := transform.String(t, s) 78 if err != nil { 79 return "" 80 } 81 return s 82 } 83 84 // TODO: 85 // - Copy: copying strings and bytes in whole-rune units. 86 // - Validation (maybe) 87 // - Well-formed-ness (maybe) 88 89 const runeErrorString = string(utf8.RuneError) 90 91 // Remove returns a Transformer that removes runes r for which s.Contains(r). 92 // Illegal input bytes are replaced by RuneError before being passed to f. 93 func Remove(s Set) Transformer { 94 if f, ok := s.(setFunc); ok { 95 // This little trick cuts the running time of BenchmarkRemove for sets 96 // created by Predicate roughly in half. 97 // TODO: special-case RangeTables as well. 98 return Transformer{remove(f)} 99 } 100 return Transformer{remove(s.Contains)} 101 } 102 103 // TODO: remove transform.RemoveFunc. 104 105 type remove func(r rune) bool 106 107 func (remove) Reset() {} 108 109 // Span implements transform.Spanner. 110 func (t remove) Span(src []byte, atEOF bool) (n int, err error) { 111 for r, size := rune(0), 0; n < len(src); { 112 if r = rune(src[n]); r < utf8.RuneSelf { 113 size = 1 114 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { 115 // Invalid rune. 116 if !atEOF && !utf8.FullRune(src[n:]) { 117 err = transform.ErrShortSrc 118 } else { 119 err = transform.ErrEndOfSpan 120 } 121 break 122 } 123 if t(r) { 124 err = transform.ErrEndOfSpan 125 break 126 } 127 n += size 128 } 129 return 130 } 131 132 // Transform implements transform.Transformer. 133 func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 134 for r, size := rune(0), 0; nSrc < len(src); { 135 if r = rune(src[nSrc]); r < utf8.RuneSelf { 136 size = 1 137 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { 138 // Invalid rune. 139 if !atEOF && !utf8.FullRune(src[nSrc:]) { 140 err = transform.ErrShortSrc 141 break 142 } 143 // We replace illegal bytes with RuneError. Not doing so might 144 // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. 145 // The resulting byte sequence may subsequently contain runes 146 // for which t(r) is true that were passed unnoticed. 147 if !t(utf8.RuneError) { 148 if nDst+3 > len(dst) { 149 err = transform.ErrShortDst 150 break 151 } 152 dst[nDst+0] = runeErrorString[0] 153 dst[nDst+1] = runeErrorString[1] 154 dst[nDst+2] = runeErrorString[2] 155 nDst += 3 156 } 157 nSrc++ 158 continue 159 } 160 if t(r) { 161 nSrc += size 162 continue 163 } 164 if nDst+size > len(dst) { 165 err = transform.ErrShortDst 166 break 167 } 168 for i := 0; i < size; i++ { 169 dst[nDst] = src[nSrc] 170 nDst++ 171 nSrc++ 172 } 173 } 174 return 175 } 176 177 // Map returns a Transformer that maps the runes in the input using the given 178 // mapping. Illegal bytes in the input are converted to utf8.RuneError before 179 // being passed to the mapping func. 180 func Map(mapping func(rune) rune) Transformer { 181 return Transformer{mapper(mapping)} 182 } 183 184 type mapper func(rune) rune 185 186 func (mapper) Reset() {} 187 188 // Span implements transform.Spanner. 189 func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { 190 for r, size := rune(0), 0; n < len(src); n += size { 191 if r = rune(src[n]); r < utf8.RuneSelf { 192 size = 1 193 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { 194 // Invalid rune. 195 if !atEOF && !utf8.FullRune(src[n:]) { 196 err = transform.ErrShortSrc 197 } else { 198 err = transform.ErrEndOfSpan 199 } 200 break 201 } 202 if t(r) != r { 203 err = transform.ErrEndOfSpan 204 break 205 } 206 } 207 return n, err 208 } 209 210 // Transform implements transform.Transformer. 211 func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 212 var replacement rune 213 var b [utf8.UTFMax]byte 214 215 for r, size := rune(0), 0; nSrc < len(src); { 216 if r = rune(src[nSrc]); r < utf8.RuneSelf { 217 if replacement = t(r); replacement < utf8.RuneSelf { 218 if nDst == len(dst) { 219 err = transform.ErrShortDst 220 break 221 } 222 dst[nDst] = byte(replacement) 223 nDst++ 224 nSrc++ 225 continue 226 } 227 size = 1 228 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { 229 // Invalid rune. 230 if !atEOF && !utf8.FullRune(src[nSrc:]) { 231 err = transform.ErrShortSrc 232 break 233 } 234 235 if replacement = t(utf8.RuneError); replacement == utf8.RuneError { 236 if nDst+3 > len(dst) { 237 err = transform.ErrShortDst 238 break 239 } 240 dst[nDst+0] = runeErrorString[0] 241 dst[nDst+1] = runeErrorString[1] 242 dst[nDst+2] = runeErrorString[2] 243 nDst += 3 244 nSrc++ 245 continue 246 } 247 } else if replacement = t(r); replacement == r { 248 if nDst+size > len(dst) { 249 err = transform.ErrShortDst 250 break 251 } 252 for i := 0; i < size; i++ { 253 dst[nDst] = src[nSrc] 254 nDst++ 255 nSrc++ 256 } 257 continue 258 } 259 260 n := utf8.EncodeRune(b[:], replacement) 261 262 if nDst+n > len(dst) { 263 err = transform.ErrShortDst 264 break 265 } 266 for i := 0; i < n; i++ { 267 dst[nDst] = b[i] 268 nDst++ 269 } 270 nSrc += size 271 } 272 return 273 } 274 275 // ReplaceIllFormed returns a transformer that replaces all input bytes that are 276 // not part of a well-formed UTF-8 code sequence with utf8.RuneError. 277 func ReplaceIllFormed() Transformer { 278 return Transformer{&replaceIllFormed{}} 279 } 280 281 type replaceIllFormed struct{ transform.NopResetter } 282 283 func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { 284 for n < len(src) { 285 // ASCII fast path. 286 if src[n] < utf8.RuneSelf { 287 n++ 288 continue 289 } 290 291 r, size := utf8.DecodeRune(src[n:]) 292 293 // Look for a valid non-ASCII rune. 294 if r != utf8.RuneError || size != 1 { 295 n += size 296 continue 297 } 298 299 // Look for short source data. 300 if !atEOF && !utf8.FullRune(src[n:]) { 301 err = transform.ErrShortSrc 302 break 303 } 304 305 // We have an invalid rune. 306 err = transform.ErrEndOfSpan 307 break 308 } 309 return n, err 310 } 311 312 func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 313 for nSrc < len(src) { 314 // ASCII fast path. 315 if r := src[nSrc]; r < utf8.RuneSelf { 316 if nDst == len(dst) { 317 err = transform.ErrShortDst 318 break 319 } 320 dst[nDst] = r 321 nDst++ 322 nSrc++ 323 continue 324 } 325 326 // Look for a valid non-ASCII rune. 327 if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { 328 if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { 329 err = transform.ErrShortDst 330 break 331 } 332 nDst += size 333 nSrc += size 334 continue 335 } 336 337 // Look for short source data. 338 if !atEOF && !utf8.FullRune(src[nSrc:]) { 339 err = transform.ErrShortSrc 340 break 341 } 342 343 // We have an invalid rune. 344 if nDst+3 > len(dst) { 345 err = transform.ErrShortDst 346 break 347 } 348 dst[nDst+0] = runeErrorString[0] 349 dst[nDst+1] = runeErrorString[1] 350 dst[nDst+2] = runeErrorString[2] 351 nDst += 3 352 nSrc++ 353 } 354 return nDst, nSrc, err 355 }