gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

utf8.go (5600B)


      1 package characters
      2 
      3 import (
      4 	"unicode/utf8"
      5 )
      6 
      7 type utf8Err struct {
      8 	Index int
      9 	Size  int
     10 }
     11 
     12 func (u utf8Err) Zero() bool {
     13 	return u.Size == 0
     14 }
     15 
     16 // Verified that a given string is only made of valid UTF-8 characters allowed
     17 // by the TOML spec:
     18 //
     19 // Any Unicode character may be used except those that must be escaped:
     20 // quotation mark, backslash, and the control characters other than tab (U+0000
     21 // to U+0008, U+000A to U+001F, U+007F).
     22 //
     23 // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
     24 // when a character is not allowed.
     25 //
     26 // The returned utf8Err is Zero() if the string is valid, or contains the byte
     27 // index and size of the invalid character.
     28 //
     29 // quotation mark => already checked
     30 // backslash => already checked
     31 // 0-0x8 => invalid
     32 // 0x9 => tab, ok
     33 // 0xA - 0x1F => invalid
     34 // 0x7F => invalid
     35 func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
     36 	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
     37 	offset := 0
     38 	for len(p) >= 8 {
     39 		// Combining two 32 bit loads allows the same code to be used
     40 		// for 32 and 64 bit platforms.
     41 		// The compiler can generate a 32bit load for first32 and second32
     42 		// on many platforms. See test/codegen/memcombine.go.
     43 		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
     44 		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
     45 		if (first32|second32)&0x80808080 != 0 {
     46 			// Found a non ASCII byte (>= RuneSelf).
     47 			break
     48 		}
     49 
     50 		for i, b := range p[:8] {
     51 			if InvalidAscii(b) {
     52 				err.Index = offset + i
     53 				err.Size = 1
     54 				return
     55 			}
     56 		}
     57 
     58 		p = p[8:]
     59 		offset += 8
     60 	}
     61 	n := len(p)
     62 	for i := 0; i < n; {
     63 		pi := p[i]
     64 		if pi < utf8.RuneSelf {
     65 			if InvalidAscii(pi) {
     66 				err.Index = offset + i
     67 				err.Size = 1
     68 				return
     69 			}
     70 			i++
     71 			continue
     72 		}
     73 		x := first[pi]
     74 		if x == xx {
     75 			// Illegal starter byte.
     76 			err.Index = offset + i
     77 			err.Size = 1
     78 			return
     79 		}
     80 		size := int(x & 7)
     81 		if i+size > n {
     82 			// Short or invalid.
     83 			err.Index = offset + i
     84 			err.Size = n - i
     85 			return
     86 		}
     87 		accept := acceptRanges[x>>4]
     88 		if c := p[i+1]; c < accept.lo || accept.hi < c {
     89 			err.Index = offset + i
     90 			err.Size = 2
     91 			return
     92 		} else if size == 2 {
     93 		} else if c := p[i+2]; c < locb || hicb < c {
     94 			err.Index = offset + i
     95 			err.Size = 3
     96 			return
     97 		} else if size == 3 {
     98 		} else if c := p[i+3]; c < locb || hicb < c {
     99 			err.Index = offset + i
    100 			err.Size = 4
    101 			return
    102 		}
    103 		i += size
    104 	}
    105 	return
    106 }
    107 
    108 // Return the size of the next rune if valid, 0 otherwise.
    109 func Utf8ValidNext(p []byte) int {
    110 	c := p[0]
    111 
    112 	if c < utf8.RuneSelf {
    113 		if InvalidAscii(c) {
    114 			return 0
    115 		}
    116 		return 1
    117 	}
    118 
    119 	x := first[c]
    120 	if x == xx {
    121 		// Illegal starter byte.
    122 		return 0
    123 	}
    124 	size := int(x & 7)
    125 	if size > len(p) {
    126 		// Short or invalid.
    127 		return 0
    128 	}
    129 	accept := acceptRanges[x>>4]
    130 	if c := p[1]; c < accept.lo || accept.hi < c {
    131 		return 0
    132 	} else if size == 2 {
    133 	} else if c := p[2]; c < locb || hicb < c {
    134 		return 0
    135 	} else if size == 3 {
    136 	} else if c := p[3]; c < locb || hicb < c {
    137 		return 0
    138 	}
    139 
    140 	return size
    141 }
    142 
    143 // acceptRange gives the range of valid values for the second byte in a UTF-8
    144 // sequence.
    145 type acceptRange struct {
    146 	lo uint8 // lowest value for second byte.
    147 	hi uint8 // highest value for second byte.
    148 }
    149 
    150 // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
    151 var acceptRanges = [16]acceptRange{
    152 	0: {locb, hicb},
    153 	1: {0xA0, hicb},
    154 	2: {locb, 0x9F},
    155 	3: {0x90, hicb},
    156 	4: {locb, 0x8F},
    157 }
    158 
    159 // first is information about the first byte in a UTF-8 sequence.
    160 var first = [256]uint8{
    161 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    162 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
    163 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
    164 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
    165 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
    166 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
    167 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
    168 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
    169 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
    170 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    171 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
    172 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
    173 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
    174 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
    175 	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
    176 	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
    177 	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
    178 	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
    179 }
    180 
    181 const (
    182 	// The default lowest and highest continuation byte.
    183 	locb = 0b10000000
    184 	hicb = 0b10111111
    185 
    186 	// These names of these constants are chosen to give nice alignment in the
    187 	// table below. The first nibble is an index into acceptRanges or F for
    188 	// special one-byte cases. The second nibble is the Rune length or the
    189 	// Status for the special one-byte case.
    190 	xx = 0xF1 // invalid: size 1
    191 	as = 0xF0 // ASCII: size 1
    192 	s1 = 0x02 // accept 0, size 2
    193 	s2 = 0x13 // accept 1, size 3
    194 	s3 = 0x03 // accept 0, size 3
    195 	s4 = 0x23 // accept 2, size 3
    196 	s5 = 0x34 // accept 3, size 4
    197 	s6 = 0x04 // accept 0, size 4
    198 	s7 = 0x44 // accept 4, size 4
    199 )