utf8.go (5600B)
1 package characters 2 3 import ( 4 "unicode/utf8" 5 ) 6 7 type utf8Err struct { 8 Index int 9 Size int 10 } 11 12 func (u utf8Err) Zero() bool { 13 return u.Size == 0 14 } 15 16 // Verified that a given string is only made of valid UTF-8 characters allowed 17 // by the TOML spec: 18 // 19 // Any Unicode character may be used except those that must be escaped: 20 // quotation mark, backslash, and the control characters other than tab (U+0000 21 // to U+0008, U+000A to U+001F, U+007F). 22 // 23 // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early 24 // when a character is not allowed. 25 // 26 // The returned utf8Err is Zero() if the string is valid, or contains the byte 27 // index and size of the invalid character. 28 // 29 // quotation mark => already checked 30 // backslash => already checked 31 // 0-0x8 => invalid 32 // 0x9 => tab, ok 33 // 0xA - 0x1F => invalid 34 // 0x7F => invalid 35 func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) { 36 // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. 37 offset := 0 38 for len(p) >= 8 { 39 // Combining two 32 bit loads allows the same code to be used 40 // for 32 and 64 bit platforms. 41 // The compiler can generate a 32bit load for first32 and second32 42 // on many platforms. See test/codegen/memcombine.go. 43 first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 44 second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24 45 if (first32|second32)&0x80808080 != 0 { 46 // Found a non ASCII byte (>= RuneSelf). 47 break 48 } 49 50 for i, b := range p[:8] { 51 if InvalidAscii(b) { 52 err.Index = offset + i 53 err.Size = 1 54 return 55 } 56 } 57 58 p = p[8:] 59 offset += 8 60 } 61 n := len(p) 62 for i := 0; i < n; { 63 pi := p[i] 64 if pi < utf8.RuneSelf { 65 if InvalidAscii(pi) { 66 err.Index = offset + i 67 err.Size = 1 68 return 69 } 70 i++ 71 continue 72 } 73 x := first[pi] 74 if x == xx { 75 // Illegal starter byte. 76 err.Index = offset + i 77 err.Size = 1 78 return 79 } 80 size := int(x & 7) 81 if i+size > n { 82 // Short or invalid. 83 err.Index = offset + i 84 err.Size = n - i 85 return 86 } 87 accept := acceptRanges[x>>4] 88 if c := p[i+1]; c < accept.lo || accept.hi < c { 89 err.Index = offset + i 90 err.Size = 2 91 return 92 } else if size == 2 { 93 } else if c := p[i+2]; c < locb || hicb < c { 94 err.Index = offset + i 95 err.Size = 3 96 return 97 } else if size == 3 { 98 } else if c := p[i+3]; c < locb || hicb < c { 99 err.Index = offset + i 100 err.Size = 4 101 return 102 } 103 i += size 104 } 105 return 106 } 107 108 // Return the size of the next rune if valid, 0 otherwise. 109 func Utf8ValidNext(p []byte) int { 110 c := p[0] 111 112 if c < utf8.RuneSelf { 113 if InvalidAscii(c) { 114 return 0 115 } 116 return 1 117 } 118 119 x := first[c] 120 if x == xx { 121 // Illegal starter byte. 122 return 0 123 } 124 size := int(x & 7) 125 if size > len(p) { 126 // Short or invalid. 127 return 0 128 } 129 accept := acceptRanges[x>>4] 130 if c := p[1]; c < accept.lo || accept.hi < c { 131 return 0 132 } else if size == 2 { 133 } else if c := p[2]; c < locb || hicb < c { 134 return 0 135 } else if size == 3 { 136 } else if c := p[3]; c < locb || hicb < c { 137 return 0 138 } 139 140 return size 141 } 142 143 // acceptRange gives the range of valid values for the second byte in a UTF-8 144 // sequence. 145 type acceptRange struct { 146 lo uint8 // lowest value for second byte. 147 hi uint8 // highest value for second byte. 148 } 149 150 // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 151 var acceptRanges = [16]acceptRange{ 152 0: {locb, hicb}, 153 1: {0xA0, hicb}, 154 2: {locb, 0x9F}, 155 3: {0x90, hicb}, 156 4: {locb, 0x8F}, 157 } 158 159 // first is information about the first byte in a UTF-8 sequence. 160 var first = [256]uint8{ 161 // 1 2 3 4 5 6 7 8 9 A B C D E F 162 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F 163 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F 164 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F 165 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F 166 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F 167 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F 168 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F 169 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F 170 // 1 2 3 4 5 6 7 8 9 A B C D E F 171 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F 172 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F 173 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF 174 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF 175 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF 176 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF 177 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF 178 s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF 179 } 180 181 const ( 182 // The default lowest and highest continuation byte. 183 locb = 0b10000000 184 hicb = 0b10111111 185 186 // These names of these constants are chosen to give nice alignment in the 187 // table below. The first nibble is an index into acceptRanges or F for 188 // special one-byte cases. The second nibble is the Rune length or the 189 // Status for the special one-byte case. 190 xx = 0xF1 // invalid: size 1 191 as = 0xF0 // ASCII: size 1 192 s1 = 0x02 // accept 0, size 2 193 s2 = 0x13 // accept 1, size 3 194 s3 = 0x03 // accept 0, size 3 195 s4 = 0x23 // accept 2, size 3 196 s5 = 0x34 // accept 3, size 4 197 s6 = 0x04 // accept 0, size 4 198 s7 = 0x44 // accept 4, size 4 199 )