string.go (11881B)
1 package decoder 2 3 import ( 4 "bytes" 5 "fmt" 6 "reflect" 7 "unicode" 8 "unicode/utf16" 9 "unicode/utf8" 10 "unsafe" 11 12 "github.com/goccy/go-json/internal/errors" 13 ) 14 15 type stringDecoder struct { 16 structName string 17 fieldName string 18 } 19 20 func newStringDecoder(structName, fieldName string) *stringDecoder { 21 return &stringDecoder{ 22 structName: structName, 23 fieldName: fieldName, 24 } 25 } 26 27 func (d *stringDecoder) errUnmarshalType(typeName string, offset int64) *errors.UnmarshalTypeError { 28 return &errors.UnmarshalTypeError{ 29 Value: typeName, 30 Type: reflect.TypeOf(""), 31 Offset: offset, 32 Struct: d.structName, 33 Field: d.fieldName, 34 } 35 } 36 37 func (d *stringDecoder) DecodeStream(s *Stream, depth int64, p unsafe.Pointer) error { 38 bytes, err := d.decodeStreamByte(s) 39 if err != nil { 40 return err 41 } 42 if bytes == nil { 43 return nil 44 } 45 **(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes)) 46 s.reset() 47 return nil 48 } 49 50 func (d *stringDecoder) Decode(ctx *RuntimeContext, cursor, depth int64, p unsafe.Pointer) (int64, error) { 51 bytes, c, err := d.decodeByte(ctx.Buf, cursor) 52 if err != nil { 53 return 0, err 54 } 55 if bytes == nil { 56 return c, nil 57 } 58 cursor = c 59 **(**string)(unsafe.Pointer(&p)) = *(*string)(unsafe.Pointer(&bytes)) 60 return cursor, nil 61 } 62 63 func (d *stringDecoder) DecodePath(ctx *RuntimeContext, cursor, depth int64) ([][]byte, int64, error) { 64 bytes, c, err := d.decodeByte(ctx.Buf, cursor) 65 if err != nil { 66 return nil, 0, err 67 } 68 if bytes == nil { 69 return [][]byte{nullbytes}, c, nil 70 } 71 return [][]byte{bytes}, c, nil 72 } 73 74 var ( 75 hexToInt = [256]int{ 76 '0': 0, 77 '1': 1, 78 '2': 2, 79 '3': 3, 80 '4': 4, 81 '5': 5, 82 '6': 6, 83 '7': 7, 84 '8': 8, 85 '9': 9, 86 'A': 10, 87 'B': 11, 88 'C': 12, 89 'D': 13, 90 'E': 14, 91 'F': 15, 92 'a': 10, 93 'b': 11, 94 'c': 12, 95 'd': 13, 96 'e': 14, 97 'f': 15, 98 } 99 ) 100 101 func unicodeToRune(code []byte) rune { 102 var r rune 103 for i := 0; i < len(code); i++ { 104 r = r*16 + rune(hexToInt[code[i]]) 105 } 106 return r 107 } 108 109 func readAtLeast(s *Stream, n int64, p *unsafe.Pointer) bool { 110 for s.cursor+n >= s.length { 111 if !s.read() { 112 return false 113 } 114 *p = s.bufptr() 115 } 116 return true 117 } 118 119 func decodeUnicodeRune(s *Stream, p unsafe.Pointer) (rune, int64, unsafe.Pointer, error) { 120 const defaultOffset = 5 121 const surrogateOffset = 11 122 123 if !readAtLeast(s, defaultOffset, &p) { 124 return rune(0), 0, nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset()) 125 } 126 127 r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+defaultOffset]) 128 if utf16.IsSurrogate(r) { 129 if !readAtLeast(s, surrogateOffset, &p) { 130 return unicode.ReplacementChar, defaultOffset, p, nil 131 } 132 if s.buf[s.cursor+defaultOffset] != '\\' || s.buf[s.cursor+defaultOffset+1] != 'u' { 133 return unicode.ReplacementChar, defaultOffset, p, nil 134 } 135 r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset]) 136 if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar { 137 return r, surrogateOffset, p, nil 138 } 139 } 140 return r, defaultOffset, p, nil 141 } 142 143 func decodeUnicode(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) { 144 const backSlashAndULen = 2 // length of \u 145 146 r, offset, pp, err := decodeUnicodeRune(s, p) 147 if err != nil { 148 return nil, err 149 } 150 unicode := []byte(string(r)) 151 unicodeLen := int64(len(unicode)) 152 s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+offset:]...) 153 unicodeOrgLen := offset - 1 154 s.length = s.length - (backSlashAndULen + (unicodeOrgLen - unicodeLen)) 155 s.cursor = s.cursor - backSlashAndULen + unicodeLen 156 return pp, nil 157 } 158 159 func decodeEscapeString(s *Stream, p unsafe.Pointer) (unsafe.Pointer, error) { 160 s.cursor++ 161 RETRY: 162 switch s.buf[s.cursor] { 163 case '"': 164 s.buf[s.cursor] = '"' 165 case '\\': 166 s.buf[s.cursor] = '\\' 167 case '/': 168 s.buf[s.cursor] = '/' 169 case 'b': 170 s.buf[s.cursor] = '\b' 171 case 'f': 172 s.buf[s.cursor] = '\f' 173 case 'n': 174 s.buf[s.cursor] = '\n' 175 case 'r': 176 s.buf[s.cursor] = '\r' 177 case 't': 178 s.buf[s.cursor] = '\t' 179 case 'u': 180 return decodeUnicode(s, p) 181 case nul: 182 if !s.read() { 183 return nil, errors.ErrInvalidCharacter(s.char(), "escaped string", s.totalOffset()) 184 } 185 p = s.bufptr() 186 goto RETRY 187 default: 188 return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset()) 189 } 190 s.buf = append(s.buf[:s.cursor-1], s.buf[s.cursor:]...) 191 s.length-- 192 s.cursor-- 193 p = s.bufptr() 194 return p, nil 195 } 196 197 var ( 198 runeErrBytes = []byte(string(utf8.RuneError)) 199 runeErrBytesLen = int64(len(runeErrBytes)) 200 ) 201 202 func stringBytes(s *Stream) ([]byte, error) { 203 _, cursor, p := s.stat() 204 cursor++ // skip double quote char 205 start := cursor 206 for { 207 switch char(p, cursor) { 208 case '\\': 209 s.cursor = cursor 210 pp, err := decodeEscapeString(s, p) 211 if err != nil { 212 return nil, err 213 } 214 p = pp 215 cursor = s.cursor 216 case '"': 217 literal := s.buf[start:cursor] 218 cursor++ 219 s.cursor = cursor 220 return literal, nil 221 case 222 // 0x00 is nul, 0x5c is '\\', 0x22 is '"' . 223 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, // 0x00-0x0F 224 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, // 0x10-0x1F 225 0x20, 0x21 /*0x22,*/, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, // 0x20-0x2F 226 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, // 0x30-0x3F 227 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, // 0x40-0x4F 228 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B /*0x5C,*/, 0x5D, 0x5E, 0x5F, // 0x50-0x5F 229 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F 230 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F: // 0x70-0x7F 231 // character is ASCII. skip to next char 232 case 233 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, // 0x80-0x8F 234 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, // 0x90-0x9F 235 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, // 0xA0-0xAF 236 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, // 0xB0-0xBF 237 0xC0, 0xC1, // 0xC0-0xC1 238 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF: // 0xF5-0xFE 239 // character is invalid 240 s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...) 241 _, _, p = s.stat() 242 cursor += runeErrBytesLen 243 s.length += runeErrBytesLen 244 continue 245 case nul: 246 s.cursor = cursor 247 if s.read() { 248 _, cursor, p = s.stat() 249 continue 250 } 251 goto ERROR 252 case 0xEF: 253 // RuneError is {0xEF, 0xBF, 0xBD} 254 if s.buf[cursor+1] == 0xBF && s.buf[cursor+2] == 0xBD { 255 // found RuneError: skip 256 cursor += 2 257 break 258 } 259 fallthrough 260 default: 261 // multi bytes character 262 if !utf8.FullRune(s.buf[cursor : len(s.buf)-1]) { 263 s.cursor = cursor 264 if s.read() { 265 _, cursor, p = s.stat() 266 continue 267 } 268 goto ERROR 269 } 270 r, size := utf8.DecodeRune(s.buf[cursor:]) 271 if r == utf8.RuneError { 272 s.buf = append(append(append([]byte{}, s.buf[:cursor]...), runeErrBytes...), s.buf[cursor+1:]...) 273 cursor += runeErrBytesLen 274 s.length += runeErrBytesLen 275 _, _, p = s.stat() 276 } else { 277 cursor += int64(size) 278 } 279 continue 280 } 281 cursor++ 282 } 283 ERROR: 284 return nil, errors.ErrUnexpectedEndOfJSON("string", s.totalOffset()) 285 } 286 287 func (d *stringDecoder) decodeStreamByte(s *Stream) ([]byte, error) { 288 for { 289 switch s.char() { 290 case ' ', '\n', '\t', '\r': 291 s.cursor++ 292 continue 293 case '[': 294 return nil, d.errUnmarshalType("array", s.totalOffset()) 295 case '{': 296 return nil, d.errUnmarshalType("object", s.totalOffset()) 297 case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 298 return nil, d.errUnmarshalType("number", s.totalOffset()) 299 case '"': 300 return stringBytes(s) 301 case 'n': 302 if err := nullBytes(s); err != nil { 303 return nil, err 304 } 305 return nil, nil 306 case nul: 307 if s.read() { 308 continue 309 } 310 } 311 break 312 } 313 return nil, errors.ErrInvalidBeginningOfValue(s.char(), s.totalOffset()) 314 } 315 316 func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, error) { 317 for { 318 switch buf[cursor] { 319 case ' ', '\n', '\t', '\r': 320 cursor++ 321 case '[': 322 return nil, 0, d.errUnmarshalType("array", cursor) 323 case '{': 324 return nil, 0, d.errUnmarshalType("object", cursor) 325 case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 326 return nil, 0, d.errUnmarshalType("number", cursor) 327 case '"': 328 cursor++ 329 start := cursor 330 b := (*sliceHeader)(unsafe.Pointer(&buf)).data 331 escaped := 0 332 for { 333 switch char(b, cursor) { 334 case '\\': 335 escaped++ 336 cursor++ 337 switch char(b, cursor) { 338 case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': 339 cursor++ 340 case 'u': 341 buflen := int64(len(buf)) 342 if cursor+5 >= buflen { 343 return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor) 344 } 345 for i := int64(1); i <= 4; i++ { 346 c := char(b, cursor+i) 347 if !(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) { 348 return nil, 0, errors.ErrSyntax(fmt.Sprintf("json: invalid character %c in \\u hexadecimal character escape", c), cursor+i) 349 } 350 } 351 cursor += 5 352 default: 353 return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor) 354 } 355 continue 356 case '"': 357 literal := buf[start:cursor] 358 if escaped > 0 { 359 literal = literal[:unescapeString(literal)] 360 } 361 cursor++ 362 return literal, cursor, nil 363 case nul: 364 return nil, 0, errors.ErrUnexpectedEndOfJSON("string", cursor) 365 } 366 cursor++ 367 } 368 case 'n': 369 if err := validateNull(buf, cursor); err != nil { 370 return nil, 0, err 371 } 372 cursor += 4 373 return nil, cursor, nil 374 default: 375 return nil, 0, errors.ErrInvalidBeginningOfValue(buf[cursor], cursor) 376 } 377 } 378 } 379 380 var unescapeMap = [256]byte{ 381 '"': '"', 382 '\\': '\\', 383 '/': '/', 384 'b': '\b', 385 'f': '\f', 386 'n': '\n', 387 'r': '\r', 388 't': '\t', 389 } 390 391 func unsafeAdd(ptr unsafe.Pointer, offset int) unsafe.Pointer { 392 return unsafe.Pointer(uintptr(ptr) + uintptr(offset)) 393 } 394 395 func unescapeString(buf []byte) int { 396 p := (*sliceHeader)(unsafe.Pointer(&buf)).data 397 end := unsafeAdd(p, len(buf)) 398 src := unsafeAdd(p, bytes.IndexByte(buf, '\\')) 399 dst := src 400 for src != end { 401 c := char(src, 0) 402 if c == '\\' { 403 escapeChar := char(src, 1) 404 if escapeChar != 'u' { 405 *(*byte)(dst) = unescapeMap[escapeChar] 406 src = unsafeAdd(src, 2) 407 dst = unsafeAdd(dst, 1) 408 } else { 409 v1 := hexToInt[char(src, 2)] 410 v2 := hexToInt[char(src, 3)] 411 v3 := hexToInt[char(src, 4)] 412 v4 := hexToInt[char(src, 5)] 413 code := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4) 414 if code >= 0xd800 && code < 0xdc00 && uintptr(unsafeAdd(src, 11)) < uintptr(end) { 415 if char(src, 6) == '\\' && char(src, 7) == 'u' { 416 v1 := hexToInt[char(src, 8)] 417 v2 := hexToInt[char(src, 9)] 418 v3 := hexToInt[char(src, 10)] 419 v4 := hexToInt[char(src, 11)] 420 lo := rune((v1 << 12) | (v2 << 8) | (v3 << 4) | v4) 421 if lo >= 0xdc00 && lo < 0xe000 { 422 code = (code-0xd800)<<10 | (lo - 0xdc00) + 0x10000 423 src = unsafeAdd(src, 6) 424 } 425 } 426 } 427 var b [utf8.UTFMax]byte 428 n := utf8.EncodeRune(b[:], code) 429 switch n { 430 case 4: 431 *(*byte)(unsafeAdd(dst, 3)) = b[3] 432 fallthrough 433 case 3: 434 *(*byte)(unsafeAdd(dst, 2)) = b[2] 435 fallthrough 436 case 2: 437 *(*byte)(unsafeAdd(dst, 1)) = b[1] 438 fallthrough 439 case 1: 440 *(*byte)(unsafeAdd(dst, 0)) = b[0] 441 } 442 src = unsafeAdd(src, 6) 443 dst = unsafeAdd(dst, n) 444 } 445 } else { 446 *(*byte)(dst) = c 447 src = unsafeAdd(src, 1) 448 dst = unsafeAdd(dst, 1) 449 } 450 } 451 return int(uintptr(dst) - uintptr(p)) 452 }