decode.go (16993B)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package text

import (
	"bytes"
	"fmt"
	"io"
	"strconv"
	"unicode/utf8"

	"google.golang.org/protobuf/internal/errors"
)

// Decoder is a token-based textproto decoder.
//
// Tokens are produced one at a time by Read, or looked at in advance with
// Peek. The decoder tracks the kind of the previously returned token and a
// stack of currently open messages/lists to decide what may come next.
type Decoder struct {
	// lastCall is last method called, either readCall or peekCall.
	// Initial value is readCall.
	lastCall call

	// lastToken contains the last read token.
	lastToken Token

	// lastErr contains the last read error.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is used in reporting line and column.
	orig []byte
	// in contains the unconsumed input.
	in []byte
}

// NewDecoder returns a Decoder to read the given []byte.
// The slice is not copied; both orig and in alias b.
func NewDecoder(b []byte) *Decoder {
	return &Decoder{orig: b, in: b}
}

// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)

// call specifies which Decoder method was invoked.
type call uint8

const (
	// readCall marks that the most recent call was Read. It is the zero value
	// and therefore the state of a freshly created Decoder.
	readCall call = iota
	// peekCall marks that the most recent call was Peek, i.e. lastToken and
	// lastErr hold a result that has not been handed out by Read yet.
	peekCall
)

// Peek looks ahead and returns the next token and error without advancing a read.
//
// The first Peek after a Read performs the actual Read and stores its result
// in lastToken/lastErr. Consecutive Peek calls return that stored result.
func (d *Decoder) Peek() (Token, error) {
	// Deferred so that it runs after the nested Read's own deferred reset of
	// lastCall to readCall.
	defer func() { d.lastCall = peekCall }()
	if d.lastCall == readCall {
		d.lastToken, d.lastErr = d.Read()
	}
	return d.lastToken, d.lastErr
}

// Read returns the next token.
// It will return an error if there is no valid token.
//
// If the previous call was Peek, Read returns the token and error stored by
// that Peek instead of parsing again. A comma or semicolon produced by
// parseNext is not returned to the caller; Read immediately parses the token
// that follows it.
func (d *Decoder) Read() (Token, error) {
	defer func() { d.lastCall = readCall }()
	if d.lastCall == peekCall {
		return d.lastToken, d.lastErr
	}

	tok, err := d.parseNext(d.lastToken.kind)
	if err != nil {
		return Token{}, err
	}

	switch tok.kind {
	case comma, semicolon:
		// Separators are internal only; skip to the token after it.
		tok, err = d.parseNext(tok.kind)
		if err != nil {
			return Token{}, err
		}
	}
	d.lastToken = tok
	return tok, nil
}

// Format strings for the syntax errors raised by parseNext. Both take the
// offending byte as their single argument.
const (
	mismatchedFmt = "mismatched close character %q"
	unexpectedFmt = "unexpected character %q"
)

// parseNext parses the next Token based on given last kind.
//
// It first skips leading whitespace and comments, then dispatches on lastKind
// and, for the kinds that can end a value (Scalar, MessageClose, ListClose,
// comma, semicolon), on what the top of openStack says the parser is nested
// in. It returns ErrUnexpectedEOF when the input ends at a point where a
// token is still required, and a syntax error for a wrong or mismatched
// character. Any combination that falls through all cases is treated as an
// internal bug and panics.
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
	// Trim leading spaces.
	d.consume(0)
	isEOF := false
	if len(d.in) == 0 {
		isEOF = true
	}

	switch lastKind {
	case EOF:
		// Once EOF has been returned, keep returning EOF.
		return d.consumeToken(EOF, 0, 0), nil

	case bof:
		// Start of top level message. Next token can be EOF or Name.
		if isEOF {
			return d.consumeToken(EOF, 0, 0), nil
		}
		return d.parseFieldName()

	case Name:
		// Next token can be MessageOpen, ListOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		case '[':
			d.pushOpenStack(ch)
			return d.consumeToken(ListOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case Scalar:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch d.in[0] {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// Message opened with '{' but closed with '>', or vice versa.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case ']':
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case MessageOpen:
		// Next token can be MessageClose or Name.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		_, closeCh := d.currentOpenKind()
		switch ch := d.in[0]; ch {
		case closeCh:
			d.popOpenStack()
			return d.consumeToken(MessageClose, 1, 0), nil
		case otherCloseChar[closeCh]:
			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
		default:
			return d.parseFieldName()
		}

	case MessageClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case ListOpen:
		// Next token can be ListClose, MessageOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case ']':
			d.popOpenStack()
			return d.consumeToken(ListClose, 1, 0), nil
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case ListClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		default:
			// It is not possible to have this case. Let it panic below.
		}

	case comma, semicolon:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message. Next token can be EOF or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			return d.parseFieldName()

		case MessageOpen:
			// Next token can be MessageClose or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			if lastKind == semicolon {
				// It is not possible to have this case as logic here
				// should not have produced a semicolon Token when inside a
				// list. Let it panic below.
				break
			}
			// Next token can be MessageOpen or Scalar.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case '{', '<':
				d.pushOpenStack(ch)
				return d.consumeToken(MessageOpen, 1, 0), nil
			default:
				return d.parseScalar()
			}
		}
	}

	// Reaching here means no case above returned: an internal logic error.
	line, column := d.Position(len(d.orig) - len(d.in))
	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
}

// otherCloseChar maps each message close character to the other message
// close character. parseNext uses it to detect a message that was opened with
// one bracket style and closed with the other.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}

// currentOpenKind indicates whether current position is inside a message, list
// or top-level message by returning MessageOpen, ListOpen or bof respectively.
// If the returned kind is either a MessageOpen or ListOpen, it also returns the
// corresponding closing character.
// It panics if the top of openStack holds a byte other than '{', '<' or '['.
func (d *Decoder) currentOpenKind() (Kind, byte) {
	if len(d.openStack) == 0 {
		return bof, 0
	}
	openCh := d.openStack[len(d.openStack)-1]
	switch openCh {
	case '{':
		return MessageOpen, '}'
	case '<':
		return MessageOpen, '>'
	case '[':
		return ListOpen, ']'
	}
	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
}

// pushOpenStack pushes the given open character onto openStack.
func (d *Decoder) pushOpenStack(ch byte) {
	d.openStack = append(d.openStack, ch)
}

// popOpenStack removes the top element of openStack.
// It panics with a slice bounds error if the stack is empty.
func (d *Decoder) popOpenStack() {
	d.openStack = d.openStack[:len(d.openStack)-1]
}

// parseFieldName parses field name and separator.
//
// The name is either a bracketed type name (see parseTypeName), an identifier,
// or a non-negative decimal field number that fits in 32 bits. The results are
// named so that the deferred function can, on success, consume an optional
// ':' after the name and record it in the token's attrs as hasSeparator.
//
// It reads d.in[0] without a length check; the call sites in parseNext only
// reach it after ruling out EOF.
func (d *Decoder) parseFieldName() (tok Token, err error) {
	defer func() {
		if err == nil && d.tryConsumeChar(':') {
			tok.attrs |= hasSeparator
		}
	}()

	// Extension or Any type URL.
	if d.in[0] == '[' {
		return d.parseTypeName()
	}

	// Identifier.
	if size := parseIdent(d.in, false); size > 0 {
		return d.consumeToken(Name, size, uint8(IdentName)), nil
	}

	// Field number. Identify if input is a valid number that is not negative
	// and is decimal integer within 32-bit range.
	if num := parseNumber(d.in); num.size > 0 {
		str := num.string(d.in)
		if !num.neg && num.kind == numDec {
			if _, err := strconv.ParseInt(str, 10, 32); err == nil {
				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
			}
		}
		return Token{}, d.newSyntaxError("invalid field number: %s", str)
	}

	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
}

// parseTypeName parses Any type URL or extension field name. The name is
// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
// strings. This implementation is more liberal and allows for the pattern
// `^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`. Whitespaces and comments are allowed
// in between [ ], '.', '/' and the sub names.
//
// On success d.in is advanced past the closing ']' and any following
// whitespace/comments. The returned token's raw field covers the input from
// '[' through ']', while its str field holds the name with the interior
// whitespace and comments removed. On failure d.in is left untouched so that
// the error position refers to the opening '['.
//
// NOTE(review): as written, a name that starts with '/' (e.g. "[/a]") passes
// the final validation, although the pattern above would not match it.
// Confirm whether that is intended.
func (d *Decoder) parseTypeName() (Token, error) {
	startPos := len(d.orig) - len(d.in)
	// Use alias s to advance first in order to use d.in for error handling.
	// Caller already checks for [ as first character.
	s := consume(d.in[1:], 0)
	if len(s) == 0 {
		return Token{}, ErrUnexpectedEOF
	}

	// Leading sub name, if any.
	var name []byte
	for len(s) > 0 && isTypeNameChar(s[0]) {
		name = append(name, s[0])
		s = s[1:]
	}
	s = consume(s, 0)

	// Remaining "<separator><sub name>" pairs up to the closing ']'.
	var closed bool
	for len(s) > 0 && !closed {
		switch {
		case s[0] == ']':
			s = s[1:]
			closed = true

		case s[0] == '/', s[0] == '.':
			// Two separators in a row are not allowed.
			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
					d.orig[startPos:len(d.orig)-len(s)+1])
			}
			name = append(name, s[0])
			s = s[1:]
			s = consume(s, 0)
			for len(s) > 0 && isTypeNameChar(s[0]) {
				name = append(name, s[0])
				s = s[1:]
			}
			s = consume(s, 0)

		default:
			// The +1 includes the offending byte in the reported text.
			return Token{}, d.newSyntaxError(
				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
		}
	}

	if !closed {
		return Token{}, ErrUnexpectedEOF
	}

	// First character cannot be '.'. Last character cannot be '.' or '/'.
	size := len(name)
	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
			d.orig[startPos:len(d.orig)-len(s)])
	}

	d.in = s
	endPos := len(d.orig) - len(d.in)
	d.consume(0)

	return Token{
		kind:  Name,
		attrs: uint8(TypeName),
		pos:   startPos,
		raw:   d.orig[startPos:endPos],
		str:   string(name),
	}, nil
}

// isTypeNameChar reports whether b may appear in a sub name of a type name:
// '-', '_', an ASCII digit or an ASCII letter.
func isTypeNameChar(b byte) bool {
	return (b == '-' || b == '_' ||
		('0' <= b && b <= '9') ||
		('a' <= b && b <= 'z') ||
		('A' <= b && b <= 'Z'))
}

// isWhiteSpace reports whether b is a space, newline, carriage return or tab.
func isWhiteSpace(b byte) bool {
	switch b {
	case ' ', '\n', '\r', '\t':
		return true
	default:
		return false
	}
}

// parseIdent parses an unquoted proto identifier and returns size.
// If allowNeg is true, it allows '-' to be the first character in the
// identifier. This is used when parsing literal values like -infinity, etc.
// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
//
// It returns 0 if input does not start with an identifier, or if the
// identifier is directly followed by a byte that isDelim does not consider a
// delimiter. A leading '-' counts towards the returned size.
func parseIdent(input []byte, allowNeg bool) int {
	var size int

	s := input
	if len(s) == 0 {
		return 0
	}

	if allowNeg && s[0] == '-' {
		s = s[1:]
		size++
		if len(s) == 0 {
			return 0
		}
	}

	// First character: underscore or letter.
	switch {
	case s[0] == '_',
		'a' <= s[0] && s[0] <= 'z',
		'A' <= s[0] && s[0] <= 'Z':
		s = s[1:]
		size++
	default:
		return 0
	}

	// Following characters: underscore, letter or digit.
	for len(s) > 0 && (s[0] == '_' ||
		'a' <= s[0] && s[0] <= 'z' ||
		'A' <= s[0] && s[0] <= 'Z' ||
		'0' <= s[0] && s[0] <= '9') {
		s = s[1:]
		size++
	}

	// The identifier must end at a delimiter or at the end of input.
	if len(s) > 0 && !isDelim(s[0]) {
		return 0
	}

	return size
}

// parseScalar parses for a string, literal or number value.
//
// A leading quote selects string parsing; otherwise a literal is tried first
// and a number second. It reads d.in[0] without a length check; the call
// sites in parseNext only reach it after ruling out EOF.
func (d *Decoder) parseScalar() (Token, error) {
	if d.in[0] == '"' || d.in[0] == '\'' {
		return d.parseStringValue()
	}

	if tok, ok := d.parseLiteralValue(); ok {
		return tok, nil
	}

	if tok, ok := d.parseNumberValue(); ok {
		return tok, nil
	}

	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
}

// parseLiteralValue parses a literal value. A literal value is used for
// bools, special floats and enums. This function simply identifies that the
// field value is a literal.
// The boolean result is false, and no input is consumed, when the input does
// not start with an identifier (optionally preceded by '-').
func (d *Decoder) parseLiteralValue() (Token, bool) {
	size := parseIdent(d.in, true)
	if size == 0 {
		return Token{}, false
	}
	return d.consumeToken(Scalar, size, literalValue), true
}

// consumeToken constructs a Token for given Kind from d.in and consumes given
// size-length from it.
// Whitespace and comments following the token are consumed as well.
func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
	// Important to compute raw and pos before consuming.
	tok := Token{
		kind:  kind,
		attrs: attrs,
		pos:   len(d.orig) - len(d.in),
		raw:   d.in[:size],
	}
	d.consume(size)
	return tok
}

// newSyntaxError returns a syntax error with line and column information for
// current position.
// f and x are the format string and arguments of the detail message.
func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
	e := errors.New(f, x...)
	line, column := d.Position(len(d.orig) - len(d.in))
	return errors.New("syntax error (line %d:%d): %v", line, column, e)
}

// Position returns line and column number of given index of the original input.
// It will panic if index is out of range.
// Both numbers are 1-based; the column is counted in runes, not bytes.
func (d *Decoder) Position(idx int) (line int, column int) {
	b := d.orig[:idx]
	line = bytes.Count(b, []byte("\n")) + 1
	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
		// Keep only the part of the current line that precedes idx.
		b = b[i+1:]
	}
	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
	return line, column
}

// tryConsumeChar consumes c, plus any whitespace or comments after it, if c is
// the next input byte. It reports whether it did so.
func (d *Decoder) tryConsumeChar(c byte) bool {
	if len(d.in) > 0 && d.in[0] == c {
		d.consume(1)
		return true
	}
	return false
}

// consume consumes n bytes of input and any subsequent whitespace or comments.
func (d *Decoder) consume(n int) {
	d.in = consume(d.in, n)
	return
}

// consume consumes n bytes of input and any subsequent whitespace or comments.
// A comment starts at '#' and runs through the next newline, or to the end of
// the input if there is none. The remaining input is returned.
func consume(b []byte, n int) []byte {
	b = b[n:]
	for len(b) > 0 {
		switch b[0] {
		case ' ', '\n', '\r', '\t':
			b = b[1:]
		case '#':
			if i := bytes.IndexByte(b, '\n'); i >= 0 {
				b = b[i+len("\n"):]
			} else {
				// Comment without a terminating newline: nothing is left.
				b = nil
			}
		default:
			return b
		}
	}
	return b
}

// errId extracts a byte sequence that looks like an invalid ID
// (for the purposes of error reporting).
//
// It returns the prefix of seq up to the first delimiter other than '/' or
// the first non-ASCII/invalid rune, cutting off with a trailing "…" once the
// scan passes maxLen bytes. The result is sliced with its capacity capped, so
// appending the ellipsis allocates instead of writing into seq's backing
// array.
func errId(seq []byte) []byte {
	const maxLen = 32
	for i := 0; i < len(seq); {
		if i > maxLen {
			return append(seq[:i:i], "…"...)
		}
		r, size := utf8.DecodeRune(seq[i:])
		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
			if i == 0 {
				// Either the first byte is invalid UTF-8 or a
				// delimiter, or the first rune is non-ASCII.
				// Return it as-is.
				i = size
			}
			return seq[:i:i]
		}
		i += size
	}
	// No delimiter found.
	return seq
}

// isDelim returns true if given byte is a delimiter character.
// Every byte other than '-', '+', '.', '_', an ASCII letter or an ASCII digit
// is treated as a delimiter.
func isDelim(c byte) bool {
	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
		('a' <= c && c <= 'z') ||
		('A' <= c && c <= 'Z') ||
		('0' <= c && c <= '9'))
}