gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

decode.go (16993B)


      1 // Copyright 2018 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package text
      6 
      7 import (
      8 	"bytes"
      9 	"fmt"
     10 	"io"
     11 	"strconv"
     12 	"unicode/utf8"
     13 
     14 	"google.golang.org/protobuf/internal/errors"
     15 )
     16 
// Decoder is a token-based textproto decoder.
//
// It keeps a reference to the complete input (for position reporting), a view
// of the input that has not been consumed yet, the most recent token and
// error (so that Peek and Read can hand the same result to the caller), and a
// stack describing the messages and lists that are currently open.
type Decoder struct {
	// lastCall is the last method called, either readCall or peekCall.
	// Initial value is readCall (the zero value of call).
	lastCall call

	// lastToken contains the last read token.
	lastToken Token

	// lastErr contains the error that Peek obtained from Read together with
	// lastToken. Read returns it when it is called right after a Peek.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is the complete input. It is used in reporting line and column.
	orig []byte
	// in contains the unconsumed input.
	in []byte
}
     41 
     42 // NewDecoder returns a Decoder to read the given []byte.
     43 func NewDecoder(b []byte) *Decoder {
     44 	return &Decoder{orig: b, in: b}
     45 }
     46 
// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
// It is built from io.ErrUnexpectedEOF by formatting that error with %v.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
     49 
// call specifies which Decoder method was invoked.
type call uint8

const (
	// readCall marks that Read was the last method invoked. It is the zero
	// value, so a freshly created Decoder starts in this state.
	readCall call = iota
	// peekCall marks that Peek was the last method invoked.
	peekCall
)
     57 
     58 // Peek looks ahead and returns the next token and error without advancing a read.
     59 func (d *Decoder) Peek() (Token, error) {
     60 	defer func() { d.lastCall = peekCall }()
     61 	if d.lastCall == readCall {
     62 		d.lastToken, d.lastErr = d.Read()
     63 	}
     64 	return d.lastToken, d.lastErr
     65 }
     66 
     67 // Read returns the next token.
     68 // It will return an error if there is no valid token.
     69 func (d *Decoder) Read() (Token, error) {
     70 	defer func() { d.lastCall = readCall }()
     71 	if d.lastCall == peekCall {
     72 		return d.lastToken, d.lastErr
     73 	}
     74 
     75 	tok, err := d.parseNext(d.lastToken.kind)
     76 	if err != nil {
     77 		return Token{}, err
     78 	}
     79 
     80 	switch tok.kind {
     81 	case comma, semicolon:
     82 		tok, err = d.parseNext(tok.kind)
     83 		if err != nil {
     84 			return Token{}, err
     85 		}
     86 	}
     87 	d.lastToken = tok
     88 	return tok, nil
     89 }
     90 
// Format strings for syntax errors reported by parseNext. Each takes the
// offending character as its single %q argument.
const (
	// mismatchedFmt is used when a message is closed with the close character
	// of the other bracket style (e.g. '>' for a message opened with '{').
	mismatchedFmt = "mismatched close character %q"
	// unexpectedFmt is used when a character is not valid at the current
	// position inside a list.
	unexpectedFmt = "unexpected character %q"
)
     95 
// parseNext parses the next Token based on given last kind.
//
// Leading whitespace and comments are skipped first. The set of tokens that
// may legally follow is then chosen from lastKind and, where it matters, from
// the kind of container (top level, message or list) that is currently open
// according to openStack. Running out of input inside a message or list, or
// right after a Name, yields ErrUnexpectedEOF; at the top level it yields an
// EOF token. Combinations that the decoder itself should never produce fall
// through to the panic at the end of the function.
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
	// Trim leading spaces.
	d.consume(0)
	isEOF := false
	if len(d.in) == 0 {
		isEOF = true
	}

	switch lastKind {
	case EOF:
		// Once EOF has been reached, keep returning EOF.
		return d.consumeToken(EOF, 0, 0), nil

	case bof:
		// Start of top level message. Next token can be EOF or Name.
		if isEOF {
			return d.consumeToken(EOF, 0, 0), nil
		}
		return d.parseFieldName()

	case Name:
		// Next token can be MessageOpen, ListOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		case '[':
			d.pushOpenStack(ch)
			return d.consumeToken(ListOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case Scalar:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch d.in[0] {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// Close character of the other bracket style.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case ']':
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case MessageOpen:
		// Next token can be MessageClose or Name.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		_, closeCh := d.currentOpenKind()
		switch ch := d.in[0]; ch {
		case closeCh:
			d.popOpenStack()
			return d.consumeToken(MessageClose, 1, 0), nil
		case otherCloseChar[closeCh]:
			// Close character of the other bracket style.
			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
		default:
			return d.parseFieldName()
		}

	case MessageClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// Close character of the other bracket style.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case ListOpen:
		// Next token can be ListClose, MessageOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case ']':
			d.popOpenStack()
			return d.consumeToken(ListClose, 1, 0), nil
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case ListClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// Close character of the other bracket style.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		default:
			// It is not possible to have this case. Let it panic below.
		}

	case comma, semicolon:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message. Next token can be EOF or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			return d.parseFieldName()

		case MessageOpen:
			// Next token can be MessageClose or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// Close character of the other bracket style.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			if lastKind == semicolon {
				// It should not be possible to have this case as the logic
				// here should not have produced a semicolon Token when
				// inside a list. Let it panic below.
				break
			}
			// Next token can be MessageOpen or Scalar.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case '{', '<':
				d.pushOpenStack(ch)
				return d.consumeToken(MessageOpen, 1, 0), nil
			default:
				return d.parseScalar()
			}
		}
	}

	// Reaching this point means lastKind and the open container form a
	// combination that this function should never have produced.
	line, column := d.Position(len(d.orig) - len(d.in))
	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
}
    360 
// otherCloseChar maps each message close character to the close character of
// the other bracket style. parseNext uses it to detect a message that is
// closed with the wrong kind of bracket. Looking up any other byte (such as
// ']') yields the zero byte.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}
    365 
    366 // currentOpenKind indicates whether current position is inside a message, list
    367 // or top-level message by returning MessageOpen, ListOpen or bof respectively.
    368 // If the returned kind is either a MessageOpen or ListOpen, it also returns the
    369 // corresponding closing character.
    370 func (d *Decoder) currentOpenKind() (Kind, byte) {
    371 	if len(d.openStack) == 0 {
    372 		return bof, 0
    373 	}
    374 	openCh := d.openStack[len(d.openStack)-1]
    375 	switch openCh {
    376 	case '{':
    377 		return MessageOpen, '}'
    378 	case '<':
    379 		return MessageOpen, '>'
    380 	case '[':
    381 		return ListOpen, ']'
    382 	}
    383 	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
    384 }
    385 
    386 func (d *Decoder) pushOpenStack(ch byte) {
    387 	d.openStack = append(d.openStack, ch)
    388 }
    389 
    390 func (d *Decoder) popOpenStack() {
    391 	d.openStack = d.openStack[:len(d.openStack)-1]
    392 }
    393 
    394 // parseFieldName parses field name and separator.
    395 func (d *Decoder) parseFieldName() (tok Token, err error) {
    396 	defer func() {
    397 		if err == nil && d.tryConsumeChar(':') {
    398 			tok.attrs |= hasSeparator
    399 		}
    400 	}()
    401 
    402 	// Extension or Any type URL.
    403 	if d.in[0] == '[' {
    404 		return d.parseTypeName()
    405 	}
    406 
    407 	// Identifier.
    408 	if size := parseIdent(d.in, false); size > 0 {
    409 		return d.consumeToken(Name, size, uint8(IdentName)), nil
    410 	}
    411 
    412 	// Field number. Identify if input is a valid number that is not negative
    413 	// and is decimal integer within 32-bit range.
    414 	if num := parseNumber(d.in); num.size > 0 {
    415 		str := num.string(d.in)
    416 		if !num.neg && num.kind == numDec {
    417 			if _, err := strconv.ParseInt(str, 10, 32); err == nil {
    418 				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
    419 			}
    420 		}
    421 		return Token{}, d.newSyntaxError("invalid field number: %s", str)
    422 	}
    423 
    424 	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
    425 }
    426 
// parseTypeName parses Any type URL or extension field name. The name is
// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
// strings. This implementation is more liberal and allows for the pattern
// `^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`. Whitespaces and comments are allowed
// in between [ ], '.', '/' and the sub names.
//
// On success the returned Token has kind Name and attrs TypeName; raw spans
// the input from '[' through ']' and str holds the name with whitespace and
// comments removed. ErrUnexpectedEOF is returned if the input ends before the
// closing ']'.
func (d *Decoder) parseTypeName() (Token, error) {
	startPos := len(d.orig) - len(d.in)
	// Use alias s to advance first in order to use d.in for error handling.
	// Caller already checks for [ as first character.
	s := consume(d.in[1:], 0)
	if len(s) == 0 {
		return Token{}, ErrUnexpectedEOF
	}

	// Collect the first sub name.
	var name []byte
	for len(s) > 0 && isTypeNameChar(s[0]) {
		name = append(name, s[0])
		s = s[1:]
	}
	s = consume(s, 0)

	// Collect any further "<separator><sub name>" parts until the closing ']'.
	var closed bool
	for len(s) > 0 && !closed {
		switch {
		case s[0] == ']':
			s = s[1:]
			closed = true

		case s[0] == '/', s[0] == '.':
			// Two separators in a row are not allowed.
			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
					d.orig[startPos:len(d.orig)-len(s)+1])
			}
			name = append(name, s[0])
			s = s[1:]
			s = consume(s, 0)
			for len(s) > 0 && isTypeNameChar(s[0]) {
				name = append(name, s[0])
				s = s[1:]
			}
			s = consume(s, 0)

		default:
			// The +1 includes the offending character in the error message.
			return Token{}, d.newSyntaxError(
				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
		}
	}

	if !closed {
		return Token{}, ErrUnexpectedEOF
	}

	// First character cannot be '.'. Last character cannot be '.' or '/'.
	size := len(name)
	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
			d.orig[startPos:len(d.orig)-len(s)])
	}

	// Commit the consumed input, then skip trailing whitespace and comments.
	// endPos is taken before the skip so that raw ends at the closing ']'.
	d.in = s
	endPos := len(d.orig) - len(d.in)
	d.consume(0)

	return Token{
		kind:  Name,
		attrs: uint8(TypeName),
		pos:   startPos,
		raw:   d.orig[startPos:endPos],
		str:   string(name),
	}, nil
}
    498 
    499 func isTypeNameChar(b byte) bool {
    500 	return (b == '-' || b == '_' ||
    501 		('0' <= b && b <= '9') ||
    502 		('a' <= b && b <= 'z') ||
    503 		('A' <= b && b <= 'Z'))
    504 }
    505 
    506 func isWhiteSpace(b byte) bool {
    507 	switch b {
    508 	case ' ', '\n', '\r', '\t':
    509 		return true
    510 	default:
    511 		return false
    512 	}
    513 }
    514 
    515 // parseIdent parses an unquoted proto identifier and returns size.
    516 // If allowNeg is true, it allows '-' to be the first character in the
    517 // identifier. This is used when parsing literal values like -infinity, etc.
    518 // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
    519 func parseIdent(input []byte, allowNeg bool) int {
    520 	var size int
    521 
    522 	s := input
    523 	if len(s) == 0 {
    524 		return 0
    525 	}
    526 
    527 	if allowNeg && s[0] == '-' {
    528 		s = s[1:]
    529 		size++
    530 		if len(s) == 0 {
    531 			return 0
    532 		}
    533 	}
    534 
    535 	switch {
    536 	case s[0] == '_',
    537 		'a' <= s[0] && s[0] <= 'z',
    538 		'A' <= s[0] && s[0] <= 'Z':
    539 		s = s[1:]
    540 		size++
    541 	default:
    542 		return 0
    543 	}
    544 
    545 	for len(s) > 0 && (s[0] == '_' ||
    546 		'a' <= s[0] && s[0] <= 'z' ||
    547 		'A' <= s[0] && s[0] <= 'Z' ||
    548 		'0' <= s[0] && s[0] <= '9') {
    549 		s = s[1:]
    550 		size++
    551 	}
    552 
    553 	if len(s) > 0 && !isDelim(s[0]) {
    554 		return 0
    555 	}
    556 
    557 	return size
    558 }
    559 
    560 // parseScalar parses for a string, literal or number value.
    561 func (d *Decoder) parseScalar() (Token, error) {
    562 	if d.in[0] == '"' || d.in[0] == '\'' {
    563 		return d.parseStringValue()
    564 	}
    565 
    566 	if tok, ok := d.parseLiteralValue(); ok {
    567 		return tok, nil
    568 	}
    569 
    570 	if tok, ok := d.parseNumberValue(); ok {
    571 		return tok, nil
    572 	}
    573 
    574 	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
    575 }
    576 
    577 // parseLiteralValue parses a literal value. A literal value is used for
    578 // bools, special floats and enums. This function simply identifies that the
    579 // field value is a literal.
    580 func (d *Decoder) parseLiteralValue() (Token, bool) {
    581 	size := parseIdent(d.in, true)
    582 	if size == 0 {
    583 		return Token{}, false
    584 	}
    585 	return d.consumeToken(Scalar, size, literalValue), true
    586 }
    587 
    588 // consumeToken constructs a Token for given Kind from d.in and consumes given
    589 // size-length from it.
    590 func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
    591 	// Important to compute raw and pos before consuming.
    592 	tok := Token{
    593 		kind:  kind,
    594 		attrs: attrs,
    595 		pos:   len(d.orig) - len(d.in),
    596 		raw:   d.in[:size],
    597 	}
    598 	d.consume(size)
    599 	return tok
    600 }
    601 
    602 // newSyntaxError returns a syntax error with line and column information for
    603 // current position.
    604 func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
    605 	e := errors.New(f, x...)
    606 	line, column := d.Position(len(d.orig) - len(d.in))
    607 	return errors.New("syntax error (line %d:%d): %v", line, column, e)
    608 }
    609 
    610 // Position returns line and column number of given index of the original input.
    611 // It will panic if index is out of range.
    612 func (d *Decoder) Position(idx int) (line int, column int) {
    613 	b := d.orig[:idx]
    614 	line = bytes.Count(b, []byte("\n")) + 1
    615 	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
    616 		b = b[i+1:]
    617 	}
    618 	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
    619 	return line, column
    620 }
    621 
    622 func (d *Decoder) tryConsumeChar(c byte) bool {
    623 	if len(d.in) > 0 && d.in[0] == c {
    624 		d.consume(1)
    625 		return true
    626 	}
    627 	return false
    628 }
    629 
    630 // consume consumes n bytes of input and any subsequent whitespace or comments.
    631 func (d *Decoder) consume(n int) {
    632 	d.in = consume(d.in, n)
    633 	return
    634 }
    635 
    636 // consume consumes n bytes of input and any subsequent whitespace or comments.
    637 func consume(b []byte, n int) []byte {
    638 	b = b[n:]
    639 	for len(b) > 0 {
    640 		switch b[0] {
    641 		case ' ', '\n', '\r', '\t':
    642 			b = b[1:]
    643 		case '#':
    644 			if i := bytes.IndexByte(b, '\n'); i >= 0 {
    645 				b = b[i+len("\n"):]
    646 			} else {
    647 				b = nil
    648 			}
    649 		default:
    650 			return b
    651 		}
    652 	}
    653 	return b
    654 }
    655 
    656 // errId extracts a byte sequence that looks like an invalid ID
    657 // (for the purposes of error reporting).
    658 func errId(seq []byte) []byte {
    659 	const maxLen = 32
    660 	for i := 0; i < len(seq); {
    661 		if i > maxLen {
    662 			return append(seq[:i:i], "…"...)
    663 		}
    664 		r, size := utf8.DecodeRune(seq[i:])
    665 		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
    666 			if i == 0 {
    667 				// Either the first byte is invalid UTF-8 or a
    668 				// delimiter, or the first rune is non-ASCII.
    669 				// Return it as-is.
    670 				i = size
    671 			}
    672 			return seq[:i:i]
    673 		}
    674 		i += size
    675 	}
    676 	// No delimiter found.
    677 	return seq
    678 }
    679 
    680 // isDelim returns true if given byte is a delimiter character.
    681 func isDelim(c byte) bool {
    682 	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
    683 		('a' <= c && c <= 'z') ||
    684 		('A' <= c && c <= 'Z') ||
    685 		('0' <= c && c <= '9'))
    686 }