gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

lex.go (11583B)


      1 // Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
      2 package html
      3 
      4 import (
      5 	"strconv"
      6 
      7 	"github.com/tdewolff/parse/v2"
      8 )
      9 
     10 // TokenType determines the type of token, eg. a number or a semicolon.
     11 type TokenType uint32
     12 
     13 // TokenType values.
     14 const (
     15 	ErrorToken TokenType = iota // extra token when errors occur
     16 	CommentToken
     17 	DoctypeToken
     18 	StartTagToken
     19 	StartTagCloseToken
     20 	StartTagVoidToken
     21 	EndTagToken
     22 	AttributeToken
     23 	TextToken
     24 	SvgToken
     25 	MathToken
     26 )
     27 
     28 // String returns the string representation of a TokenType.
     29 func (tt TokenType) String() string {
     30 	switch tt {
     31 	case ErrorToken:
     32 		return "Error"
     33 	case CommentToken:
     34 		return "Comment"
     35 	case DoctypeToken:
     36 		return "Doctype"
     37 	case StartTagToken:
     38 		return "StartTag"
     39 	case StartTagCloseToken:
     40 		return "StartTagClose"
     41 	case StartTagVoidToken:
     42 		return "StartTagVoid"
     43 	case EndTagToken:
     44 		return "EndTag"
     45 	case AttributeToken:
     46 		return "Attribute"
     47 	case TextToken:
     48 		return "Text"
     49 	case SvgToken:
     50 		return "Svg"
     51 	case MathToken:
     52 		return "Math"
     53 	}
     54 	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
     55 }
     56 
     57 ////////////////////////////////////////////////////////////////
     58 
     59 // Lexer is the state for the lexer.
     60 type Lexer struct {
     61 	r   *parse.Input
     62 	err error
     63 
     64 	rawTag Hash
     65 	inTag  bool
     66 
     67 	text    []byte
     68 	attrVal []byte
     69 }
     70 
     71 // NewLexer returns a new Lexer for a given io.Reader.
     72 func NewLexer(r *parse.Input) *Lexer {
     73 	return &Lexer{
     74 		r: r,
     75 	}
     76 }
     77 
     78 // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
     79 func (l *Lexer) Err() error {
     80 	if l.err != nil {
     81 		return l.err
     82 	}
     83 	return l.r.Err()
     84 }
     85 
     86 // Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
     87 func (l *Lexer) Text() []byte {
     88 	return l.text
     89 }
     90 
     91 // AttrVal returns the attribute value when an AttributeToken was returned from Next.
     92 func (l *Lexer) AttrVal() []byte {
     93 	return l.attrVal
     94 }
     95 
     96 // Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
     97 func (l *Lexer) Next() (TokenType, []byte) {
     98 	l.text = nil
     99 	var c byte
    100 	if l.inTag {
    101 		l.attrVal = nil
    102 		for { // before attribute name state
    103 			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
    104 				l.r.Move(1)
    105 				continue
    106 			}
    107 			break
    108 		}
    109 		if c == 0 && l.r.Err() != nil {
    110 			return ErrorToken, nil
    111 		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
    112 			return AttributeToken, l.shiftAttribute()
    113 		}
    114 		l.r.Skip()
    115 		l.inTag = false
    116 		if c == '/' {
    117 			l.r.Move(2)
    118 			return StartTagVoidToken, l.r.Shift()
    119 		}
    120 		l.r.Move(1)
    121 		return StartTagCloseToken, l.r.Shift()
    122 	}
    123 
    124 	if l.rawTag != 0 {
    125 		if rawText := l.shiftRawText(); len(rawText) > 0 {
    126 			l.text = rawText
    127 			l.rawTag = 0
    128 			return TextToken, rawText
    129 		}
    130 		l.rawTag = 0
    131 	}
    132 
    133 	for {
    134 		c = l.r.Peek(0)
    135 		if c == '<' {
    136 			c = l.r.Peek(1)
    137 			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
    138 			if l.r.Pos() > 0 {
    139 				if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
    140 					// return currently buffered texttoken so that we can return tag next iteration
    141 					l.text = l.r.Shift()
    142 					return TextToken, l.text
    143 				}
    144 			} else if isEndTag {
    145 				l.r.Move(2)
    146 				// only endtags that are not followed by > or EOF arrive here
    147 				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
    148 					return CommentToken, l.shiftBogusComment()
    149 				}
    150 				return EndTagToken, l.shiftEndTag()
    151 			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
    152 				l.r.Move(1)
    153 				l.inTag = true
    154 				return l.shiftStartTag()
    155 			} else if c == '!' {
    156 				l.r.Move(2)
    157 				return l.readMarkup()
    158 			} else if c == '?' {
    159 				l.r.Move(1)
    160 				return CommentToken, l.shiftBogusComment()
    161 			}
    162 		} else if c == 0 && l.r.Err() != nil {
    163 			if l.r.Pos() > 0 {
    164 				l.text = l.r.Shift()
    165 				return TextToken, l.text
    166 			}
    167 			return ErrorToken, nil
    168 		}
    169 		l.r.Move(1)
    170 	}
    171 }
    172 
    173 ////////////////////////////////////////////////////////////////
    174 
    175 // The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html
    176 
    177 func (l *Lexer) shiftRawText() []byte {
    178 	if l.rawTag == Plaintext {
    179 		for {
    180 			if l.r.Peek(0) == 0 && l.r.Err() != nil {
    181 				return l.r.Shift()
    182 			}
    183 			l.r.Move(1)
    184 		}
    185 	} else { // RCDATA, RAWTEXT and SCRIPT
    186 		for {
    187 			c := l.r.Peek(0)
    188 			if c == '<' {
    189 				if l.r.Peek(1) == '/' {
    190 					mark := l.r.Pos()
    191 					l.r.Move(2)
    192 					for {
    193 						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
    194 							break
    195 						}
    196 						l.r.Move(1)
    197 					}
    198 					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
    199 						l.r.Rewind(mark)
    200 						return l.r.Shift()
    201 					}
    202 				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
    203 					l.r.Move(4)
    204 					inScript := false
    205 					for {
    206 						c := l.r.Peek(0)
    207 						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
    208 							l.r.Move(3)
    209 							break
    210 						} else if c == '<' {
    211 							isEnd := l.r.Peek(1) == '/'
    212 							if isEnd {
    213 								l.r.Move(2)
    214 							} else {
    215 								l.r.Move(1)
    216 							}
    217 							mark := l.r.Pos()
    218 							for {
    219 								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
    220 									break
    221 								}
    222 								l.r.Move(1)
    223 							}
    224 							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
    225 								if !isEnd {
    226 									inScript = true
    227 								} else {
    228 									if !inScript {
    229 										l.r.Rewind(mark - 2)
    230 										return l.r.Shift()
    231 									}
    232 									inScript = false
    233 								}
    234 							}
    235 						} else if c == 0 && l.r.Err() != nil {
    236 							return l.r.Shift()
    237 						} else {
    238 							l.r.Move(1)
    239 						}
    240 					}
    241 				} else {
    242 					l.r.Move(1)
    243 				}
    244 			} else if c == 0 && l.r.Err() != nil {
    245 				return l.r.Shift()
    246 			} else {
    247 				l.r.Move(1)
    248 			}
    249 		}
    250 	}
    251 }
    252 
    253 func (l *Lexer) readMarkup() (TokenType, []byte) {
    254 	if l.at('-', '-') {
    255 		l.r.Move(2)
    256 		for {
    257 			if l.r.Peek(0) == 0 && l.r.Err() != nil {
    258 				l.text = l.r.Lexeme()[4:]
    259 				return CommentToken, l.r.Shift()
    260 			} else if l.at('-', '-', '>') {
    261 				l.text = l.r.Lexeme()[4:]
    262 				l.r.Move(3)
    263 				return CommentToken, l.r.Shift()
    264 			} else if l.at('-', '-', '!', '>') {
    265 				l.text = l.r.Lexeme()[4:]
    266 				l.r.Move(4)
    267 				return CommentToken, l.r.Shift()
    268 			}
    269 			l.r.Move(1)
    270 		}
    271 	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
    272 		l.r.Move(7)
    273 		for {
    274 			if l.r.Peek(0) == 0 && l.r.Err() != nil {
    275 				l.text = l.r.Lexeme()[9:]
    276 				return TextToken, l.r.Shift()
    277 			} else if l.at(']', ']', '>') {
    278 				l.text = l.r.Lexeme()[9:]
    279 				l.r.Move(3)
    280 				return TextToken, l.r.Shift()
    281 			}
    282 			l.r.Move(1)
    283 		}
    284 	} else {
    285 		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
    286 			l.r.Move(7)
    287 			if l.r.Peek(0) == ' ' {
    288 				l.r.Move(1)
    289 			}
    290 			for {
    291 				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
    292 					l.text = l.r.Lexeme()[9:]
    293 					if c == '>' {
    294 						l.r.Move(1)
    295 					}
    296 					return DoctypeToken, l.r.Shift()
    297 				}
    298 				l.r.Move(1)
    299 			}
    300 		}
    301 	}
    302 	return CommentToken, l.shiftBogusComment()
    303 }
    304 
    305 func (l *Lexer) shiftBogusComment() []byte {
    306 	for {
    307 		c := l.r.Peek(0)
    308 		if c == '>' {
    309 			l.text = l.r.Lexeme()[2:]
    310 			l.r.Move(1)
    311 			return l.r.Shift()
    312 		} else if c == 0 && l.r.Err() != nil {
    313 			l.text = l.r.Lexeme()[2:]
    314 			return l.r.Shift()
    315 		}
    316 		l.r.Move(1)
    317 	}
    318 }
    319 
    320 func (l *Lexer) shiftStartTag() (TokenType, []byte) {
    321 	for {
    322 		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
    323 			break
    324 		}
    325 		l.r.Move(1)
    326 	}
    327 	l.text = parse.ToLower(l.r.Lexeme()[1:])
    328 	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
    329 		if h == Svg || h == Math {
    330 			data := l.shiftXML(h)
    331 			if l.err != nil {
    332 				return ErrorToken, nil
    333 			}
    334 
    335 			l.inTag = false
    336 			if h == Svg {
    337 				return SvgToken, data
    338 			}
    339 			return MathToken, data
    340 		}
    341 		l.rawTag = h
    342 	}
    343 	return StartTagToken, l.r.Shift()
    344 }
    345 
    346 func (l *Lexer) shiftAttribute() []byte {
    347 	nameStart := l.r.Pos()
    348 	var c byte
    349 	for { // attribute name state
    350 		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
    351 			break
    352 		}
    353 		l.r.Move(1)
    354 	}
    355 	nameEnd := l.r.Pos()
    356 	for { // after attribute name state
    357 		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
    358 			l.r.Move(1)
    359 			continue
    360 		}
    361 		break
    362 	}
    363 	if c == '=' {
    364 		l.r.Move(1)
    365 		for { // before attribute value state
    366 			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
    367 				l.r.Move(1)
    368 				continue
    369 			}
    370 			break
    371 		}
    372 		attrPos := l.r.Pos()
    373 		delim := c
    374 		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
    375 			l.r.Move(1)
    376 			for {
    377 				c := l.r.Peek(0)
    378 				if c == delim {
    379 					l.r.Move(1)
    380 					break
    381 				} else if c == 0 && l.r.Err() != nil {
    382 					break
    383 				}
    384 				l.r.Move(1)
    385 			}
    386 		} else { // attribute value unquoted state
    387 			for {
    388 				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
    389 					break
    390 				}
    391 				l.r.Move(1)
    392 			}
    393 		}
    394 		l.attrVal = l.r.Lexeme()[attrPos:]
    395 	} else {
    396 		l.r.Rewind(nameEnd)
    397 		l.attrVal = nil
    398 	}
    399 	l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd])
    400 	return l.r.Shift()
    401 }
    402 
    403 func (l *Lexer) shiftEndTag() []byte {
    404 	for {
    405 		c := l.r.Peek(0)
    406 		if c == '>' {
    407 			l.text = l.r.Lexeme()[2:]
    408 			l.r.Move(1)
    409 			break
    410 		} else if c == 0 && l.r.Err() != nil {
    411 			l.text = l.r.Lexeme()[2:]
    412 			break
    413 		}
    414 		l.r.Move(1)
    415 	}
    416 
    417 	end := len(l.text)
    418 	for end > 0 {
    419 		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
    420 			end--
    421 			continue
    422 		}
    423 		break
    424 	}
    425 	l.text = l.text[:end]
    426 	return parse.ToLower(l.r.Shift())
    427 }
    428 
    429 // shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
    430 // So far we have already parsed `<svg` or `<math`.
    431 func (l *Lexer) shiftXML(rawTag Hash) []byte {
    432 	inQuote := false
    433 	for {
    434 		c := l.r.Peek(0)
    435 		if c == '"' {
    436 			inQuote = !inQuote
    437 			l.r.Move(1)
    438 		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
    439 			mark := l.r.Pos()
    440 			l.r.Move(2)
    441 			for {
    442 				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
    443 					break
    444 				}
    445 				l.r.Move(1)
    446 			}
    447 			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
    448 				break
    449 			}
    450 		} else if c == 0 {
    451 			if l.r.Err() == nil {
    452 				l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
    453 			}
    454 			return l.r.Shift()
    455 		} else {
    456 			l.r.Move(1)
    457 		}
    458 	}
    459 
    460 	for {
    461 		c := l.r.Peek(0)
    462 		if c == '>' {
    463 			l.r.Move(1)
    464 			break
    465 		} else if c == 0 {
    466 			if l.r.Err() == nil {
    467 				l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
    468 			}
    469 			return l.r.Shift()
    470 		}
    471 		l.r.Move(1)
    472 	}
    473 	return l.r.Shift()
    474 }
    475 
    476 ////////////////////////////////////////////////////////////////
    477 
    478 func (l *Lexer) at(b ...byte) bool {
    479 	for i, c := range b {
    480 		if l.r.Peek(i) != c {
    481 			return false
    482 		}
    483 	}
    484 	return true
    485 }
    486 
    487 func (l *Lexer) atCaseInsensitive(b ...byte) bool {
    488 	for i, c := range b {
    489 		if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
    490 			return false
    491 		}
    492 	}
    493 	return true
    494 }