lex.go (11583B)
1 // Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html. 2 package html 3 4 import ( 5 "strconv" 6 7 "github.com/tdewolff/parse/v2" 8 ) 9 10 // TokenType determines the type of token, eg. a number or a semicolon. 11 type TokenType uint32 12 13 // TokenType values. 14 const ( 15 ErrorToken TokenType = iota // extra token when errors occur 16 CommentToken 17 DoctypeToken 18 StartTagToken 19 StartTagCloseToken 20 StartTagVoidToken 21 EndTagToken 22 AttributeToken 23 TextToken 24 SvgToken 25 MathToken 26 ) 27 28 // String returns the string representation of a TokenType. 29 func (tt TokenType) String() string { 30 switch tt { 31 case ErrorToken: 32 return "Error" 33 case CommentToken: 34 return "Comment" 35 case DoctypeToken: 36 return "Doctype" 37 case StartTagToken: 38 return "StartTag" 39 case StartTagCloseToken: 40 return "StartTagClose" 41 case StartTagVoidToken: 42 return "StartTagVoid" 43 case EndTagToken: 44 return "EndTag" 45 case AttributeToken: 46 return "Attribute" 47 case TextToken: 48 return "Text" 49 case SvgToken: 50 return "Svg" 51 case MathToken: 52 return "Math" 53 } 54 return "Invalid(" + strconv.Itoa(int(tt)) + ")" 55 } 56 57 //////////////////////////////////////////////////////////////// 58 59 // Lexer is the state for the lexer. 60 type Lexer struct { 61 r *parse.Input 62 err error 63 64 rawTag Hash 65 inTag bool 66 67 text []byte 68 attrVal []byte 69 } 70 71 // NewLexer returns a new Lexer for a given io.Reader. 72 func NewLexer(r *parse.Input) *Lexer { 73 return &Lexer{ 74 r: r, 75 } 76 } 77 78 // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. 79 func (l *Lexer) Err() error { 80 if l.err != nil { 81 return l.err 82 } 83 return l.r.Err() 84 } 85 86 // Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. 87 func (l *Lexer) Text() []byte { 88 return l.text 89 } 90 91 // AttrVal returns the attribute value when an AttributeToken was returned from Next. 92 func (l *Lexer) AttrVal() []byte { 93 return l.attrVal 94 } 95 96 // Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. 97 func (l *Lexer) Next() (TokenType, []byte) { 98 l.text = nil 99 var c byte 100 if l.inTag { 101 l.attrVal = nil 102 for { // before attribute name state 103 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { 104 l.r.Move(1) 105 continue 106 } 107 break 108 } 109 if c == 0 && l.r.Err() != nil { 110 return ErrorToken, nil 111 } else if c != '>' && (c != '/' || l.r.Peek(1) != '>') { 112 return AttributeToken, l.shiftAttribute() 113 } 114 l.r.Skip() 115 l.inTag = false 116 if c == '/' { 117 l.r.Move(2) 118 return StartTagVoidToken, l.r.Shift() 119 } 120 l.r.Move(1) 121 return StartTagCloseToken, l.r.Shift() 122 } 123 124 if l.rawTag != 0 { 125 if rawText := l.shiftRawText(); len(rawText) > 0 { 126 l.text = rawText 127 l.rawTag = 0 128 return TextToken, rawText 129 } 130 l.rawTag = 0 131 } 132 133 for { 134 c = l.r.Peek(0) 135 if c == '<' { 136 c = l.r.Peek(1) 137 isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil) 138 if l.r.Pos() > 0 { 139 if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' { 140 // return currently buffered texttoken so that we can return tag next iteration 141 l.text = l.r.Shift() 142 return TextToken, l.text 143 } 144 } else if isEndTag { 145 l.r.Move(2) 146 // only endtags that are not followed by > or EOF arrive here 147 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { 148 return CommentToken, l.shiftBogusComment() 149 } 150 return EndTagToken, l.shiftEndTag() 151 } else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { 152 l.r.Move(1) 153 l.inTag = true 154 return l.shiftStartTag() 155 } else if c == '!' { 156 l.r.Move(2) 157 return l.readMarkup() 158 } else if c == '?' { 159 l.r.Move(1) 160 return CommentToken, l.shiftBogusComment() 161 } 162 } else if c == 0 && l.r.Err() != nil { 163 if l.r.Pos() > 0 { 164 l.text = l.r.Shift() 165 return TextToken, l.text 166 } 167 return ErrorToken, nil 168 } 169 l.r.Move(1) 170 } 171 } 172 173 //////////////////////////////////////////////////////////////// 174 175 // The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html 176 177 func (l *Lexer) shiftRawText() []byte { 178 if l.rawTag == Plaintext { 179 for { 180 if l.r.Peek(0) == 0 && l.r.Err() != nil { 181 return l.r.Shift() 182 } 183 l.r.Move(1) 184 } 185 } else { // RCDATA, RAWTEXT and SCRIPT 186 for { 187 c := l.r.Peek(0) 188 if c == '<' { 189 if l.r.Peek(1) == '/' { 190 mark := l.r.Pos() 191 l.r.Move(2) 192 for { 193 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { 194 break 195 } 196 l.r.Move(1) 197 } 198 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice 199 l.r.Rewind(mark) 200 return l.r.Shift() 201 } 202 } else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { 203 l.r.Move(4) 204 inScript := false 205 for { 206 c := l.r.Peek(0) 207 if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { 208 l.r.Move(3) 209 break 210 } else if c == '<' { 211 isEnd := l.r.Peek(1) == '/' 212 if isEnd { 213 l.r.Move(2) 214 } else { 215 l.r.Move(1) 216 } 217 mark := l.r.Pos() 218 for { 219 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { 220 break 221 } 222 l.r.Move(1) 223 } 224 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice 225 if !isEnd { 226 inScript = true 227 } else { 228 if !inScript { 229 l.r.Rewind(mark - 2) 230 return l.r.Shift() 231 } 232 inScript = false 233 } 234 } 235 } else if c == 0 && l.r.Err() != nil { 236 return l.r.Shift() 237 } else { 238 l.r.Move(1) 239 } 240 } 241 } else { 242 l.r.Move(1) 243 } 244 } else if c == 0 && l.r.Err() != nil { 245 return l.r.Shift() 246 } else { 247 l.r.Move(1) 248 } 249 } 250 } 251 } 252 253 func (l *Lexer) readMarkup() (TokenType, []byte) { 254 if l.at('-', '-') { 255 l.r.Move(2) 256 for { 257 if l.r.Peek(0) == 0 && l.r.Err() != nil { 258 l.text = l.r.Lexeme()[4:] 259 return CommentToken, l.r.Shift() 260 } else if l.at('-', '-', '>') { 261 l.text = l.r.Lexeme()[4:] 262 l.r.Move(3) 263 return CommentToken, l.r.Shift() 264 } else if l.at('-', '-', '!', '>') { 265 l.text = l.r.Lexeme()[4:] 266 l.r.Move(4) 267 return CommentToken, l.r.Shift() 268 } 269 l.r.Move(1) 270 } 271 } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { 272 l.r.Move(7) 273 for { 274 if l.r.Peek(0) == 0 && l.r.Err() != nil { 275 l.text = l.r.Lexeme()[9:] 276 return TextToken, l.r.Shift() 277 } else if l.at(']', ']', '>') { 278 l.text = l.r.Lexeme()[9:] 279 l.r.Move(3) 280 return TextToken, l.r.Shift() 281 } 282 l.r.Move(1) 283 } 284 } else { 285 if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') { 286 l.r.Move(7) 287 if l.r.Peek(0) == ' ' { 288 l.r.Move(1) 289 } 290 for { 291 if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil { 292 l.text = l.r.Lexeme()[9:] 293 if c == '>' { 294 l.r.Move(1) 295 } 296 return DoctypeToken, l.r.Shift() 297 } 298 l.r.Move(1) 299 } 300 } 301 } 302 return CommentToken, l.shiftBogusComment() 303 } 304 305 func (l *Lexer) shiftBogusComment() []byte { 306 for { 307 c := l.r.Peek(0) 308 if c == '>' { 309 l.text = l.r.Lexeme()[2:] 310 l.r.Move(1) 311 return l.r.Shift() 312 } else if c == 0 && l.r.Err() != nil { 313 l.text = l.r.Lexeme()[2:] 314 return l.r.Shift() 315 } 316 l.r.Move(1) 317 } 318 } 319 320 func (l *Lexer) shiftStartTag() (TokenType, []byte) { 321 for { 322 if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { 323 break 324 } 325 l.r.Move(1) 326 } 327 l.text = parse.ToLower(l.r.Lexeme()[1:]) 328 if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math { 329 if h == Svg || h == Math { 330 data := l.shiftXML(h) 331 if l.err != nil { 332 return ErrorToken, nil 333 } 334 335 l.inTag = false 336 if h == Svg { 337 return SvgToken, data 338 } 339 return MathToken, data 340 } 341 l.rawTag = h 342 } 343 return StartTagToken, l.r.Shift() 344 } 345 346 func (l *Lexer) shiftAttribute() []byte { 347 nameStart := l.r.Pos() 348 var c byte 349 for { // attribute name state 350 if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { 351 break 352 } 353 l.r.Move(1) 354 } 355 nameEnd := l.r.Pos() 356 for { // after attribute name state 357 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { 358 l.r.Move(1) 359 continue 360 } 361 break 362 } 363 if c == '=' { 364 l.r.Move(1) 365 for { // before attribute value state 366 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { 367 l.r.Move(1) 368 continue 369 } 370 break 371 } 372 attrPos := l.r.Pos() 373 delim := c 374 if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state 375 l.r.Move(1) 376 for { 377 c := l.r.Peek(0) 378 if c == delim { 379 l.r.Move(1) 380 break 381 } else if c == 0 && l.r.Err() != nil { 382 break 383 } 384 l.r.Move(1) 385 } 386 } else { // attribute value unquoted state 387 for { 388 if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { 389 break 390 } 391 l.r.Move(1) 392 } 393 } 394 l.attrVal = l.r.Lexeme()[attrPos:] 395 } else { 396 l.r.Rewind(nameEnd) 397 l.attrVal = nil 398 } 399 l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd]) 400 return l.r.Shift() 401 } 402 403 func (l *Lexer) shiftEndTag() []byte { 404 for { 405 c := l.r.Peek(0) 406 if c == '>' { 407 l.text = l.r.Lexeme()[2:] 408 l.r.Move(1) 409 break 410 } else if c == 0 && l.r.Err() != nil { 411 l.text = l.r.Lexeme()[2:] 412 break 413 } 414 l.r.Move(1) 415 } 416 417 end := len(l.text) 418 for end > 0 { 419 if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { 420 end-- 421 continue 422 } 423 break 424 } 425 l.text = l.text[:end] 426 return parse.ToLower(l.r.Shift()) 427 } 428 429 // shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself. 430 // So far we have already parsed `<svg` or `<math`. 431 func (l *Lexer) shiftXML(rawTag Hash) []byte { 432 inQuote := false 433 for { 434 c := l.r.Peek(0) 435 if c == '"' { 436 inQuote = !inQuote 437 l.r.Move(1) 438 } else if c == '<' && !inQuote && l.r.Peek(1) == '/' { 439 mark := l.r.Pos() 440 l.r.Move(2) 441 for { 442 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { 443 break 444 } 445 l.r.Move(1) 446 } 447 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice 448 break 449 } 450 } else if c == 0 { 451 if l.r.Err() == nil { 452 l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") 453 } 454 return l.r.Shift() 455 } else { 456 l.r.Move(1) 457 } 458 } 459 460 for { 461 c := l.r.Peek(0) 462 if c == '>' { 463 l.r.Move(1) 464 break 465 } else if c == 0 { 466 if l.r.Err() == nil { 467 l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") 468 } 469 return l.r.Shift() 470 } 471 l.r.Move(1) 472 } 473 return l.r.Shift() 474 } 475 476 //////////////////////////////////////////////////////////////// 477 478 func (l *Lexer) at(b ...byte) bool { 479 for i, c := range b { 480 if l.r.Peek(i) != c { 481 return false 482 } 483 } 484 return true 485 } 486 487 func (l *Lexer) atCaseInsensitive(b ...byte) bool { 488 for i, c := range b { 489 if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c { 490 return false 491 } 492 } 493 return true 494 }