gtsocial-umbx

token.go (31496B)


// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"errors"
	"io"
	"strconv"
	"strings"

	"golang.org/x/net/html/atom"
)

// A TokenType is the type of a Token.
type TokenType uint32

const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)

// ErrBufferExceeded means that the buffering limit was exceeded.
var ErrBufferExceeded = errors.New("max buffer exceeded")

// String returns a string representation of the TokenType.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace, Key, Val string
}

// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
	Type     TokenType
	DataAtom atom.Atom
	Data     string
	Attr     []Attribute
}

// tagString returns a string representation of a tag Token's Data and Attr.
func (t Token) tagString() string {
	if len(t.Attr) == 0 {
		return t.Data
	}
	buf := bytes.NewBufferString(t.Data)
	for _, a := range t.Attr {
		buf.WriteByte(' ')
		buf.WriteString(a.Key)
		buf.WriteString(`="`)
		escape(buf, a.Val)
		buf.WriteByte('"')
	}
	return buf.String()
}

// String returns a string representation of the Token.
func (t Token) String() string {
	switch t.Type {
	case ErrorToken:
		return ""
	case TextToken:
		return EscapeString(t.Data)
	case StartTagToken:
		return "<" + t.tagString() + ">"
	case EndTagToken:
		return "</" + t.tagString() + ">"
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		return "<!--" + escapeCommentString(t.Data) + "-->"
	case DoctypeToken:
		return "<!DOCTYPE " + EscapeString(t.Data) + ">"
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
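
// Illustrative sketch, an editorial addition rather than part of the
// upstream file: exampleTokenString is a hypothetical helper showing how a
// manually constructed Token renders via String. Attribute values are
// re-escaped on output, so the unescaped Val "a<b" comes back as "a&lt;b".
func exampleTokenString() string {
	t := Token{
		Type:     StartTagToken,
		DataAtom: atom.A,
		Data:     "a",
		Attr:     []Attribute{{Key: "href", Val: "a<b"}},
	}
	return t.String() // `<a href="a&lt;b">`
}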

// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}

// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != ErrorToken && err != nil to hold: this means that Next returned
	// a valid token but the subsequent Next call will return an error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
	// convertNUL is whether NUL bytes in the current token's data should
	// be converted into \ufffd replacement characters.
	convertNUL bool
	// allowCDATA is whether CDATA sections are allowed in the current context.
	allowCDATA bool
}

// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "<!-- [CDATA[foo]] -->" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content, such as MathML and SVG. However,
// tracking foreign-contentness is difficult to do purely in the tokenizer,
// as opposed to the parser, due to HTML integration points: an <svg> element
// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the
// responsibility of the user of a tokenizer to call AllowCDATA as appropriate.
// In practice, if using the tokenizer without caring whether MathML or SVG
// CDATA is text or comments, such as tokenizing HTML to find all the anchor
// text, it is acceptable to ignore this responsibility.
func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
	z.allowCDATA = allowCDATA
}
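
// Illustrative sketch (editorial addition, not upstream): exampleAllowCDATA
// is a hypothetical helper contrasting the two CDATA modes. With allow set
// to true, "<![CDATA[foo]]>" yields a TextToken whose Text is "foo"; with
// the default false, the same input yields a bogus CommentToken instead.
func exampleAllowCDATA(allow bool) (TokenType, string) {
	z := NewTokenizer(strings.NewReader("<![CDATA[foo]]>"))
	z.AllowCDATA(allow)
	tt := z.Next()
	return tt, string(z.Text())
}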

// NextIsNotRawText instructs the tokenizer that the next token should not be
// considered as 'raw text'. Some elements, such as script and title elements,
// normally require the next token after the opening tag to be 'raw text' that
// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
// an end tag token for "</title>". There are no distinct start tag or end tag
// tokens for the "<b>" and "</b>".
//
// This tokenizer implementation will generally look for raw text at the right
// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
// raw text if in foreign content: <title> generally needs raw text, but a
// <title> inside an <svg> does not. Another example is that a <textarea>
// generally needs raw text, but a <textarea> is not allowed as an immediate
// child of a <select>; in normal parsing, a <textarea> implies </select>, but
// one cannot close the implicit element when parsing a <select>'s InnerHTML.
// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
// ness is difficult to do purely in the tokenizer, as opposed to the parser.
// For strict compliance with the HTML5 tokenization algorithm, it is the
// responsibility of the user of a tokenizer to call NextIsNotRawText as
// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
// responsibility for basic usage.
//
// Note that this 'raw text' concept is different from the one offered by the
// Tokenizer.Raw method.
func (z *Tokenizer) NextIsNotRawText() {
	z.rawTag = ""
}
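
// Illustrative sketch (editorial addition, not upstream): a caller that
// tracks foreign content at the parser level can cancel raw-text mode right
// after the start tag that would otherwise trigger it. The inSVG flag here
// is assumed to come from the caller's own element-stack bookkeeping.
func exampleForeignTitle(z *Tokenizer, inSVG bool) TokenType {
	tt := z.Next()
	if tt == StartTagToken && inSVG {
		if name, _ := z.TagName(); atom.Lookup(name) == atom.Title {
			// A <title> inside <svg> is foreign content, so its contents
			// should be tokenized normally, not as raw text.
			z.NextIsNotRawText()
		}
	}
	return tt
}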

// Err returns the error associated with the most recent ErrorToken token.
// This is typically io.EOF, meaning the end of tokenization.
func (z *Tokenizer) Err() error {
	if z.tt != ErrorToken {
		return nil
	}
	return z.err
}
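
// Illustrative sketch (editorial addition, not upstream): exampleDrain is a
// hypothetical helper showing the canonical Next/Err loop. Tokens are read
// until ErrorToken; an Err of io.EOF means the input ended cleanly, while
// any other error is reported to the caller.
func exampleDrain(r io.Reader) ([]Token, error) {
	z := NewTokenizer(r)
	var tokens []Token
	for {
		if z.Next() == ErrorToken {
			if err := z.Err(); err != io.EOF {
				return tokens, err
			}
			return tokens, nil
		}
		tokens = append(tokens, z.Token())
	}
}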

// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
// slice that holds all the bytes read so far for the current token.
// It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r. Check if the
		// previous read resulted in an error.
		if z.readErr != nil {
			z.err = z.readErr
			return 0
		}
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
		// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
		// allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the remainder.
		var n int
		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
		if n == 0 {
			z.err = z.readErr
			return 0
		}
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
		z.err = ErrBufferExceeded
		return 0
	}
	return x
}

// Buffered returns a slice containing data buffered but not yet tokenized.
func (z *Tokenizer) Buffered() []byte {
	return z.buf[z.raw.end:]
}
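
// Illustrative sketch (editorial addition, not upstream; it relies on the
// unexported field z.r, so it only works inside this package): Buffered
// plus the original reader reconstructs the unconsumed remainder of the
// stream, e.g. to hand the rest of the input to some other consumer.
func exampleRemainder(z *Tokenizer) io.Reader {
	return io.MultiReader(bytes.NewReader(z.Buffered()), z.r)
}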

// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil).
// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil)
// too many times in succession.
func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
	for i := 0; i < 100; i++ {
		if n, err := r.Read(b); n != 0 || err != nil {
			return n, err
		}
	}
	return 0, io.ErrNoProgress
}

// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
	if z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil {
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			// No-op.
		default:
			z.raw.end--
			return
		}
	}
}

// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
	if z.rawTag == "script" {
		z.readScript()
		z.textIsRaw = true
		z.rawTag = ""
		return
	}
loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '/' {
			z.raw.end--
			continue loop
		}
		if z.readRawEndTag() || z.err != nil {
			break loop
		}
	}
	z.data.end = z.raw.end
	// A textarea's or title's RCDATA can contain escaped entities.
	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
	z.rawTag = ""
}

// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag.
// If it succeeds, it backs up the input position to reconsume the tag and
// returns true. Otherwise it returns false. The opening "</" has already been
// consumed.
func (z *Tokenizer) readRawEndTag() bool {
	for i := 0; i < len(z.rawTag); i++ {
		c := z.readByte()
		if z.err != nil {
			return false
		}
		if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
			z.raw.end--
			return false
		}
	}
	c := z.readByte()
	if z.err != nil {
		return false
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
		z.raw.end -= 3 + len(z.rawTag)
		return true
	}
	z.raw.end--
	return false
}

// readScript reads until the next </script> tag, following the byzantine
// rules for escaping/hiding the closing tag.
func (z *Tokenizer) readScript() {
	defer func() {
		z.data.end = z.raw.end
	}()
	var c byte

scriptData:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '<' {
		goto scriptDataLessThanSign
	}
	goto scriptData

scriptDataLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '/':
		goto scriptDataEndTagOpen
	case '!':
		goto scriptDataEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptData

scriptDataEscapeStart:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapeStartDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscapeStartDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapedDashDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataEscaped

scriptDataEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataEscapedEndTagOpen
	}
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
		goto scriptDataDoubleEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEscapedEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptDataEscaped

scriptDataDoubleEscapeStart:
	z.raw.end--
	for i := 0; i < len("script"); i++ {
		c = z.readByte()
		if z.err != nil {
			return
		}
		if c != "script"[i] && c != "SCRIPT"[i] {
			z.raw.end--
			goto scriptDataEscaped
		}
	}
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		goto scriptDataDoubleEscaped
	}
	z.raw.end--
	goto scriptDataEscaped

scriptDataDoubleEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataDoubleEscapeEnd
	}
	z.raw.end--
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapeEnd:
	if z.readRawEndTag() {
		z.raw.end += len("</script>")
		goto scriptDataEscaped
	}
	if z.err != nil {
		return
	}
	goto scriptDataDoubleEscaped
}

// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
	// When modifying this function, consider manually increasing the
	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
	// That increase should only be temporary, not committed, as it
	// exponentially affects the test running time.

	z.data.start = z.raw.end
	defer func() {
		if z.data.end < z.data.start {
			// It's a comment with no data, like <!-->.
			z.data.end = z.data.start
		}
	}()

	var dashCount int
	beginning := true
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.calculateAbruptCommentDataEnd()
			return
		}
		switch c {
		case '-':
			dashCount++
			continue
		case '>':
			if dashCount >= 2 || beginning {
				z.data.end = z.raw.end - len("-->")
				return
			}
		case '!':
			if dashCount >= 2 {
				c = z.readByte()
				if z.err != nil {
					z.data.end = z.calculateAbruptCommentDataEnd()
					return
				} else if c == '>' {
					z.data.end = z.raw.end - len("--!>")
					return
				} else if c == '-' {
					dashCount = 1
					beginning = false
					continue
				}
			}
		}
		dashCount = 0
		beginning = false
	}
}

func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
	raw := z.Raw()
	const prefixLen = len("<!--")
	if len(raw) >= prefixLen {
		raw = raw[prefixLen:]
		if hasSuffix(raw, "--!") {
			return z.raw.end - 3
		} else if hasSuffix(raw, "--") {
			return z.raw.end - 2
		} else if hasSuffix(raw, "-") {
			return z.raw.end - 1
		}
	}
	return z.raw.end
}

func hasSuffix(b []byte, suffix string) bool {
	if len(b) < len(suffix) {
		return false
	}
	b = b[len(b)-len(suffix):]
	for i := range b {
		if b[i] != suffix[i] {
			return false
		}
	}
	return true
}

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
	z.data.start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		if c == '>' {
			z.data.end = z.raw.end - len(">")
			return
		}
	}
}

// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
// "<!a bogus comment". The opening "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
	z.data.start = z.raw.end
	var c [2]byte
	for i := 0; i < 2; i++ {
		c[i] = z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
	}
	if c[0] == '-' && c[1] == '-' {
		z.readComment()
		return CommentToken
	}
	z.raw.end -= 2
	if z.readDoctype() {
		return DoctypeToken
	}
	if z.allowCDATA && z.readCDATA() {
		z.convertNUL = true
		return TextToken
	}
	// It's a bogus comment.
	z.readUntilCloseAngle()
	return CommentToken
}

// readDoctype attempts to read a doctype declaration and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readDoctype() bool {
	const s = "DOCTYPE"
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] && c != s[i]+('a'-'A') {
			// Back up to read the fragment of "DOCTYPE" again.
			z.raw.end = z.data.start
			return false
		}
	}
	if z.skipWhiteSpace(); z.err != nil {
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		return true
	}
	z.readUntilCloseAngle()
	return true
}

// readCDATA attempts to read a CDATA section and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readCDATA() bool {
	const s = "[CDATA["
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] {
			// Back up to read the fragment of "[CDATA[" again.
			z.raw.end = z.data.start
			return false
		}
	}
	z.data.start = z.raw.end
	brackets := 0
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return true
		}
		switch c {
		case ']':
			brackets++
		case '>':
			if brackets >= 2 {
				z.data.end = z.raw.end - len("]]>")
				return true
			}
			brackets = 0
		default:
			brackets = 0
		}
	}
}

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
// case-insensitively matches any element of ss.
func (z *Tokenizer) startTagIn(ss ...string) bool {
loop:
	for _, s := range ss {
		if z.data.end-z.data.start != len(s) {
			continue loop
		}
		for i := 0; i < len(s); i++ {
			c := z.buf[z.data.start+i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c != s[i] {
				continue loop
			}
		}
		return true
	}
	return false
}

// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
	z.readTag(true)
	if z.err != nil {
		return ErrorToken
	}
	// Several tags flag the tokenizer's next token as raw.
	c, raw := z.buf[z.data.start], false
	if 'A' <= c && c <= 'Z' {
		c += 'a' - 'A'
	}
	switch c {
	case 'i':
		raw = z.startTagIn("iframe")
	case 'n':
		raw = z.startTagIn("noembed", "noframes", "noscript")
	case 'p':
		raw = z.startTagIn("plaintext")
	case 's':
		raw = z.startTagIn("script", "style")
	case 't':
		raw = z.startTagIn("textarea", "title")
	case 'x':
		raw = z.startTagIn("xmp")
	}
	if raw {
		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
	}
	// Look for a self-closing token like "<br/>".
	if z.err == nil && z.buf[z.raw.end-2] == '/' {
		return SelfClosingTagToken
	}
	return StartTagToken
}

// readTag reads the next tag token and its attributes. If saveAttr, those
// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
// The opening "<a" or "</a" has already been consumed, where 'a' means anything
// in [A-Za-z].
func (z *Tokenizer) readTag(saveAttr bool) {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	// Read the tag name and attribute key/value pairs.
	z.readTagName()
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			break
		}
		z.raw.end--
		z.readTagAttrKey()
		z.readTagAttrVal()
		// Save pendingAttr if saveAttr and that attribute has a non-empty key.
		if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
			z.attr = append(z.attr, z.pendingAttr)
		}
		if z.skipWhiteSpace(); z.err != nil {
			break
		}
	}
}

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
	z.data.start = z.raw.end - 1
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			z.data.end = z.raw.end - 1
			return
		case '/', '>':
			z.raw.end--
			z.data.end = z.raw.end
			return
		}
	}
}

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
	z.pendingAttr[0].start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.pendingAttr[0].end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f', '/':
			z.pendingAttr[0].end = z.raw.end - 1
			return
		case '=', '>':
			z.raw.end--
			z.pendingAttr[0].end = z.raw.end
			return
		}
	}
}

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
	z.pendingAttr[1].start = z.raw.end
	z.pendingAttr[1].end = z.raw.end
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	c := z.readByte()
	if z.err != nil {
		return
	}
	if c != '=' {
		z.raw.end--
		return
	}
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	quote := z.readByte()
	if z.err != nil {
		return
	}
	switch quote {
	case '>':
		z.raw.end--
		return

	case '\'', '"':
		z.pendingAttr[1].start = z.raw.end
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			if c == quote {
				z.pendingAttr[1].end = z.raw.end - 1
				return
			}
		}

	default:
		z.pendingAttr[1].start = z.raw.end - 1
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			switch c {
			case ' ', '\n', '\r', '\t', '\f':
				z.pendingAttr[1].end = z.raw.end - 1
				return
			case '>':
				z.raw.end--
				z.pendingAttr[1].end = z.raw.end
				return
			}
		}
	}
}

// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
	z.raw.start = z.raw.end
	z.data.start = z.raw.end
	z.data.end = z.raw.end
	if z.err != nil {
		z.tt = ErrorToken
		return z.tt
	}
	if z.rawTag != "" {
		if z.rawTag == "plaintext" {
			// Read everything up to EOF.
			for z.err == nil {
				z.readByte()
			}
			z.data.end = z.raw.end
			z.textIsRaw = true
		} else {
			z.readRawOrRCDATA()
		}
		if z.data.end > z.data.start {
			z.tt = TextToken
			z.convertNUL = true
			return z.tt
		}
	}
	z.textIsRaw = false
	z.convertNUL = false

loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}

		// Check if the '<' we have just read is part of a tag, comment
		// or doctype. If not, it's part of the accumulated text token.
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		var tokenType TokenType
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			tokenType = StartTagToken
		case c == '/':
			tokenType = EndTagToken
		case c == '!' || c == '?':
			// We use CommentToken to mean any of "<!--actual comments-->",
			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
			tokenType = CommentToken
		default:
			// Reconsume the current character.
			z.raw.end--
			continue
		}

		// We have a non-text token, but we might have accumulated some text
		// before that. If so, we return the text first, and return the non-
		// text token on the subsequent call to Next.
		if x := z.raw.end - len("<a"); z.raw.start < x {
			z.raw.end = x
			z.data.end = x
			z.tt = TextToken
			return z.tt
		}
		switch tokenType {
		case StartTagToken:
			z.tt = z.readStartTag()
			return z.tt
		case EndTagToken:
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c == '>' {
				// "</>" does not generate a token at all. Generate an empty comment
				// to allow passthrough clients to pick up the data using Raw.
				// Reset the tokenizer state and start again.
				z.tt = CommentToken
				return z.tt
			}
			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				z.readTag(false)
				if z.err != nil {
					z.tt = ErrorToken
				} else {
					z.tt = EndTagToken
				}
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		case CommentToken:
			if c == '!' {
				z.tt = z.readMarkupDeclaration()
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		}
	}
	if z.raw.start < z.raw.end {
		z.data.end = z.raw.end
		z.tt = TextToken
		return z.tt
	}
	z.tt = ErrorToken
	return z.tt
}

// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
//
// The token stream's raw bytes partition the byte stream (up until an
// ErrorToken). There are no overlaps or gaps between two consecutive tokens'
// raw bytes. One implication is that the byte offset of the current token is
// the sum of the lengths of all previous tokens' raw bytes.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}
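
// Illustrative sketch (editorial addition, not upstream): because
// consecutive tokens' raw bytes partition the input, summing len(z.Raw())
// after each Next call yields every token's starting byte offset.
func exampleOffsets(r io.Reader) []int {
	z := NewTokenizer(r)
	offset := 0
	var offsets []int
	for z.Next() != ErrorToken {
		offsets = append(offsets, offset)
		offset += len(z.Raw())
	}
	return offsets
}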

// convertNewlines converts "\r" and "\r\n" in s to "\n".
// The conversion happens in place, but the resulting slice may be shorter.
func convertNewlines(s []byte) []byte {
	for i, c := range s {
		if c != '\r' {
			continue
		}

		src := i + 1
		if src >= len(s) || s[src] != '\n' {
			s[i] = '\n'
			continue
		}

		dst := i
		for src < len(s) {
			if s[src] == '\r' {
				if src+1 < len(s) && s[src+1] == '\n' {
					src++
				}
				s[dst] = '\n'
			} else {
				s[dst] = s[src]
			}
			src++
			dst++
		}
		return s[:dst]
	}
	return s
}
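
// Illustrative sketch (editorial addition, not upstream): convertNewlines
// rewrites its argument in place, so callers must own the slice. The input
// "a\r\nb\rc" compacts to "a\nb\nc", one byte shorter than the original.
func exampleConvertNewlines() string {
	return string(convertNewlines([]byte("a\r\nb\rc"))) // "a\nb\nc"
}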

var (
	nul         = []byte("\x00")
	replacement = []byte("\ufffd")
)

// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		s := z.buf[z.data.start:z.data.end]
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		s = convertNewlines(s)
		if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
			s = bytes.Replace(s, nul, replacement, -1)
		}
		if !z.textIsRaw {
			s = unescape(s, false)
		}
		return s
	}
	return nil
}

// TagName returns the lower-cased name of a tag token (the `img` out of
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
	if z.data.start < z.data.end {
		switch z.tt {
		case StartTagToken, EndTagToken, SelfClosingTagToken:
			s := z.buf[z.data.start:z.data.end]
			z.data.start = z.raw.end
			z.data.end = z.raw.end
			return lower(s), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, false
}

// TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
	if z.nAttrReturned < len(z.attr) {
		switch z.tt {
		case StartTagToken, SelfClosingTagToken:
			x := z.attr[z.nAttrReturned]
			z.nAttrReturned++
			key = z.buf[x[0].start:x[0].end]
			val = z.buf[x[1].start:x[1].end]
			return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, nil, false
}
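
// Illustrative sketch (editorial addition, not upstream): exampleScanTag is
// a hypothetical helper showing the low-level, allocation-conscious way to
// read a tag with TagName and TagAttr instead of Token. The returned slices
// alias z.buf, so they are copied into strings before the next Next call.
func exampleScanTag(z *Tokenizer) (string, map[string]string) {
	name, hasAttr := z.TagName()
	attrs := make(map[string]string)
	for hasAttr {
		var key, val []byte
		key, val, hasAttr = z.TagAttr()
		attrs[string(key)] = string(val)
	}
	return string(name), attrs
}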

// Token returns the current Token. The result's Data and Attr values remain
// valid after subsequent Next calls.
func (z *Tokenizer) Token() Token {
	t := Token{Type: z.tt}
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		t.Data = string(z.Text())
	case StartTagToken, SelfClosingTagToken, EndTagToken:
		name, moreAttr := z.TagName()
		for moreAttr {
			var key, val []byte
			key, val, moreAttr = z.TagAttr()
			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
		}
		if a := atom.Lookup(name); a != 0 {
			t.DataAtom, t.Data = a, a.String()
		} else {
			t.DataAtom, t.Data = 0, string(name)
		}
	}
	return t
}

// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
// A value of 0 means unlimited.
func (z *Tokenizer) SetMaxBuf(n int) {
	z.maxBuf = n
}
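
// Illustrative sketch (editorial addition, not upstream): capping the buffer
// guards against pathological input, such as an unterminated comment, that
// would otherwise be buffered in full. The 4096 limit here is an arbitrary
// illustrative choice; hitting it surfaces as ErrBufferExceeded from Err.
func exampleMaxBuf(r io.Reader) error {
	z := NewTokenizer(r)
	z.SetMaxBuf(4096)
	for z.Next() != ErrorToken {
	}
	if err := z.Err(); err != io.EOF {
		return err // possibly ErrBufferExceeded
	}
	return nil
}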

// NewTokenizer returns a new HTML Tokenizer for the given Reader.
// The input is assumed to be UTF-8 encoded.
func NewTokenizer(r io.Reader) *Tokenizer {
	return NewTokenizerFragment(r, "")
}

// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
// tokenizing an existing element's InnerHTML fragment. contextTag is that
// element's tag, such as "div" or "iframe".
//
// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
// for a <p> tag or a <script> tag.
//
// The input is assumed to be UTF-8 encoded.
func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
	z := &Tokenizer{
		r:   r,
		buf: make([]byte, 0, 4096),
	}
	if contextTag != "" {
		switch s := strings.ToLower(contextTag); s {
		case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
			z.rawTag = s
		}
	}
	return z
}
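
// Illustrative sketch (editorial addition, not upstream): the fragment
// context changes how the very first token is read. With contextTag
// "textarea" the input "a<b" is a single RCDATA text token, "a<b"; with
// contextTag "div", the "<" would begin a tag instead, so the first token
// would be just the text "a".
func exampleFragment() string {
	z := NewTokenizerFragment(strings.NewReader("a<b"), "textarea")
	z.Next()
	return string(z.Text())
}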