buffer.go (2997B)
1 package html 2 3 import ( 4 "github.com/tdewolff/parse/v2" 5 "github.com/tdewolff/parse/v2/html" 6 ) 7 8 // Token is a single token unit with an attribute value (if given) and hash of the data. 9 type Token struct { 10 html.TokenType 11 Hash Hash 12 Data []byte 13 Text []byte 14 AttrVal []byte 15 Traits traits 16 Offset int 17 } 18 19 // TokenBuffer is a buffer that allows for token look-ahead. 20 type TokenBuffer struct { 21 r *parse.Input 22 l *html.Lexer 23 24 buf []Token 25 pos int 26 27 attrBuffer []*Token 28 } 29 30 // NewTokenBuffer returns a new TokenBuffer. 31 func NewTokenBuffer(r *parse.Input, l *html.Lexer) *TokenBuffer { 32 return &TokenBuffer{ 33 r: r, 34 l: l, 35 buf: make([]Token, 0, 8), 36 } 37 } 38 39 func (z *TokenBuffer) read(t *Token) { 40 t.Offset = z.r.Offset() 41 t.TokenType, t.Data = z.l.Next() 42 t.Text = z.l.Text() 43 if t.TokenType == html.AttributeToken { 44 t.Offset += 1 + len(t.Text) + 1 45 t.AttrVal = z.l.AttrVal() 46 if len(t.AttrVal) > 1 && (t.AttrVal[0] == '"' || t.AttrVal[0] == '\'') { 47 t.Offset++ 48 t.AttrVal = t.AttrVal[1 : len(t.AttrVal)-1] // quotes will be readded in attribute loop if necessary 49 } 50 t.Hash = ToHash(t.Text) 51 t.Traits = attrMap[t.Hash] 52 } else if t.TokenType == html.StartTagToken || t.TokenType == html.EndTagToken { 53 t.AttrVal = nil 54 t.Hash = ToHash(t.Text) 55 t.Traits = tagMap[t.Hash] // zero if not exist 56 } else { 57 t.AttrVal = nil 58 t.Hash = 0 59 t.Traits = 0 60 } 61 } 62 63 // Peek returns the ith element and possibly does an allocation. 64 // Peeking past an error will panic. 65 func (z *TokenBuffer) Peek(pos int) *Token { 66 pos += z.pos 67 if pos >= len(z.buf) { 68 if len(z.buf) > 0 && z.buf[len(z.buf)-1].TokenType == html.ErrorToken { 69 return &z.buf[len(z.buf)-1] 70 } 71 72 c := cap(z.buf) 73 d := len(z.buf) - z.pos 74 p := pos - z.pos + 1 // required peek length 75 var buf []Token 76 if 2*p > c { 77 buf = make([]Token, 0, 2*c+p) 78 } else { 79 buf = z.buf 80 } 81 copy(buf[:d], z.buf[z.pos:]) 82 83 buf = buf[:p] 84 pos -= z.pos 85 for i := d; i < p; i++ { 86 z.read(&buf[i]) 87 if buf[i].TokenType == html.ErrorToken { 88 buf = buf[:i+1] 89 pos = i 90 break 91 } 92 } 93 z.pos, z.buf = 0, buf 94 } 95 return &z.buf[pos] 96 } 97 98 // Shift returns the first element and advances position. 99 func (z *TokenBuffer) Shift() *Token { 100 if z.pos >= len(z.buf) { 101 t := &z.buf[:1][0] 102 z.read(t) 103 return t 104 } 105 t := &z.buf[z.pos] 106 z.pos++ 107 return t 108 } 109 110 // Attributes extracts the gives attribute hashes from a tag. 111 // It returns in the same order pointers to the requested token data or nil. 112 func (z *TokenBuffer) Attributes(hashes ...Hash) []*Token { 113 n := 0 114 for { 115 if t := z.Peek(n); t.TokenType != html.AttributeToken { 116 break 117 } 118 n++ 119 } 120 if len(hashes) > cap(z.attrBuffer) { 121 z.attrBuffer = make([]*Token, len(hashes)) 122 } else { 123 z.attrBuffer = z.attrBuffer[:len(hashes)] 124 for i := range z.attrBuffer { 125 z.attrBuffer[i] = nil 126 } 127 } 128 for i := z.pos; i < z.pos+n; i++ { 129 attr := &z.buf[i] 130 for j, hash := range hashes { 131 if hash == attr.Hash { 132 z.attrBuffer[j] = attr 133 } 134 } 135 } 136 return z.attrBuffer 137 }