scanner.go
// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps tokenTypes to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and right (\u005d) need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:          `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// Replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We'll test if this is a Char; if it is followed by a number it is a
		// dimension/percentage/number, and this will be matched later.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}

		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			} else {
				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
				return s.err
			}
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}
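
// tokenizeAll is an illustrative usage sketch, not part of the original
// gorilla/css API: it shows the typical loop that drives the scanner, calling
// Next until a TokenEOF or TokenError token is returned and collecting every
// other token. The helper's name and the choice to stop on TokenError are
// assumptions for demonstration purposes only.
func tokenizeAll(input string) []*Token {
	s := New(input)
	var tokens []*Token
	for {
		token := s.Next()
		if token.Type == TokenEOF || token.Type == TokenError {
			// Once the scanner sets s.err, Next keeps returning the same
			// EOF/error token, so the loop must stop here.
			break
		}
		tokens = append(tokens, token)
	}
	return tokens
}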