gtsocial-umbx


scanner.go (10619B)


// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}
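
// For example, a token whose Value is longer than ten bytes is truncated
// when printed: an ident "background-color" at the start of the input
// renders as `IDENT (line: 1, column: 1): "background"...`.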

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)
// tokenNames maps tokenTypes to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)
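// Macro names contain only lowercase letters, so regexp quantifiers such
// as `{1,6}` in the macros below are never mistaken for macro references.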

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and the right one (\u005d) need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}
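
// For example, one expansion pass in init() rewrites the `ident` macro
// `-?{nmstart}{nmchar}*` as
// `-?(?:[a-zA-Z_]|{nonascii}|{escape})(?:[a-zA-Z0-9_-]|{nonascii}|{escape})*`;
// passes repeat until no `{name}` references remain.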

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:            `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}
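
// The productions commented out above are matched without regexps: by the
// first-byte switch in Next (CDO, the match operators, Char) or by the
// one-time BOM check at the start of the input.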

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
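// More specific patterns come first: TokenURI before TokenFunction (both
// can match input starting with `url(`), and TokenDimension and
// TokenPercentage before TokenNumber, since a number is a prefix of both.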
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
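
// Each compiled matcher is anchored to the start of the remaining input.
// For example, the TokenNumber production `{num}` compiles to
// `^(?:(?:[0-9]*\.[0-9]+|[0-9]+))`.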

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}
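
// Minimal usage sketch (illustrative; the input is hypothetical): call
// Next in a loop until TokenEOF or TokenError. Scanning "a{color:#fff}"
// yields IDENT, CHAR, IDENT, CHAR, HASH, CHAR, then EOF.
//
//	s := New("a{color:#fff}")
//	for {
//		tok := s.Next()
//		if tok.Type == TokenEOF || tok.Type == TokenError {
//			break
//		}
//		fmt.Println(tok) // e.g. IDENT (line: 1, column: 1): "a"
//	}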

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We'll test if this is a Char; if it is followed by a number it is a
		// dimension/percentage/number, and this will be matched later.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}

		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			} else {
				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
				return s.err
			}
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col++ // col counts runes, so a single rune advances it by one
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
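		// The slice below starts at the last "\n", so the rune count
		// includes that newline; the result is the 1-based column of
		// the first rune after the consumed text.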
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}