lex.go (8799B)
1 // Copyright 2013-2022 Frank Schroeder. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 // 5 // Parts of the lexer are from the template/text/parser package 6 // For these parts the following applies: 7 // 8 // Copyright 2011 The Go Authors. All rights reserved. 9 // Use of this source code is governed by a BSD-style 10 // license that can be found in the LICENSE file of the go 1.2 11 // distribution. 12 13 package properties 14 15 import ( 16 "fmt" 17 "strconv" 18 "strings" 19 "unicode/utf8" 20 ) 21 22 // item represents a token or text string returned from the scanner. 23 type item struct { 24 typ itemType // The type of this item. 25 pos int // The starting position, in bytes, of this item in the input string. 26 val string // The value of this item. 27 } 28 29 func (i item) String() string { 30 switch { 31 case i.typ == itemEOF: 32 return "EOF" 33 case i.typ == itemError: 34 return i.val 35 case len(i.val) > 10: 36 return fmt.Sprintf("%.10q...", i.val) 37 } 38 return fmt.Sprintf("%q", i.val) 39 } 40 41 // itemType identifies the type of lex items. 42 type itemType int 43 44 const ( 45 itemError itemType = iota // error occurred; value is text of error 46 itemEOF 47 itemKey // a key 48 itemValue // a value 49 itemComment // a comment 50 ) 51 52 // defines a constant for EOF 53 const eof = -1 54 55 // permitted whitespace characters space, FF and TAB 56 const whitespace = " \f\t" 57 58 // stateFn represents the state of the scanner as a function that returns the next state. 59 type stateFn func(*lexer) stateFn 60 61 // lexer holds the state of the scanner. 62 type lexer struct { 63 input string // the string being scanned 64 state stateFn // the next lexing function to enter 65 pos int // current position in the input 66 start int // start position of this item 67 width int // width of last rune read from input 68 lastPos int // position of most recent item returned by nextItem 69 runes []rune // scanned runes for this item 70 items chan item // channel of scanned items 71 } 72 73 // next returns the next rune in the input. 74 func (l *lexer) next() rune { 75 if l.pos >= len(l.input) { 76 l.width = 0 77 return eof 78 } 79 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 80 l.width = w 81 l.pos += l.width 82 return r 83 } 84 85 // peek returns but does not consume the next rune in the input. 86 func (l *lexer) peek() rune { 87 r := l.next() 88 l.backup() 89 return r 90 } 91 92 // backup steps back one rune. Can only be called once per call of next. 93 func (l *lexer) backup() { 94 l.pos -= l.width 95 } 96 97 // emit passes an item back to the client. 98 func (l *lexer) emit(t itemType) { 99 i := item{t, l.start, string(l.runes)} 100 l.items <- i 101 l.start = l.pos 102 l.runes = l.runes[:0] 103 } 104 105 // ignore skips over the pending input before this point. 106 func (l *lexer) ignore() { 107 l.start = l.pos 108 } 109 110 // appends the rune to the current value 111 func (l *lexer) appendRune(r rune) { 112 l.runes = append(l.runes, r) 113 } 114 115 // accept consumes the next rune if it's from the valid set. 116 func (l *lexer) accept(valid string) bool { 117 if strings.ContainsRune(valid, l.next()) { 118 return true 119 } 120 l.backup() 121 return false 122 } 123 124 // acceptRun consumes a run of runes from the valid set. 125 func (l *lexer) acceptRun(valid string) { 126 for strings.ContainsRune(valid, l.next()) { 127 } 128 l.backup() 129 } 130 131 // lineNumber reports which line we're on, based on the position of 132 // the previous item returned by nextItem. Doing it this way 133 // means we don't have to worry about peek double counting. 134 func (l *lexer) lineNumber() int { 135 return 1 + strings.Count(l.input[:l.lastPos], "\n") 136 } 137 138 // errorf returns an error token and terminates the scan by passing 139 // back a nil pointer that will be the next state, terminating l.nextItem. 140 func (l *lexer) errorf(format string, args ...interface{}) stateFn { 141 l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} 142 return nil 143 } 144 145 // nextItem returns the next item from the input. 146 func (l *lexer) nextItem() item { 147 i := <-l.items 148 l.lastPos = i.pos 149 return i 150 } 151 152 // lex creates a new scanner for the input string. 153 func lex(input string) *lexer { 154 l := &lexer{ 155 input: input, 156 items: make(chan item), 157 runes: make([]rune, 0, 32), 158 } 159 go l.run() 160 return l 161 } 162 163 // run runs the state machine for the lexer. 164 func (l *lexer) run() { 165 for l.state = lexBeforeKey(l); l.state != nil; { 166 l.state = l.state(l) 167 } 168 } 169 170 // state functions 171 172 // lexBeforeKey scans until a key begins. 173 func lexBeforeKey(l *lexer) stateFn { 174 switch r := l.next(); { 175 case isEOF(r): 176 l.emit(itemEOF) 177 return nil 178 179 case isEOL(r): 180 l.ignore() 181 return lexBeforeKey 182 183 case isComment(r): 184 return lexComment 185 186 case isWhitespace(r): 187 l.ignore() 188 return lexBeforeKey 189 190 default: 191 l.backup() 192 return lexKey 193 } 194 } 195 196 // lexComment scans a comment line. The comment character has already been scanned. 197 func lexComment(l *lexer) stateFn { 198 l.acceptRun(whitespace) 199 l.ignore() 200 for { 201 switch r := l.next(); { 202 case isEOF(r): 203 l.ignore() 204 l.emit(itemEOF) 205 return nil 206 case isEOL(r): 207 l.emit(itemComment) 208 return lexBeforeKey 209 default: 210 l.appendRune(r) 211 } 212 } 213 } 214 215 // lexKey scans the key up to a delimiter 216 func lexKey(l *lexer) stateFn { 217 var r rune 218 219 Loop: 220 for { 221 switch r = l.next(); { 222 223 case isEscape(r): 224 err := l.scanEscapeSequence() 225 if err != nil { 226 return l.errorf(err.Error()) 227 } 228 229 case isEndOfKey(r): 230 l.backup() 231 break Loop 232 233 case isEOF(r): 234 break Loop 235 236 default: 237 l.appendRune(r) 238 } 239 } 240 241 if len(l.runes) > 0 { 242 l.emit(itemKey) 243 } 244 245 if isEOF(r) { 246 l.emit(itemEOF) 247 return nil 248 } 249 250 return lexBeforeValue 251 } 252 253 // lexBeforeValue scans the delimiter between key and value. 254 // Leading and trailing whitespace is ignored. 255 // We expect to be just after the key. 256 func lexBeforeValue(l *lexer) stateFn { 257 l.acceptRun(whitespace) 258 l.accept(":=") 259 l.acceptRun(whitespace) 260 l.ignore() 261 return lexValue 262 } 263 264 // lexValue scans text until the end of the line. We expect to be just after the delimiter. 265 func lexValue(l *lexer) stateFn { 266 for { 267 switch r := l.next(); { 268 case isEscape(r): 269 if isEOL(l.peek()) { 270 l.next() 271 l.acceptRun(whitespace) 272 } else { 273 err := l.scanEscapeSequence() 274 if err != nil { 275 return l.errorf(err.Error()) 276 } 277 } 278 279 case isEOL(r): 280 l.emit(itemValue) 281 l.ignore() 282 return lexBeforeKey 283 284 case isEOF(r): 285 l.emit(itemValue) 286 l.emit(itemEOF) 287 return nil 288 289 default: 290 l.appendRune(r) 291 } 292 } 293 } 294 295 // scanEscapeSequence scans either one of the escaped characters 296 // or a unicode literal. We expect to be after the escape character. 297 func (l *lexer) scanEscapeSequence() error { 298 switch r := l.next(); { 299 300 case isEscapedCharacter(r): 301 l.appendRune(decodeEscapedCharacter(r)) 302 return nil 303 304 case atUnicodeLiteral(r): 305 return l.scanUnicodeLiteral() 306 307 case isEOF(r): 308 return fmt.Errorf("premature EOF") 309 310 // silently drop the escape character and append the rune as is 311 default: 312 l.appendRune(r) 313 return nil 314 } 315 } 316 317 // scans a unicode literal in the form \uXXXX. We expect to be after the \u. 318 func (l *lexer) scanUnicodeLiteral() error { 319 // scan the digits 320 d := make([]rune, 4) 321 for i := 0; i < 4; i++ { 322 d[i] = l.next() 323 if d[i] == eof || !strings.ContainsRune("0123456789abcdefABCDEF", d[i]) { 324 return fmt.Errorf("invalid unicode literal") 325 } 326 } 327 328 // decode the digits into a rune 329 r, err := strconv.ParseInt(string(d), 16, 0) 330 if err != nil { 331 return err 332 } 333 334 l.appendRune(rune(r)) 335 return nil 336 } 337 338 // decodeEscapedCharacter returns the unescaped rune. We expect to be after the escape character. 339 func decodeEscapedCharacter(r rune) rune { 340 switch r { 341 case 'f': 342 return '\f' 343 case 'n': 344 return '\n' 345 case 'r': 346 return '\r' 347 case 't': 348 return '\t' 349 default: 350 return r 351 } 352 } 353 354 // atUnicodeLiteral reports whether we are at a unicode literal. 355 // The escape character has already been consumed. 356 func atUnicodeLiteral(r rune) bool { 357 return r == 'u' 358 } 359 360 // isComment reports whether we are at the start of a comment. 361 func isComment(r rune) bool { 362 return r == '#' || r == '!' 363 } 364 365 // isEndOfKey reports whether the rune terminates the current key. 366 func isEndOfKey(r rune) bool { 367 return strings.ContainsRune(" \f\t\r\n:=", r) 368 } 369 370 // isEOF reports whether we are at EOF. 371 func isEOF(r rune) bool { 372 return r == eof 373 } 374 375 // isEOL reports whether we are at a new line character. 376 func isEOL(r rune) bool { 377 return r == '\n' || r == '\r' 378 } 379 380 // isEscape reports whether the rune is the escape character which 381 // prefixes unicode literals and other escaped characters. 382 func isEscape(r rune) bool { 383 return r == '\\' 384 } 385 386 // isEscapedCharacter reports whether we are at one of the characters that need escaping. 387 // The escape character has already been consumed. 388 func isEscapedCharacter(r rune) bool { 389 return strings.ContainsRune(" :=fnrt", r) 390 } 391 392 // isWhitespace reports whether the rune is a whitespace character. 393 func isWhitespace(r rune) bool { 394 return strings.ContainsRune(whitespace, r) 395 }