// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE
package toml

import (
	"errors"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"

	"github.com/pelletier/go-buffruneio"
)

var dateRegexp *regexp.Regexp

// Define state functions
type tomlLexStateFn func() tomlLexStateFn

// Define lexer
type tomlLexer struct {
	input         *buffruneio.Reader // Textual source
	buffer        []rune             // Runes composing the current token
	tokens        chan token
	depth         int
	line          int
	col           int
	endbufferLine int
	endbufferCol  int
}

// Basic read operations on input

func (l *tomlLexer) read() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		panic(err)
	}
	if r == '\n' {
		l.endbufferLine++
		l.endbufferCol = 1
	} else {
		l.endbufferCol++
	}
	return r
}

func (l *tomlLexer) next() rune {
	r := l.read()

	if r != eof {
		l.buffer = append(l.buffer, r)
	}
	return r
}

func (l *tomlLexer) ignore() {
	l.buffer = make([]rune, 0)
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}

func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}

func (l *tomlLexer) fastForward(n int) {
	for i := 0; i < n; i++ {
		l.next()
	}
}

func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	}
	l.ignore()
}

func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.buffer))
}

func (l *tomlLexer) peek() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		panic(err)
	}
	l.input.UnreadRune()
	return r
}

func (l *tomlLexer) follow(next string) bool {
	for _, expectedRune := range next {
		r, _, err := l.input.ReadRune()
		defer l.input.UnreadRune()
		if err != nil {
			panic(err)
		}
		if expectedRune != r {
			return false
		}
	}
	return true
}

// Error management

func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	}
	return nil
}

// State functions

func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		if isSpace(next) {
			l.skip()
		}

		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}

func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		possibleDate := string(l.input.PeekRunes(35))
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			l.fastForward(len(dateMatch))
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
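// As an illustration of how these two states interact, lexing the line
//
//	key = [1, 2]
//
// starts in lexVoid, which defers to lexKey and emits tokenKey for "key";
// lexEqual then hands control to lexRvalue, which increments depth at '[',
// emits the integers and the comma, decrements depth at ']', and falls back
// to lexVoid once the trailing newline is skipped at depth zero.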
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}

func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}

func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}

func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}

func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}

func (l *tomlLexer) lexKey() tomlLexStateFn {
	growingString := ""

	for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
		if r == '"' {
			l.next()
			str, err := l.lexStringAsString(`"`, false, true)
			if err != nil {
				return l.errorf(err.Error())
			}
			growingString += `"` + str + `"`
			l.next()
			continue
		} else if r == '\n' {
			return l.errorf("keys cannot contain new lines")
		} else if isSpace(r) {
			break
		} else if !isValidBareChar(r) {
			return l.errorf("keys cannot contain %c character", r)
		}
		growingString += string(r)
		l.next()
	}
	l.emitWithValue(tokenKey, growingString)
	return l.lexVoid
}

func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
	return func() tomlLexStateFn {
		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
			if next == '\r' && l.follow("\r\n") {
				break
			}
			l.next()
		}
		l.ignore()
		return previousState
	}
}

func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}

func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	// find end of string
	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		next := l.peek()
		if next == eof {
			break
		}
		growingString += string(l.next())
	}

	return "", errors.New("unclosed string")
}

func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := "'"
	discardLeadingNewLine := false
	if l.follow("''") {
		l.skip()
		l.skip()
		terminator = "'''"
		discardLeadingNewLine = true
	}

	str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
	if err != nil {
		return l.errorf(err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}
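// To contrast the two string flavors handled here: a literal string such as
// 'C:\Users' is copied rune for rune by lexLiteralStringAsString, with no
// escape processing, whereas the basic string "C:\\Users" goes through the
// escape decoding in lexStringAsString below and yields C:\Users.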
// Lex a string and return the results as a string.
// Terminator is the substring indicating the end of the token.
// The resulting string does not include the terminator.
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				growingString += "\""
				l.next()
			case 'n':
				growingString += "\n"
				l.next()
			case 'b':
				growingString += "\b"
				l.next()
			case 'f':
				growingString += "\f"
				l.next()
			case '/':
				growingString += "/"
				l.next()
			case 't':
				growingString += "\t"
				l.next()
			case 'r':
				growingString += "\r"
				l.next()
			case '\\':
				growingString += "\\"
				l.next()
			case 'u':
				l.next()
				code := ""
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code)
				}
				growingString += string(rune(intcode))
			case 'U':
				l.next()
				code := ""
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code)
				}
				growingString += string(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()
			if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			growingString += string(r)
		}

		if l.peek() == eof {
			break
		}
	}

	return "", errors.New("unclosed string")
}

func (l *tomlLexer) lexString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := `"`
	discardLeadingNewLine := false
	acceptNewLines := false
	if l.follow(`""`) {
		l.skip()
		l.skip()
		terminator = `"""`
		discardLeadingNewLine = true
		acceptNewLines = true
	}

	str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
	if err != nil {
		return l.errorf(err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}

func (l *tomlLexer) lexTableKey() tomlLexStateFn {
	l.next()

	if l.peek() == '[' {
		// token '[[' signifies an array of tables
		l.next()
		l.emit(tokenDoubleLeftBracket)
		return l.lexInsideTableArrayKey
	}
	// vanilla table key
	l.emit(tokenLeftBracket)
	return l.lexInsideTableKey
}

func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if len(l.buffer) > 0 {
				l.emit(tokenKeyGroupArray)
			}
			l.next()
			if l.peek() != ']' {
				break
			}
			l.next()
			l.emit(tokenDoubleRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table array key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table array key")
}

func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if len(l.buffer) > 0 {
				l.emit(tokenKeyGroup)
			}
			l.next()
			l.emit(tokenRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table key")
}

func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}
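// A few examples of what lexNumber accepts: "+1_000" is emitted as
// tokenInteger ('_' separators are consumed along with the digits), while
// "6.626e-34" becomes tokenFloat, since seeing either '.' or an exponent
// marker flips the token type to float.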
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()
	if r == '+' || r == '-' {
		l.next()
	}
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			l.next()
		} else {
			break
		}
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}

func (l *tomlLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
	close(l.tokens)
}

func init() {
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}

// Entry point
func lexToml(input io.Reader) chan token {
	bufferedInput := buffruneio.NewReader(input)
	l := &tomlLexer{
		input:         bufferedInput,
		tokens:        make(chan token),
		line:          1,
		col:           1,
		endbufferLine: 1,
		endbufferCol:  1,
	}
	go l.run()
	return l.tokens
}
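// A minimal usage sketch (a hypothetical caller; it would have to live in
// this package, since the token fields are unexported): drain the channel
// until the lexer closes it after tokenEOF, stopping early on tokenError.
//
//	for tok := range lexToml(strings.NewReader(`answer = 42`)) {
//		if tok.typ == tokenError {
//			fmt.Println("lex error:", tok.val)
//			break
//		}
//		fmt.Println(tok.Position, tok.typ, tok.val)
//	}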