diff options
Diffstat (limited to 'vendor/github.com/pelletier/go-toml/lexer.go')
-rw-r--r-- | vendor/github.com/pelletier/go-toml/lexer.go | 750 |
1 files changed, 750 insertions, 0 deletions
diff --git a/vendor/github.com/pelletier/go-toml/lexer.go b/vendor/github.com/pelletier/go-toml/lexer.go new file mode 100644 index 00000000..d11de428 --- /dev/null +++ b/vendor/github.com/pelletier/go-toml/lexer.go @@ -0,0 +1,750 @@ +// TOML lexer. +// +// Written using the principles developed by Rob Pike in +// http://www.youtube.com/watch?v=HxaD_trXwRE + +package toml + +import ( + "bytes" + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +var dateRegexp *regexp.Regexp + +// Define state functions +type tomlLexStateFn func() tomlLexStateFn + +// Define lexer +type tomlLexer struct { + inputIdx int + input []rune // Textual source + currentTokenStart int + currentTokenStop int + tokens []token + depth int + line int + col int + endbufferLine int + endbufferCol int +} + +// Basic read operations on input + +func (l *tomlLexer) read() rune { + r := l.peek() + if r == '\n' { + l.endbufferLine++ + l.endbufferCol = 1 + } else { + l.endbufferCol++ + } + l.inputIdx++ + return r +} + +func (l *tomlLexer) next() rune { + r := l.read() + + if r != eof { + l.currentTokenStop++ + } + return r +} + +func (l *tomlLexer) ignore() { + l.currentTokenStart = l.currentTokenStop + l.line = l.endbufferLine + l.col = l.endbufferCol +} + +func (l *tomlLexer) skip() { + l.next() + l.ignore() +} + +func (l *tomlLexer) fastForward(n int) { + for i := 0; i < n; i++ { + l.next() + } +} + +func (l *tomlLexer) emitWithValue(t tokenType, value string) { + l.tokens = append(l.tokens, token{ + Position: Position{l.line, l.col}, + typ: t, + val: value, + }) + l.ignore() +} + +func (l *tomlLexer) emit(t tokenType) { + l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) +} + +func (l *tomlLexer) peek() rune { + if l.inputIdx >= len(l.input) { + return eof + } + return l.input[l.inputIdx] +} + +func (l *tomlLexer) peekString(size int) string { + maxIdx := len(l.input) + upperIdx := l.inputIdx + size // FIXME: potential overflow + if upperIdx > maxIdx { + upperIdx = maxIdx + } + return string(l.input[l.inputIdx:upperIdx]) +} + +func (l *tomlLexer) follow(next string) bool { + return next == l.peekString(len(next)) +} + +// Error management + +func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { + l.tokens = append(l.tokens, token{ + Position: Position{l.line, l.col}, + typ: tokenError, + val: fmt.Sprintf(format, args...), + }) + return nil +} + +// State functions + +func (l *tomlLexer) lexVoid() tomlLexStateFn { + for { + next := l.peek() + switch next { + case '[': + return l.lexTableKey + case '#': + return l.lexComment(l.lexVoid) + case '=': + return l.lexEqual + case '\r': + fallthrough + case '\n': + l.skip() + continue + } + + if isSpace(next) { + l.skip() + } + + if l.depth > 0 { + return l.lexRvalue + } + + if isKeyStartChar(next) { + return l.lexKey + } + + if next == eof { + l.next() + break + } + } + + l.emit(tokenEOF) + return nil +} + +func (l *tomlLexer) lexRvalue() tomlLexStateFn { + for { + next := l.peek() + switch next { + case '.': + return l.errorf("cannot start float with a dot") + case '=': + return l.lexEqual + case '[': + l.depth++ + return l.lexLeftBracket + case ']': + l.depth-- + return l.lexRightBracket + case '{': + return l.lexLeftCurlyBrace + case '}': + return l.lexRightCurlyBrace + case '#': + return l.lexComment(l.lexRvalue) + case '"': + return l.lexString + case '\'': + return l.lexLiteralString + case ',': + return l.lexComma + case '\r': + fallthrough + case '\n': + l.skip() + if l.depth == 0 { + return l.lexVoid + } + return l.lexRvalue + case '_': + return l.errorf("cannot start number with underscore") + } + + if l.follow("true") { + return l.lexTrue + } + + if l.follow("false") { + return l.lexFalse + } + + if l.follow("inf") { + return l.lexInf + } + + if l.follow("nan") { + return l.lexNan + } + + if isSpace(next) { + l.skip() + continue + } + + if next == eof { + l.next() + break + } + + possibleDate := l.peekString(35) + dateMatch := dateRegexp.FindString(possibleDate) + if dateMatch != "" { + l.fastForward(len(dateMatch)) + return l.lexDate + } + + if next == '+' || next == '-' || isDigit(next) { + return l.lexNumber + } + + if isAlphanumeric(next) { + return l.lexKey + } + + return l.errorf("no value can start with %c", next) + } + + l.emit(tokenEOF) + return nil +} + +func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { + l.next() + l.emit(tokenLeftCurlyBrace) + return l.lexRvalue +} + +func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { + l.next() + l.emit(tokenRightCurlyBrace) + return l.lexRvalue +} + +func (l *tomlLexer) lexDate() tomlLexStateFn { + l.emit(tokenDate) + return l.lexRvalue +} + +func (l *tomlLexer) lexTrue() tomlLexStateFn { + l.fastForward(4) + l.emit(tokenTrue) + return l.lexRvalue +} + +func (l *tomlLexer) lexFalse() tomlLexStateFn { + l.fastForward(5) + l.emit(tokenFalse) + return l.lexRvalue +} + +func (l *tomlLexer) lexInf() tomlLexStateFn { + l.fastForward(3) + l.emit(tokenInf) + return l.lexRvalue +} + +func (l *tomlLexer) lexNan() tomlLexStateFn { + l.fastForward(3) + l.emit(tokenNan) + return l.lexRvalue +} + +func (l *tomlLexer) lexEqual() tomlLexStateFn { + l.next() + l.emit(tokenEqual) + return l.lexRvalue +} + +func (l *tomlLexer) lexComma() tomlLexStateFn { + l.next() + l.emit(tokenComma) + return l.lexRvalue +} + +// Parse the key and emits its value without escape sequences. +// bare keys, basic string keys and literal string keys are supported. +func (l *tomlLexer) lexKey() tomlLexStateFn { + growingString := "" + + for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { + if r == '"' { + l.next() + str, err := l.lexStringAsString(`"`, false, true) + if err != nil { + return l.errorf(err.Error()) + } + growingString += str + l.next() + continue + } else if r == '\'' { + l.next() + str, err := l.lexLiteralStringAsString(`'`, false) + if err != nil { + return l.errorf(err.Error()) + } + growingString += str + l.next() + continue + } else if r == '\n' { + return l.errorf("keys cannot contain new lines") + } else if isSpace(r) { + break + } else if !isValidBareChar(r) { + return l.errorf("keys cannot contain %c character", r) + } + growingString += string(r) + l.next() + } + l.emitWithValue(tokenKey, growingString) + return l.lexVoid +} + +func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { + return func() tomlLexStateFn { + for next := l.peek(); next != '\n' && next != eof; next = l.peek() { + if next == '\r' && l.follow("\r\n") { + break + } + l.next() + } + l.ignore() + return previousState + } +} + +func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { + l.next() + l.emit(tokenLeftBracket) + return l.lexRvalue +} + +func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { + growingString := "" + + if discardLeadingNewLine { + if l.follow("\r\n") { + l.skip() + l.skip() + } else if l.peek() == '\n' { + l.skip() + } + } + + // find end of string + for { + if l.follow(terminator) { + return growingString, nil + } + + next := l.peek() + if next == eof { + break + } + growingString += string(l.next()) + } + + return "", errors.New("unclosed string") +} + +func (l *tomlLexer) lexLiteralString() tomlLexStateFn { + l.skip() + + // handle special case for triple-quote + terminator := "'" + discardLeadingNewLine := false + if l.follow("''") { + l.skip() + l.skip() + terminator = "'''" + discardLeadingNewLine = true + } + + str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) + if err != nil { + return l.errorf(err.Error()) + } + + l.emitWithValue(tokenString, str) + l.fastForward(len(terminator)) + l.ignore() + return l.lexRvalue +} + +// Lex a string and return the results as a string. +// Terminator is the substring indicating the end of the token. +// The resulting string does not include the terminator. +func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { + growingString := "" + + if discardLeadingNewLine { + if l.follow("\r\n") { + l.skip() + l.skip() + } else if l.peek() == '\n' { + l.skip() + } + } + + for { + if l.follow(terminator) { + return growingString, nil + } + + if l.follow("\\") { + l.next() + switch l.peek() { + case '\r': + fallthrough + case '\n': + fallthrough + case '\t': + fallthrough + case ' ': + // skip all whitespace chars following backslash + for strings.ContainsRune("\r\n\t ", l.peek()) { + l.next() + } + case '"': + growingString += "\"" + l.next() + case 'n': + growingString += "\n" + l.next() + case 'b': + growingString += "\b" + l.next() + case 'f': + growingString += "\f" + l.next() + case '/': + growingString += "/" + l.next() + case 't': + growingString += "\t" + l.next() + case 'r': + growingString += "\r" + l.next() + case '\\': + growingString += "\\" + l.next() + case 'u': + l.next() + code := "" + for i := 0; i < 4; i++ { + c := l.peek() + if !isHexDigit(c) { + return "", errors.New("unfinished unicode escape") + } + l.next() + code = code + string(c) + } + intcode, err := strconv.ParseInt(code, 16, 32) + if err != nil { + return "", errors.New("invalid unicode escape: \\u" + code) + } + growingString += string(rune(intcode)) + case 'U': + l.next() + code := "" + for i := 0; i < 8; i++ { + c := l.peek() + if !isHexDigit(c) { + return "", errors.New("unfinished unicode escape") + } + l.next() + code = code + string(c) + } + intcode, err := strconv.ParseInt(code, 16, 64) + if err != nil { + return "", errors.New("invalid unicode escape: \\U" + code) + } + growingString += string(rune(intcode)) + default: + return "", errors.New("invalid escape sequence: \\" + string(l.peek())) + } + } else { + r := l.peek() + + if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) { + return "", fmt.Errorf("unescaped control character %U", r) + } + l.next() + growingString += string(r) + } + + if l.peek() == eof { + break + } + } + + return "", errors.New("unclosed string") +} + +func (l *tomlLexer) lexString() tomlLexStateFn { + l.skip() + + // handle special case for triple-quote + terminator := `"` + discardLeadingNewLine := false + acceptNewLines := false + if l.follow(`""`) { + l.skip() + l.skip() + terminator = `"""` + discardLeadingNewLine = true + acceptNewLines = true + } + + str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) + + if err != nil { + return l.errorf(err.Error()) + } + + l.emitWithValue(tokenString, str) + l.fastForward(len(terminator)) + l.ignore() + return l.lexRvalue +} + +func (l *tomlLexer) lexTableKey() tomlLexStateFn { + l.next() + + if l.peek() == '[' { + // token '[[' signifies an array of tables + l.next() + l.emit(tokenDoubleLeftBracket) + return l.lexInsideTableArrayKey + } + // vanilla table key + l.emit(tokenLeftBracket) + return l.lexInsideTableKey +} + +// Parse the key till "]]", but only bare keys are supported +func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if l.currentTokenStop > l.currentTokenStart { + l.emit(tokenKeyGroupArray) + } + l.next() + if l.peek() != ']' { + break + } + l.next() + l.emit(tokenDoubleRightBracket) + return l.lexVoid + case '[': + return l.errorf("table array key cannot contain ']'") + default: + l.next() + } + } + return l.errorf("unclosed table array key") +} + +// Parse the key till "]" but only bare keys are supported +func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if l.currentTokenStop > l.currentTokenStart { + l.emit(tokenKeyGroup) + } + l.next() + l.emit(tokenRightBracket) + return l.lexVoid + case '[': + return l.errorf("table key cannot contain ']'") + default: + l.next() + } + } + return l.errorf("unclosed table key") +} + +func (l *tomlLexer) lexRightBracket() tomlLexStateFn { + l.next() + l.emit(tokenRightBracket) + return l.lexRvalue +} + +type validRuneFn func(r rune) bool + +func isValidHexRune(r rune) bool { + return r >= 'a' && r <= 'f' || + r >= 'A' && r <= 'F' || + r >= '0' && r <= '9' || + r == '_' +} + +func isValidOctalRune(r rune) bool { + return r >= '0' && r <= '7' || r == '_' +} + +func isValidBinaryRune(r rune) bool { + return r == '0' || r == '1' || r == '_' +} + +func (l *tomlLexer) lexNumber() tomlLexStateFn { + r := l.peek() + + if r == '0' { + follow := l.peekString(2) + if len(follow) == 2 { + var isValidRune validRuneFn + switch follow[1] { + case 'x': + isValidRune = isValidHexRune + case 'o': + isValidRune = isValidOctalRune + case 'b': + isValidRune = isValidBinaryRune + default: + if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { + return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) + } + } + + if isValidRune != nil { + l.next() + l.next() + digitSeen := false + for { + next := l.peek() + if !isValidRune(next) { + break + } + digitSeen = true + l.next() + } + + if !digitSeen { + return l.errorf("number needs at least one digit") + } + + l.emit(tokenInteger) + + return l.lexRvalue + } + } + } + + if r == '+' || r == '-' { + l.next() + if l.follow("inf") { + return l.lexInf + } + if l.follow("nan") { + return l.lexNan + } + } + + pointSeen := false + expSeen := false + digitSeen := false + for { + next := l.peek() + if next == '.' { + if pointSeen { + return l.errorf("cannot have two dots in one float") + } + l.next() + if !isDigit(l.peek()) { + return l.errorf("float cannot end with a dot") + } + pointSeen = true + } else if next == 'e' || next == 'E' { + expSeen = true + l.next() + r := l.peek() + if r == '+' || r == '-' { + l.next() + } + } else if isDigit(next) { + digitSeen = true + l.next() + } else if next == '_' { + l.next() + } else { + break + } + if pointSeen && !digitSeen { + return l.errorf("cannot start float with a dot") + } + } + + if !digitSeen { + return l.errorf("no digit in that number") + } + if pointSeen || expSeen { + l.emit(tokenFloat) + } else { + l.emit(tokenInteger) + } + return l.lexRvalue +} + +func (l *tomlLexer) run() { + for state := l.lexVoid; state != nil; { + state = state() + } +} + +func init() { + dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`) +} + +// Entry point +func lexToml(inputBytes []byte) []token { + runes := bytes.Runes(inputBytes) + l := &tomlLexer{ + input: runes, + tokens: make([]token, 0, 256), + line: 1, + col: 1, + endbufferLine: 1, + endbufferCol: 1, + } + l.run() + return l.tokens +} |