go-yaml/scanner/scanner.go

539 lines
13 KiB
Go
Raw Normal View History

2019-10-16 18:19:48 +09:00
package scanner
import (
"io"
"strings"
"github.com/goccy/go-yaml/token"
"golang.org/x/xerrors"
)
2019-10-21 12:53:30 +09:00
// IndentState classifies how the indentation of the character being scanned
// relates to the previously recorded indentation.
type IndentState int

const (
	// IndentStateEqual means the indent is the same as the previous one.
	IndentStateEqual IndentState = iota
	// IndentStateUp means the indent is deeper than the previous one.
	IndentStateUp
	// IndentStateDown means the indent is shallower than the previous one.
	IndentStateDown
	// IndentStateKeep means the character is not the first one on its line,
	// so it carries no indent information.
	IndentStateKeep
)
2019-10-21 12:53:30 +09:00
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
2019-10-16 18:19:48 +09:00
type Scanner struct {
2019-11-07 18:01:45 +09:00
source []rune
2019-11-07 13:00:00 +09:00
sourcePos int
sourceSize int
line int
column int
offset int
prevIndentLevel int
prevIndentNum int
prevIndentColumn int
indentLevel int
indentNum int
isFirstCharAtLine bool
isAnchor bool
startedFlowSequenceNum int
startedFlowMapNum int
indentState IndentState
savedPos *token.Position
2019-10-16 18:19:48 +09:00
}
// pos returns a freshly allocated snapshot of the scanner's current line,
// column, offset and indentation.
func (s *Scanner) pos() *token.Position {
	current := token.Position{
		Line:        s.line,
		Column:      s.column,
		Offset:      s.offset,
		IndentNum:   s.indentNum,
		IndentLevel: s.indentLevel,
	}
	return &current
}
func (s *Scanner) bufferedToken(ctx *Context) *token.Token {
if s.savedPos != nil {
tk := ctx.bufferedToken(s.savedPos)
s.savedPos = nil
return tk
}
trimmedSrc := strings.TrimLeft(string(ctx.buf), " ")
size := len([]rune(trimmedSrc))
2019-10-16 18:19:48 +09:00
return ctx.bufferedToken(&token.Position{
Line: s.line,
Column: s.column - size,
Offset: s.offset - size,
IndentNum: s.indentNum,
IndentLevel: s.indentLevel,
})
}
// progressColumn advances the context by num characters and moves the
// scanner's column and offset by the same amount.
func (s *Scanner) progressColumn(ctx *Context, num int) {
	ctx.progress(num)
	s.offset += num
	s.column += num
}
// progressLine consumes one character as a line break: the context advances
// by one, the scanner moves to column 1 of the next line and the per-line
// state (indent count, first-character flag, anchor flag) is reset.
func (s *Scanner) progressLine(ctx *Context) {
	ctx.progress(1)
	s.offset++
	s.line++
	s.column = 1
	s.indentNum = 0
	s.isAnchor = false
	s.isFirstCharAtLine = true
}
2019-11-07 23:54:32 +09:00
// isNeededKeepPreviousIndentNum reports whether updateIndent should leave the
// recorded previous indent untouched. That is the case only when the indent
// just went up and either the context is in document mode or c is '-' while
// the buffer already holds text.
func (s *Scanner) isNeededKeepPreviousIndentNum(ctx *Context, c rune) bool {
	if s.indentState != IndentStateUp {
		return false
	}
	return ctx.isDocument() || (c == '-' && ctx.bufferedSrc() != "")
}
2019-11-07 23:45:39 +09:00
func (s *Scanner) updateIndent(ctx *Context, c rune) {
2019-10-16 18:19:48 +09:00
if s.isFirstCharAtLine && c == ' ' {
s.indentNum++
return
}
if !s.isFirstCharAtLine {
s.indentState = IndentStateKeep
return
}
if s.prevIndentNum < s.indentNum {
s.indentLevel = s.prevIndentLevel + 1
s.indentState = IndentStateUp
} else if s.prevIndentNum == s.indentNum {
s.indentLevel = s.prevIndentLevel
s.indentState = IndentStateEqual
} else {
s.indentState = IndentStateDown
2019-10-21 14:54:26 +09:00
if s.prevIndentLevel > 0 {
s.indentLevel = s.prevIndentLevel - 1
}
2019-10-16 18:19:48 +09:00
}
2019-10-21 15:58:46 +09:00
if s.prevIndentColumn > 0 {
if s.prevIndentColumn < s.column {
2019-10-21 15:58:46 +09:00
s.indentState = IndentStateUp
} else if s.prevIndentColumn == s.column {
2019-10-21 15:58:46 +09:00
s.indentState = IndentStateEqual
} else {
s.indentState = IndentStateDown
}
}
2019-11-07 23:45:39 +09:00
s.isFirstCharAtLine = false
2019-11-07 23:54:32 +09:00
if s.isNeededKeepPreviousIndentNum(ctx, c) {
2019-11-07 23:45:39 +09:00
return
}
2019-10-16 18:19:48 +09:00
s.prevIndentNum = s.indentNum
s.prevIndentColumn = 0
2019-10-16 18:19:48 +09:00
s.prevIndentLevel = s.indentLevel
}
// isChangedToIndentStateDown reports whether the latest updateIndent call
// found a shallower indent.
func (s *Scanner) isChangedToIndentStateDown() bool {
	return s.indentState == IndentStateDown
}
// isChangedToIndentStateUp reports whether the latest updateIndent call
// found a deeper indent.
func (s *Scanner) isChangedToIndentStateUp() bool {
	return s.indentState == IndentStateUp
}
// isChangedToIndentStateEqual reports whether the latest updateIndent call
// found the same indent as before.
func (s *Scanner) isChangedToIndentStateEqual() bool {
	return s.indentState == IndentStateEqual
}
// addBufferedTokenIfExists builds a token from the buffered characters and
// passes it to ctx.addToken. The token may be nil; presumably addToken
// ignores nil tokens (verify in Context).
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	tk := s.bufferedToken(ctx)
	ctx.addToken(tk)
}
// breakLiteral delegates to ctx.breakLiteral; scan calls it in document mode
// once the indent is no longer deeper than the reference indent.
func (s *Scanner) breakLiteral(ctx *Context) {
	ctx.breakLiteral()
}
func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) {
ctx.addOriginBuf(ch)
startIndex := ctx.idx + 1
ctx.progress(1)
for idx, c := range ctx.src[startIndex:] {
pos = idx + 1
ctx.addOriginBuf(c)
switch c {
case ch:
if ctx.previousChar() == '\\' {
continue
}
value := ctx.source(startIndex, startIndex+idx)
switch ch {
case '\'':
tk = token.SingleQuote(value, string(ctx.obuf), s.pos())
case '"':
tk = token.DoubleQuote(value, string(ctx.obuf), s.pos())
}
pos = len([]rune(value)) + 1
2019-10-16 18:19:48 +09:00
return
}
}
return
}
func (s *Scanner) scanTag(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('!')
ctx.progress(1) // skip '!' character
for idx, c := range ctx.src[ctx.idx:] {
pos = idx + 1
ctx.addOriginBuf(c)
switch c {
case ' ', '\n':
value := ctx.source(ctx.idx-1, ctx.idx+idx)
tk = token.Tag(value, string(ctx.obuf), s.pos())
pos = len([]rune(value))
2019-10-16 18:19:48 +09:00
return
}
}
return
}
func (s *Scanner) scanComment(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('#')
ctx.progress(1) // skip '#' character
for idx, c := range ctx.src[ctx.idx:] {
pos = idx + 1
ctx.addOriginBuf(c)
switch c {
case '\n':
if ctx.previousChar() == '\\' {
continue
}
value := ctx.source(ctx.idx, ctx.idx+idx)
tk = token.Comment(value, string(ctx.obuf), s.pos())
pos = len([]rune(value)) + 1
2019-10-16 18:19:48 +09:00
return
}
}
return
}
func (s *Scanner) scanLiteral(ctx *Context, c rune) {
if ctx.isEOS() {
value := ctx.bufferedSrc()
ctx.addToken(token.New(value, string(ctx.obuf), s.pos()))
2019-11-07 23:45:39 +09:00
ctx.resetBuffer()
2019-10-16 18:19:48 +09:00
}
if c == '\n' {
if ctx.isLiteral {
ctx.addBuf(c)
} else {
ctx.addBuf(' ')
}
s.progressLine(ctx)
} else if s.isFirstCharAtLine && c == ' ' {
s.progressColumn(ctx, 1)
} else {
ctx.addBuf(c)
s.progressColumn(ctx, 1)
}
ctx.addOriginBuf(c)
}
// scanLiteralHeader scans a block scalar header that starts at the current
// '|' or '>' character and runs to the end of the line.
//
// The text after the indicator, with trailing spaces removed, must be empty,
// a chomping indicator ("+" or "-") or a single digit. On success the
// Literal/Folded token is added, the context is switched into the matching
// mode, the buffer is reset and pos is the number of characters between the
// indicator and the line break. Otherwise an error is returned and pos
// should be ignored.
func (s *Scanner) scanLiteralHeader(ctx *Context) (pos int, err error) {
	header := ctx.currentChar()
	ctx.addOriginBuf(header)
	ctx.progress(1) // skip '|' or '>' character
	for idx, c := range ctx.src[ctx.idx:] {
		pos = idx
		ctx.addOriginBuf(c)
		if c != '\n' {
			continue
		}
		value := ctx.source(ctx.idx, ctx.idx+idx)
		opt := strings.TrimRight(value, " ")
		switch opt {
		case "", "+", "-",
			"0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
		default:
			// The header line is complete and invalid: fail right away
			// instead of scanning the remaining input.
			return pos, xerrors.New("invalid literal header")
		}
		switch header {
		case '|':
			ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos()))
			ctx.isLiteral = true
		case '>':
			ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos()))
			ctx.isFolded = true
		}
		ctx.resetBuffer()
		ctx.literalOpt = opt
		return pos, nil
	}
	// The input ended before the header line did.
	return pos, xerrors.New("invalid literal header")
}
func (s *Scanner) scanNewLine(ctx *Context, c rune) {
if len(ctx.buf) > 0 && s.savedPos == nil {
s.savedPos = s.pos()
s.savedPos.Column -= len([]rune(ctx.bufferedSrc()))
2019-10-16 18:19:48 +09:00
}
if ctx.isEOS() {
s.addBufferedTokenIfExists(ctx)
} else if s.isAnchor {
s.addBufferedTokenIfExists(ctx)
2019-10-16 18:19:48 +09:00
}
ctx.addBuf(' ')
ctx.addOriginBuf(c)
s.progressLine(ctx)
}
func (s *Scanner) scan(ctx *Context) (pos int) {
for ctx.next() {
pos = ctx.nextPos()
c := ctx.currentChar()
2019-11-07 23:45:39 +09:00
s.updateIndent(ctx, c)
if ctx.isDocument() {
if s.isChangedToIndentStateEqual() ||
s.isChangedToIndentStateDown() {
s.addBufferedTokenIfExists(ctx)
s.breakLiteral(ctx)
} else {
s.scanLiteral(ctx, c)
continue
}
} else if s.isChangedToIndentStateDown() {
2019-10-16 18:19:48 +09:00
s.addBufferedTokenIfExists(ctx)
} else if s.isChangedToIndentStateEqual() {
// if first character is \n, buffer expect to raw folded literal
if len(ctx.obuf) > 0 && ctx.obuf[0] != '\n' {
// doesn't raw folded literal
s.addBufferedTokenIfExists(ctx)
}
2019-10-16 18:19:48 +09:00
}
switch c {
case '{':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" {
ctx.addOriginBuf(c)
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowMapNum++
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '}':
2019-11-07 13:00:00 +09:00
if ctx.bufferedSrc() == "" || s.startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
ctx.addToken(s.bufferedToken(ctx))
ctx.addOriginBuf(c)
ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowMapNum--
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '.':
if s.indentNum == 0 && ctx.repeatNum('.') == 3 {
ctx.addToken(token.DocumentEnd(s.pos()))
s.progressColumn(ctx, 3)
pos += 2
return
}
case '<':
if ctx.repeatNum('<') == 2 {
s.prevIndentColumn = s.column
ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
pos++
return
}
case '-':
if s.indentNum == 0 && ctx.repeatNum('-') == 3 {
s.addBufferedTokenIfExists(ctx)
ctx.addToken(token.DocumentHeader(s.pos()))
s.progressColumn(ctx, 3)
pos += 2
return
}
2019-10-25 15:47:02 +09:00
if ctx.bufferedSrc() != "" && s.isChangedToIndentStateUp() {
2019-10-16 18:19:48 +09:00
// raw folded
ctx.isRawFolded = true
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
nc := ctx.nextChar()
if nc == ' ' {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
tk := token.SequenceEntry(string(ctx.obuf), s.pos())
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
return
}
case '[':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" {
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowSequenceNum++
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ']':
2019-11-07 13:00:00 +09:00
if ctx.bufferedSrc() == "" || s.startedFlowSequenceNum > 0 {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowSequenceNum--
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ',':
2019-11-07 13:00:00 +09:00
if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ':':
nc := ctx.nextChar()
2019-10-30 16:57:59 +09:00
if nc == ' ' || nc == '\n' || ctx.isNextEOS() {
2019-10-16 18:19:48 +09:00
// mapping value
tk := s.bufferedToken(ctx)
if tk != nil {
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
}
2019-10-16 18:19:48 +09:00
ctx.addToken(token.MappingValue(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '|', '>':
if ctx.bufferedSrc() == "" {
progress, err := s.scanLiteralHeader(ctx)
if err != nil {
// TODO: returns syntax error object
return
}
s.progressColumn(ctx, progress)
s.progressLine(ctx)
continue
}
case '!':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" {
token, progress := s.scanTag(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
if c := ctx.previousChar(); c == '\n' {
s.progressLine(ctx)
}
pos += progress
return
2019-10-16 18:19:48 +09:00
}
case '%':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" && s.indentNum == 0 {
2019-10-16 18:19:48 +09:00
ctx.addToken(token.Directive(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '?':
nc := ctx.nextChar()
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" && nc == ' ' {
2019-10-16 18:19:48 +09:00
ctx.addToken(token.Directive(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '&':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Anchor(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
s.isAnchor = true
return
}
2019-10-16 18:19:48 +09:00
case '*':
2019-11-06 19:28:47 +09:00
if ctx.bufferedSrc() == "" {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Alias(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '#':
s.addBufferedTokenIfExists(ctx)
token, progress := s.scanComment(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
s.progressLine(ctx)
pos += progress
return
case '\'', '"':
2019-11-07 18:08:12 +09:00
if ctx.bufferedSrc() == "" {
token, progress := s.scanQuote(ctx, c)
ctx.addToken(token)
s.progressColumn(ctx, progress)
pos += progress
return
}
2019-10-16 18:19:48 +09:00
case '\n':
s.scanNewLine(ctx, c)
continue
case ' ':
if ctx.isSaveIndentMode() || (!s.isAnchor && !s.isFirstCharAtLine) {
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if s.isFirstCharAtLine {
s.progressColumn(ctx, 1)
ctx.addOriginBuf(c)
continue
}
s.addBufferedTokenIfExists(ctx)
s.progressColumn(ctx, 1)
s.isAnchor = false
return
}
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
}
s.addBufferedTokenIfExists(ctx)
2019-10-16 18:19:48 +09:00
return
}
2019-10-21 12:53:30 +09:00
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
2019-11-07 18:01:45 +09:00
func (s *Scanner) Init(text string) {
src := []rune(text)
2019-10-16 18:19:48 +09:00
s.source = src
s.sourcePos = 0
s.sourceSize = len(src)
s.line = 1
s.column = 1
s.offset = 1
s.prevIndentLevel = 0
s.prevIndentNum = 0
s.prevIndentColumn = 0
2019-10-16 18:19:48 +09:00
s.indentLevel = 0
s.indentNum = 0
s.isFirstCharAtLine = true
}
2019-10-21 12:53:30 +09:00
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
2019-10-16 18:19:48 +09:00
func (s *Scanner) Scan() (token.Tokens, error) {
if s.sourcePos >= s.sourceSize {
return nil, io.EOF
}
ctx := newContext(s.source[s.sourcePos:])
progress := s.scan(ctx)
s.sourcePos += progress
return ctx.tokens, nil
}