go-yaml/scanner/scanner.go

900 lines
22 KiB
Go
Raw Normal View History

2019-10-16 18:19:48 +09:00
package scanner
import (
"io"
"strings"
"github.com/goccy/go-yaml/token"
"golang.org/x/xerrors"
)
2019-10-21 12:53:30 +09:00
// IndentState describes how the indentation of the line being scanned
// relates to the indentation recorded for the previous line.
type IndentState int

const (
	// IndentStateEqual means the indent equals the previous indent.
	IndentStateEqual IndentState = iota
	// IndentStateUp means the indent is deeper than the previous indent.
	IndentStateUp
	// IndentStateDown means the indent is shallower than the previous indent.
	IndentStateDown
	// IndentStateKeep is set while scanning characters that are not the
	// first character of a line, i.e. the character is not an indent token.
	IndentStateKeep
)
2019-10-21 12:53:30 +09:00
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
2019-10-16 18:19:48 +09:00
type Scanner struct {
2019-11-07 18:01:45 +09:00
source []rune
2019-11-07 13:00:00 +09:00
sourcePos int
sourceSize int
line int
column int
offset int
prevIndentLevel int
prevIndentNum int
prevIndentColumn int
2019-11-08 01:04:29 +09:00
docStartColumn int
2019-11-07 13:00:00 +09:00
indentLevel int
indentNum int
isFirstCharAtLine bool
isAnchor bool
startedFlowSequenceNum int
startedFlowMapNum int
indentState IndentState
savedPos *token.Position
2019-10-16 18:19:48 +09:00
}
// pos returns a newly allocated token.Position filled with the scanner's
// current line, column, offset, indent width and indent level.
func (s *Scanner) pos() *token.Position {
	p := new(token.Position)
	p.Line = s.line
	p.Column = s.column
	p.Offset = s.offset
	p.IndentNum = s.indentNum
	p.IndentLevel = s.indentLevel
	return p
}
func (s *Scanner) bufferedToken(ctx *Context) *token.Token {
if s.savedPos != nil {
tk := ctx.bufferedToken(s.savedPos)
s.savedPos = nil
return tk
}
line := s.line
column := s.column - len(ctx.buf)
level := s.indentLevel
if ctx.isSaveIndentMode() {
line -= s.newLineCount(ctx.buf)
column = strings.Index(string(ctx.obuf), string(ctx.buf)) + 1
// Since we are in a literal, folded or raw folded
// we can use the indent level from the last token.
last := ctx.lastToken()
if last != nil { // The last token should never be nil here.
level = last.Position.IndentLevel + 1
}
}
2019-10-16 18:19:48 +09:00
return ctx.bufferedToken(&token.Position{
Line: line,
Column: column,
Offset: s.offset - len(ctx.buf),
2019-10-16 18:19:48 +09:00
IndentNum: s.indentNum,
IndentLevel: level,
2019-10-16 18:19:48 +09:00
})
}
// progressColumn moves the scanner num runes forward on the current line:
// column and offset grow by num and ctx is advanced by the same amount.
func (s *Scanner) progressColumn(ctx *Context, num int) {
	s.column, s.offset = s.column+num, s.offset+num
	ctx.progress(num)
}
// progressLine moves the scanner past one line-break rune: the line counter
// and offset grow by one, the column restarts at 1, and the per-line state
// (indent width, first-character flag, anchor flag) is reset. ctx is
// advanced by one rune.
func (s *Scanner) progressLine(ctx *Context) {
	s.line++
	s.offset++
	s.column = 1

	s.indentNum = 0
	s.isAnchor = false
	s.isFirstCharAtLine = true

	ctx.progress(1)
}
2019-11-07 23:54:32 +09:00
func (s *Scanner) isNeededKeepPreviousIndentNum(ctx *Context, c rune) bool {
if !s.isChangedToIndentStateUp() {
return false
}
if ctx.isDocument() {
return true
}
if c == '-' && ctx.existsBuffer() {
2019-11-07 23:54:32 +09:00
return true
}
return false
}
// isNewLineChar reports whether c is a line feed or a carriage return.
func (s *Scanner) isNewLineChar(c rune) bool {
	return c == '\n' || c == '\r'
}
2019-12-22 17:28:30 +09:00
// newLineCount returns the number of line breaks in src. A carriage return
// immediately followed by a line feed counts as a single line break.
func (s *Scanner) newLineCount(src []rune) int {
	count := 0
	afterCR := false
	for _, r := range src {
		switch {
		case r == '\r':
			count++
			afterCR = true
		case r == '\n':
			// The LF of a CRLF pair was already counted with its CR.
			if !afterCR {
				count++
			}
			afterCR = false
		default:
			afterCR = false
		}
	}
	return count
}
func (s *Scanner) updateIndentState(ctx *Context) {
indentNumBasedIndentState := s.indentState
2019-10-16 18:19:48 +09:00
if s.prevIndentNum < s.indentNum {
s.indentLevel = s.prevIndentLevel + 1
indentNumBasedIndentState = IndentStateUp
2019-10-16 18:19:48 +09:00
} else if s.prevIndentNum == s.indentNum {
s.indentLevel = s.prevIndentLevel
indentNumBasedIndentState = IndentStateEqual
2019-10-16 18:19:48 +09:00
} else {
indentNumBasedIndentState = IndentStateDown
2019-10-21 14:54:26 +09:00
if s.prevIndentLevel > 0 {
s.indentLevel = s.prevIndentLevel - 1
}
2019-10-16 18:19:48 +09:00
}
2019-10-21 15:58:46 +09:00
if s.prevIndentColumn > 0 {
if s.prevIndentColumn < s.column {
2019-10-21 15:58:46 +09:00
s.indentState = IndentStateUp
} else if s.prevIndentColumn != s.column || indentNumBasedIndentState != IndentStateEqual {
// The following case ( current position is 'd' ), some variables becomes like here
// - prevIndentColumn: 1 of 'a'
2022-01-11 20:43:39 +09:00
// - indentNumBasedIndentState: IndentStateDown because d's indentNum(1) is less than c's indentNum(3).
// Therefore, s.prevIndentColumn(1) == s.column(1) is true, but we want to treat this as IndentStateDown.
2022-01-11 20:43:39 +09:00
// So, we look also current indentState value by the above prevIndentNum based logic, and determins finally indentState.
// ---
// a:
// b
// c
// d: e
// ^
2019-10-21 15:58:46 +09:00
s.indentState = IndentStateDown
} else {
s.indentState = IndentStateEqual
2019-10-21 15:58:46 +09:00
}
} else {
s.indentState = indentNumBasedIndentState
}
}
func (s *Scanner) updateIndent(ctx *Context, c rune) {
if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() {
return
}
if s.isFirstCharAtLine && c == ' ' {
s.indentNum++
return
}
if !s.isFirstCharAtLine {
s.indentState = IndentStateKeep
return
2019-10-21 15:58:46 +09:00
}
s.updateIndentState(ctx)
2019-11-07 23:45:39 +09:00
s.isFirstCharAtLine = false
2019-11-07 23:54:32 +09:00
if s.isNeededKeepPreviousIndentNum(ctx, c) {
2019-11-07 23:45:39 +09:00
return
}
if s.indentState != IndentStateUp {
s.prevIndentColumn = 0
}
2019-10-16 18:19:48 +09:00
s.prevIndentNum = s.indentNum
s.prevIndentLevel = s.indentLevel
}
// isChangedToIndentStateDown reports whether the current indent state is
// IndentStateDown.
func (s *Scanner) isChangedToIndentStateDown() bool {
	state := s.indentState
	return state == IndentStateDown
}
// isChangedToIndentStateUp reports whether the current indent state is
// IndentStateUp.
func (s *Scanner) isChangedToIndentStateUp() bool {
	state := s.indentState
	return state == IndentStateUp
}
// isChangedToIndentStateEqual reports whether the current indent state is
// IndentStateEqual.
func (s *Scanner) isChangedToIndentStateEqual() bool {
	state := s.indentState
	return state == IndentStateEqual
}
// addBufferedTokenIfExists builds a token from the text buffered in ctx and
// hands the result to ctx.addToken.
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	tk := s.bufferedToken(ctx)
	ctx.addToken(tk)
}
func (s *Scanner) breakLiteral(ctx *Context) {
2019-11-08 01:04:29 +09:00
s.docStartColumn = 0
2019-10-16 18:19:48 +09:00
ctx.breakLiteral()
}
func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('\'')
srcpos := s.pos()
2019-10-16 18:19:48 +09:00
startIndex := ctx.idx + 1
src := ctx.src
size := len(src)
value := []rune{}
isFirstLineChar := false
isNewLine := false
for idx := startIndex; idx < size; idx++ {
if !isNewLine {
s.progressColumn(ctx, 1)
} else {
isNewLine = false
}
c := src[idx]
2019-10-16 18:19:48 +09:00
pos = idx + 1
ctx.addOriginBuf(c)
if s.isNewLineChar(c) {
value = append(value, ' ')
isFirstLineChar = true
isNewLine = true
s.progressLine(ctx)
continue
} else if c == ' ' && isFirstLineChar {
continue
} else if c != '\'' {
value = append(value, c)
isFirstLineChar = false
continue
}
if idx+1 < len(ctx.src) && ctx.src[idx+1] == '\'' {
// '' handle as ' character
value = append(value, c)
2020-05-29 18:23:08 +09:00
ctx.addOriginBuf(c)
idx++
continue
}
s.progressColumn(ctx, 1)
tk = token.SingleQuote(string(value), string(ctx.obuf), srcpos)
pos = idx - startIndex + 1
return
}
return
}
// hexToInt converts one hexadecimal digit rune to its numeric value.
// 'A'-'F' and 'a'-'f' map to 10-15; every other rune is treated as a
// decimal digit and returned as its distance from '0' (no validation).
func hexToInt(b rune) int {
	switch {
	case b >= 'A' && b <= 'F':
		return int(b) - 'A' + 10
	case b >= 'a' && b <= 'f':
		return int(b) - 'a' + 10
	default:
		return int(b) - '0'
	}
}
// hexRunesToInt interprets b as a big-endian sequence of hexadecimal digits
// and returns the resulting number. An empty slice yields 0.
func hexRunesToInt(b []rune) int {
	total := 0
	for _, digit := range b {
		// Shift the digits collected so far one hex place to the left.
		total = total<<4 + hexToInt(digit)
	}
	return total
}
func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('"')
srcpos := s.pos()
startIndex := ctx.idx + 1
src := ctx.src
size := len(src)
value := []rune{}
isFirstLineChar := false
isNewLine := false
for idx := startIndex; idx < size; idx++ {
if !isNewLine {
s.progressColumn(ctx, 1)
} else {
isNewLine = false
}
c := src[idx]
pos = idx + 1
ctx.addOriginBuf(c)
if s.isNewLineChar(c) {
value = append(value, ' ')
isFirstLineChar = true
isNewLine = true
s.progressLine(ctx)
continue
} else if c == ' ' && isFirstLineChar {
continue
} else if c == '\\' {
isFirstLineChar = false
if idx+1 < size {
nextChar := src[idx+1]
switch nextChar {
case 'b':
ctx.addOriginBuf(nextChar)
value = append(value, '\b')
idx++
continue
case 'e':
ctx.addOriginBuf(nextChar)
value = append(value, '\x1B')
idx++
continue
case 'f':
ctx.addOriginBuf(nextChar)
value = append(value, '\f')
idx++
continue
case 'n':
ctx.addOriginBuf(nextChar)
value = append(value, '\n')
idx++
continue
case 'v':
ctx.addOriginBuf(nextChar)
value = append(value, '\v')
idx++
continue
case 'L': // LS (#x2028)
ctx.addOriginBuf(nextChar)
value = append(value, []rune{'\xE2', '\x80', '\xA8'}...)
idx++
continue
case 'N': // NEL (#x85)
ctx.addOriginBuf(nextChar)
value = append(value, []rune{'\xC2', '\x85'}...)
idx++
continue
case 'P': // PS (#x2029)
ctx.addOriginBuf(nextChar)
value = append(value, []rune{'\xE2', '\x80', '\xA9'}...)
idx++
continue
case '_': // #xA0
ctx.addOriginBuf(nextChar)
value = append(value, []rune{'\xC2', '\xA0'}...)
idx++
continue
case '"':
ctx.addOriginBuf(nextChar)
value = append(value, nextChar)
idx++
continue
case 'x':
if idx+3 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\x")
return
}
codeNum := hexRunesToInt(src[idx+2 : idx+4])
value = append(value, rune(codeNum))
idx += 3
continue
case 'u':
if idx+5 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\u")
return
}
codeNum := hexRunesToInt(src[idx+2 : idx+6])
value = append(value, rune(codeNum))
idx += 5
continue
case 'U':
if idx+9 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\U")
return
}
codeNum := hexRunesToInt(src[idx+2 : idx+10])
value = append(value, rune(codeNum))
idx += 9
continue
case '\\':
ctx.addOriginBuf(nextChar)
idx++
}
2019-10-16 18:19:48 +09:00
}
value = append(value, c)
continue
} else if c != '"' {
value = append(value, c)
isFirstLineChar = false
continue
2019-10-16 18:19:48 +09:00
}
s.progressColumn(ctx, 1)
tk = token.DoubleQuote(string(value), string(ctx.obuf), srcpos)
pos = idx - startIndex + 1
return
2019-10-16 18:19:48 +09:00
}
return
}
// scanQuote scans a quoted scalar. A single quote in ch selects
// scanSingleQuote; any other value selects scanDoubleQuote.
func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) {
	switch ch {
	case '\'':
		return s.scanSingleQuote(ctx)
	default:
		return s.scanDoubleQuote(ctx)
	}
}
2021-03-01 17:32:11 +09:00
// isMergeKey reports whether the scanner stands on a merge key: exactly two
// consecutive '<' characters followed, after optional spaces, by a ':' that
// is itself followed by a space or a line break.
func (s *Scanner) isMergeKey(ctx *Context) bool {
	if ctx.repeatNum('<') != 2 {
		return false
	}
	runes := ctx.src
	end := len(runes)
	for i := ctx.idx + 2; i < end; i++ {
		switch runes[i] {
		case ' ':
			// spaces between "<<" and ':' are skipped
		case ':':
			if i+1 >= end {
				continue
			}
			if next := runes[i+1]; next == ' ' || s.isNewLineChar(next) {
				return true
			}
		default:
			return false
		}
	}
	return false
}
2019-10-16 18:19:48 +09:00
func (s *Scanner) scanTag(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('!')
ctx.progress(1) // skip '!' character
for idx, c := range ctx.src[ctx.idx:] {
pos = idx + 1
ctx.addOriginBuf(c)
switch c {
case ' ', '\n', '\r':
2019-10-16 18:19:48 +09:00
value := ctx.source(ctx.idx-1, ctx.idx+idx)
tk = token.Tag(value, string(ctx.obuf), s.pos())
pos = len([]rune(value))
2019-10-16 18:19:48 +09:00
return
}
}
return
}
func (s *Scanner) scanComment(ctx *Context) (tk *token.Token, pos int) {
ctx.addOriginBuf('#')
ctx.progress(1) // skip '#' character
for idx, c := range ctx.src[ctx.idx:] {
pos = idx + 1
ctx.addOriginBuf(c)
switch c {
case '\n', '\r':
2019-10-16 18:19:48 +09:00
if ctx.previousChar() == '\\' {
continue
}
value := ctx.source(ctx.idx, ctx.idx+idx)
tk = token.Comment(value, string(ctx.obuf), s.pos())
pos = len([]rune(value)) + 1
2019-10-16 18:19:48 +09:00
return
}
}
return
}
2021-07-19 18:48:09 +09:00
// trimCommentFromLiteralOpt strips a trailing "# comment" from the option
// part of a block scalar header and returns what is left.
//
// Text without '#' is returned unchanged. Text that starts with '#' is
// rejected with an "invalid literal header" error. Otherwise the character
// directly in front of '#' (the separator) is dropped together with any
// further spaces before it, so that "+  # note" yields "+" just like
// "+ # note" does.
func trimCommentFromLiteralOpt(text string) (string, error) {
	idx := strings.Index(text, "#")
	if idx < 0 {
		return text, nil
	}
	if idx == 0 {
		return "", xerrors.New("invalid literal header")
	}
	return strings.TrimRight(text[:idx-1], " "), nil
}
2019-10-16 18:19:48 +09:00
func (s *Scanner) scanLiteral(ctx *Context, c rune) {
2019-11-08 16:48:54 +09:00
ctx.addOriginBuf(c)
2019-10-16 18:19:48 +09:00
if ctx.isEOS() {
if ctx.isLiteral {
2020-03-07 12:03:06 +09:00
ctx.addBuf(c)
}
2019-10-16 18:19:48 +09:00
value := ctx.bufferedSrc()
ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
2019-11-07 23:45:39 +09:00
ctx.resetBuffer()
2019-11-08 16:48:54 +09:00
s.progressColumn(ctx, 1)
} else if s.isNewLineChar(c) {
2019-10-16 18:19:48 +09:00
if ctx.isLiteral {
ctx.addBuf(c)
} else {
ctx.addBuf(' ')
}
s.progressLine(ctx)
} else if s.isFirstCharAtLine && c == ' ' {
2019-11-08 01:04:29 +09:00
if 0 < s.docStartColumn && s.docStartColumn <= s.column {
ctx.addBuf(c)
}
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
} else {
2019-11-08 01:04:29 +09:00
if s.docStartColumn == 0 {
s.docStartColumn = s.column
}
2019-10-16 18:19:48 +09:00
ctx.addBuf(c)
s.progressColumn(ctx, 1)
}
}
func (s *Scanner) scanLiteralHeader(ctx *Context) (pos int, err error) {
header := ctx.currentChar()
ctx.addOriginBuf(header)
2020-06-17 11:39:08 +09:00
ctx.progress(1) // skip '|' or '>' character
2019-10-16 18:19:48 +09:00
for idx, c := range ctx.src[ctx.idx:] {
pos = idx
ctx.addOriginBuf(c)
switch c {
case '\n', '\r':
2019-10-16 18:19:48 +09:00
value := ctx.source(ctx.idx, ctx.idx+idx)
opt := strings.TrimRight(value, " ")
2021-07-19 18:48:09 +09:00
orgOptLen := len(opt)
opt, err = trimCommentFromLiteralOpt(opt)
if err != nil {
return
}
2019-10-16 18:19:48 +09:00
switch opt {
case "", "+", "-",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
2021-07-19 18:48:09 +09:00
hasComment := len(opt) < orgOptLen
2019-10-16 18:19:48 +09:00
if header == '|' {
2021-07-19 18:48:09 +09:00
if hasComment {
commentLen := orgOptLen - len(opt)
headerPos := strings.Index(string(ctx.obuf), "|")
litBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos]
commentBuf := ctx.obuf[len(litBuf):]
ctx.addToken(token.Literal("|"+opt, string(litBuf), s.pos()))
s.column += len(litBuf)
s.offset += len(litBuf)
commentHeader := strings.Index(value, "#")
ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos()))
} else {
ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos()))
}
2019-10-16 18:19:48 +09:00
ctx.isLiteral = true
} else if header == '>' {
2021-07-19 18:48:09 +09:00
if hasComment {
commentLen := orgOptLen - len(opt)
headerPos := strings.Index(string(ctx.obuf), ">")
foldedBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos]
commentBuf := ctx.obuf[len(foldedBuf):]
ctx.addToken(token.Folded(">"+opt, string(foldedBuf), s.pos()))
s.column += len(foldedBuf)
s.offset += len(foldedBuf)
commentHeader := strings.Index(value, "#")
ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos()))
} else {
ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos()))
}
2019-10-16 18:19:48 +09:00
ctx.isFolded = true
}
2019-11-08 16:48:54 +09:00
s.indentState = IndentStateKeep
2019-10-16 18:19:48 +09:00
ctx.resetBuffer()
ctx.literalOpt = opt
return
}
break
}
}
err = xerrors.New("invalid literal header")
return
}
func (s *Scanner) scanNewLine(ctx *Context, c rune) {
if len(ctx.buf) > 0 && s.savedPos == nil {
s.savedPos = s.pos()
s.savedPos.Column -= len(ctx.bufferedSrc())
2019-10-16 18:19:48 +09:00
}
// if the following case, origin buffer has unnecessary two spaces.
// So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too.
// ---
// a:[space][space]
// b: c
removedNum := ctx.removeRightSpaceFromBuf()
if removedNum > 0 {
s.column -= removedNum
s.offset -= removedNum
if s.savedPos != nil {
s.savedPos.Column -= removedNum
}
}
2019-10-16 18:19:48 +09:00
if ctx.isEOS() {
s.addBufferedTokenIfExists(ctx)
} else if s.isAnchor {
s.addBufferedTokenIfExists(ctx)
2019-10-16 18:19:48 +09:00
}
ctx.addBuf(' ')
ctx.addOriginBuf(c)
ctx.isSingleLine = false
2019-10-16 18:19:48 +09:00
s.progressLine(ctx)
}
func (s *Scanner) scan(ctx *Context) (pos int) {
for ctx.next() {
pos = ctx.nextPos()
c := ctx.currentChar()
2019-11-07 23:45:39 +09:00
s.updateIndent(ctx, c)
if ctx.isDocument() {
if s.isChangedToIndentStateEqual() ||
s.isChangedToIndentStateDown() {
s.addBufferedTokenIfExists(ctx)
s.breakLiteral(ctx)
} else {
s.scanLiteral(ctx, c)
continue
}
} else if s.isChangedToIndentStateDown() {
2019-10-16 18:19:48 +09:00
s.addBufferedTokenIfExists(ctx)
} else if s.isChangedToIndentStateEqual() {
// if first character is new line character, buffer expect to raw folded literal
2019-12-22 17:28:30 +09:00
if len(ctx.obuf) > 0 && s.newLineCount(ctx.obuf) <= 1 {
// doesn't raw folded literal
s.addBufferedTokenIfExists(ctx)
}
2019-10-16 18:19:48 +09:00
}
switch c {
case '{':
if !ctx.existsBuffer() {
2019-11-06 19:28:47 +09:00
ctx.addOriginBuf(c)
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowMapNum++
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '}':
if !ctx.existsBuffer() || s.startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
ctx.addToken(s.bufferedToken(ctx))
ctx.addOriginBuf(c)
ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowMapNum--
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '.':
if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('.') == 3 {
ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 3)
pos += 2
return
}
case '<':
2021-03-01 17:32:11 +09:00
if s.isMergeKey(ctx) {
s.prevIndentColumn = s.column
ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
pos++
return
}
case '-':
if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('-') == 3 {
2019-10-16 18:19:48 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 3)
pos += 2
return
}
if ctx.existsBuffer() && s.isChangedToIndentStateUp() {
2019-10-16 18:19:48 +09:00
// raw folded
ctx.isRawFolded = true
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
2020-03-07 12:03:06 +09:00
if ctx.existsBuffer() {
// '-' is literal
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
2019-10-16 18:19:48 +09:00
nc := ctx.nextChar()
2020-03-07 20:17:54 +09:00
if nc == ' ' || s.isNewLineChar(nc) {
2019-10-16 18:19:48 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
tk := token.SequenceEntry(string(ctx.obuf), s.pos())
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
return
}
case '[':
if !ctx.existsBuffer() {
2019-11-06 19:28:47 +09:00
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowSequenceNum++
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ']':
if !ctx.existsBuffer() || s.startedFlowSequenceNum > 0 {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos()))
2019-11-07 13:00:00 +09:00
s.startedFlowSequenceNum--
2019-11-06 19:28:47 +09:00
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ',':
2019-11-07 13:00:00 +09:00
if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case ':':
nc := ctx.nextChar()
if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() {
2019-10-16 18:19:48 +09:00
// mapping value
tk := s.bufferedToken(ctx)
if tk != nil {
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
2022-12-02 04:02:53 +09:00
} else if tk := ctx.lastToken(); tk != nil {
// If the map key is quote, the buffer does not exist because it has already been cut into tokens.
// Therefore, we need to check the last token.
if tk.Indicator == token.QuotedScalarIndicator {
s.prevIndentColumn = tk.Position.Column
}
}
2019-10-16 18:19:48 +09:00
ctx.addToken(token.MappingValue(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '|', '>':
if !ctx.existsBuffer() {
2019-10-16 18:19:48 +09:00
progress, err := s.scanLiteralHeader(ctx)
if err != nil {
// TODO: returns syntax error object
return
}
s.progressColumn(ctx, progress)
s.progressLine(ctx)
continue
}
case '!':
if !ctx.existsBuffer() {
2019-11-06 19:28:47 +09:00
token, progress := s.scanTag(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
if c := ctx.previousChar(); s.isNewLineChar(c) {
2019-11-06 19:28:47 +09:00
s.progressLine(ctx)
}
pos += progress
return
2019-10-16 18:19:48 +09:00
}
case '%':
if !ctx.existsBuffer() && s.indentNum == 0 {
ctx.addToken(token.Directive(string(ctx.obuf)+"%", s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
return
}
case '?':
nc := ctx.nextChar()
if !ctx.existsBuffer() && nc == ' ' {
2020-07-02 17:22:04 +09:00
ctx.addToken(token.MappingKey(s.pos()))
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
return
}
case '&':
if !ctx.existsBuffer() {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Anchor(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
s.isAnchor = true
return
}
2019-10-16 18:19:48 +09:00
case '*':
if !ctx.existsBuffer() {
2019-11-06 19:28:47 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Alias(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
2019-10-16 18:19:48 +09:00
case '#':
if !ctx.existsBuffer() || ctx.previousChar() == ' ' {
s.addBufferedTokenIfExists(ctx)
token, progress := s.scanComment(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
s.progressLine(ctx)
pos += progress
return
}
2019-10-16 18:19:48 +09:00
case '\'', '"':
if !ctx.existsBuffer() {
2019-11-07 18:08:12 +09:00
token, progress := s.scanQuote(ctx, c)
ctx.addToken(token)
pos += progress
2022-12-02 17:03:39 +09:00
// If the non-whitespace character immediately following the quote is ':', the quote should be treated as a map key.
2022-12-02 04:02:53 +09:00
// Therefore, do not return and continue processing as a normal map key.
2022-12-02 17:03:39 +09:00
if ctx.currentCharWithSkipWhitespace() == ':' {
2022-12-02 04:02:53 +09:00
continue
}
2019-11-07 18:08:12 +09:00
return
}
2019-11-09 17:20:39 +09:00
case '\r', '\n':
// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
// > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
// > Outside scalar content, YAML allows any line break to be used to terminate lines.
// > -- https://yaml.org/spec/1.2/spec.html
if c == '\r' && ctx.nextChar() == '\n' {
ctx.addOriginBuf('\r')
ctx.progress(1)
c = '\n'
}
2019-10-16 18:19:48 +09:00
s.scanNewLine(ctx, c)
continue
case ' ':
if ctx.isSaveIndentMode() || (!s.isAnchor && !s.isFirstCharAtLine) {
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if s.isFirstCharAtLine {
s.progressColumn(ctx, 1)
ctx.addOriginBuf(c)
continue
}
s.addBufferedTokenIfExists(ctx)
2021-03-01 18:49:34 +09:00
pos-- // to rescan white space at next scanning for adding white space to next buffer.
2019-10-16 18:19:48 +09:00
s.isAnchor = false
return
}
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
}
s.addBufferedTokenIfExists(ctx)
2019-10-16 18:19:48 +09:00
return
}
2019-10-21 12:53:30 +09:00
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
func (s *Scanner) Init(text string) {
src := []rune(text)
2019-10-16 18:19:48 +09:00
s.source = src
s.sourcePos = 0
s.sourceSize = len(src)
s.line = 1
s.column = 1
s.offset = 1
s.prevIndentLevel = 0
s.prevIndentNum = 0
s.prevIndentColumn = 0
2019-10-16 18:19:48 +09:00
s.indentLevel = 0
s.indentNum = 0
s.isFirstCharAtLine = true
}
2019-10-21 12:53:30 +09:00
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
2019-10-16 18:19:48 +09:00
func (s *Scanner) Scan() (token.Tokens, error) {
if s.sourcePos >= s.sourceSize {
return nil, io.EOF
}
ctx := newContext(s.source[s.sourcePos:])
defer ctx.release()
2019-10-16 18:19:48 +09:00
progress := s.scan(ctx)
s.sourcePos += progress
var tokens token.Tokens
tokens = append(tokens, ctx.tokens...)
return tokens, nil
2019-10-16 18:19:48 +09:00
}