go-yaml/scanner/scanner.go

1501 lines
34 KiB
Go
Raw Normal View History

2019-10-16 18:19:48 +09:00
package scanner
import (
"errors"
"fmt"
2019-10-16 18:19:48 +09:00
"io"
"strconv"
2019-10-16 18:19:48 +09:00
"strings"
"github.com/goccy/go-yaml/token"
2019-10-16 18:19:48 +09:00
)
2019-10-21 12:53:30 +09:00
// IndentState classifies the indentation of the character currently being
// scanned.
type IndentState int

const (
	// IndentStateEqual means the indent equals the previous indent.
	IndentStateEqual IndentState = iota
	// IndentStateUp means the indent is deeper than the previous indent.
	IndentStateUp
	// IndentStateDown means the indent is shallower than the previous indent.
	IndentStateDown
	// IndentStateKeep means the current character is not an indentation
	// character at the start of a line, so the indent is unchanged.
	IndentStateKeep
)
2019-10-21 12:53:30 +09:00
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
2019-10-16 18:19:48 +09:00
type Scanner struct {
source []rune
sourcePos int
sourceSize int
// line number. This number starts from 1.
line int
// column number. This number starts from 1.
column int
// offset represents the offset from the beginning of the source.
offset int
// lastDelimColumn is the last column needed to compare indent is retained.
lastDelimColumn int
// indentNum indicates the number of spaces used for indentation.
indentNum int
// prevLineIndentNum indicates the number of spaces used for indentation at previous line.
prevLineIndentNum int
// indentLevel indicates the level of indent depth. This value does not match the column value.
2019-11-07 13:00:00 +09:00
indentLevel int
isFirstCharAtLine bool
isAnchor bool
isAlias bool
isDirective bool
2019-11-07 13:00:00 +09:00
startedFlowSequenceNum int
startedFlowMapNum int
indentState IndentState
savedPos *token.Position
2019-10-16 18:19:48 +09:00
}
// pos returns a freshly allocated token.Position describing the scanner's
// current line, column, offset, indent width and indent level.
func (s *Scanner) pos() *token.Position {
	current := new(token.Position)
	current.Line = s.line
	current.Column = s.column
	current.Offset = s.offset
	current.IndentNum = s.indentNum
	current.IndentLevel = s.indentLevel
	return current
}
func (s *Scanner) bufferedToken(ctx *Context) *token.Token {
if s.savedPos != nil {
tk := ctx.bufferedToken(s.savedPos)
s.savedPos = nil
return tk
}
line := s.line
column := s.column - len(ctx.buf)
level := s.indentLevel
if ctx.isMultiLine() {
line -= s.newLineCount(ctx.buf)
column = strings.Index(string(ctx.obuf), string(ctx.buf)) + 1
// Since we are in a literal, folded or raw folded
// we can use the indent level from the last token.
last := ctx.lastToken()
if last != nil { // The last token should never be nil here.
level = last.Position.IndentLevel + 1
}
}
2019-10-16 18:19:48 +09:00
return ctx.bufferedToken(&token.Position{
Line: line,
Column: column,
Offset: s.offset - len(ctx.buf),
2019-10-16 18:19:48 +09:00
IndentNum: s.indentNum,
IndentLevel: level,
2019-10-16 18:19:48 +09:00
})
}
func (s *Scanner) progressColumn(ctx *Context, num int) {
s.column += num
s.offset += num
s.progress(ctx, num)
2019-10-16 18:19:48 +09:00
}
func (s *Scanner) progressLine(ctx *Context) {
s.prevLineIndentNum = s.indentNum
2019-10-16 18:19:48 +09:00
s.column = 1
s.line++
s.offset++
s.indentNum = 0
s.isFirstCharAtLine = true
s.isAnchor = false
s.isAlias = false
s.isDirective = false
s.progress(ctx, 1)
}
func (s *Scanner) progress(ctx *Context, num int) {
ctx.progress(num)
s.sourcePos += num
2019-10-16 18:19:48 +09:00
}
// isNewLineChar reports whether c is a line feed ('\n') or a carriage
// return ('\r').
func (s *Scanner) isNewLineChar(c rune) bool {
	return c == '\n' || c == '\r'
}
2019-12-22 17:28:30 +09:00
// newLineCount returns the number of line breaks in src. A "\r\n" pair is
// counted as a single line break; a lone '\r' or '\n' counts as one each.
func (s *Scanner) newLineCount(src []rune) int {
	var count int
	for i := 0; i < len(src); i++ {
		if src[i] != '\r' && src[i] != '\n' {
			continue
		}
		// Swallow the LF of a CRLF pair so the pair is counted once.
		if src[i] == '\r' && i+1 < len(src) && src[i+1] == '\n' {
			i++
		}
		count++
	}
	return count
}
func (s *Scanner) updateIndentLevel() {
if s.prevLineIndentNum < s.indentNum {
s.indentLevel++
} else if s.prevLineIndentNum > s.indentNum {
if s.indentLevel > 0 {
s.indentLevel--
2019-10-21 14:54:26 +09:00
}
2019-10-16 18:19:48 +09:00
}
}
// updateIndentState sets indentState by comparing the current column with
// lastDelimColumn. Nothing changes while no delimiter column has been
// recorded (lastDelimColumn == 0). The ctx argument is not used.
func (s *Scanner) updateIndentState(ctx *Context) {
	if s.lastDelimColumn == 0 {
		return
	}
	// A column equal to lastDelimColumn is treated as Down, because the
	// character sits in the same column as the delimiter.
	next := IndentStateDown
	if s.column > s.lastDelimColumn {
		next = IndentStateUp
	}
	s.indentState = next
}
func (s *Scanner) updateIndent(ctx *Context, c rune) {
if s.isFirstCharAtLine && s.isNewLineChar(c) {
return
}
if s.isFirstCharAtLine && c == ' ' {
s.indentNum++
return
}
if s.isFirstCharAtLine && c == '\t' {
// found tab indent.
// In this case, scanTab returns error.
return
}
if !s.isFirstCharAtLine {
s.indentState = IndentStateKeep
return
2019-10-21 15:58:46 +09:00
}
s.updateIndentLevel()
s.updateIndentState(ctx)
2019-11-07 23:45:39 +09:00
s.isFirstCharAtLine = false
2019-10-16 18:19:48 +09:00
}
// isChangedToIndentStateDown reports whether the current indent state is
// IndentStateDown.
func (s *Scanner) isChangedToIndentStateDown() bool {
	return IndentStateDown == s.indentState
}
// isChangedToIndentStateUp reports whether the current indent state is
// IndentStateUp.
func (s *Scanner) isChangedToIndentStateUp() bool {
	return IndentStateUp == s.indentState
}
// addBufferedTokenIfExists builds a token from the characters buffered in
// ctx (see bufferedToken) and hands it to ctx.addToken.
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	tk := s.bufferedToken(ctx)
	ctx.addToken(tk)
}
func (s *Scanner) breakMultiLine(ctx *Context) {
ctx.breakMultiLine()
2019-10-16 18:19:48 +09:00
}
func (s *Scanner) scanSingleQuote(ctx *Context) (*token.Token, error) {
ctx.addOriginBuf('\'')
srcpos := s.pos()
2019-10-16 18:19:48 +09:00
startIndex := ctx.idx + 1
src := ctx.src
size := len(src)
value := []rune{}
isFirstLineChar := false
isNewLine := false
for idx := startIndex; idx < size; idx++ {
if !isNewLine {
s.progressColumn(ctx, 1)
} else {
isNewLine = false
}
c := src[idx]
2019-10-16 18:19:48 +09:00
ctx.addOriginBuf(c)
if s.isNewLineChar(c) {
notSpaceIdx := -1
for i := len(value) - 1; i >= 0; i-- {
if value[i] == ' ' {
continue
}
notSpaceIdx = i
break
}
if len(value) > notSpaceIdx {
value = value[:notSpaceIdx+1]
}
if isFirstLineChar {
value = append(value, '\n')
} else {
value = append(value, ' ')
}
isFirstLineChar = true
isNewLine = true
s.progressLine(ctx)
if idx+1 < size {
if err := s.validateDocumentSeparatorMarker(ctx, src[idx+1:]); err != nil {
return nil, err
}
}
continue
} else if isFirstLineChar && c == ' ' {
continue
} else if isFirstLineChar && c == '\t' {
if s.lastDelimColumn >= s.column {
return nil, ErrInvalidToken(
token.Invalid(
"tab character cannot be used for indentation in single-quoted text",
string(ctx.obuf), s.pos(),
),
)
}
continue
} else if c != '\'' {
value = append(value, c)
isFirstLineChar = false
continue
} else if idx+1 < len(ctx.src) && ctx.src[idx+1] == '\'' {
// '' handle as ' character
value = append(value, c)
2020-05-29 18:23:08 +09:00
ctx.addOriginBuf(c)
idx++
s.progressColumn(ctx, 1)
continue
}
s.progressColumn(ctx, 1)
return token.SingleQuote(string(value), string(ctx.obuf), srcpos), nil
}
s.progressColumn(ctx, 1)
return nil, ErrInvalidToken(
2024-11-15 00:22:47 +09:00
token.Invalid(
"could not find end character of single-quoted text",
string(ctx.obuf), srcpos,
),
)
}
// hexToInt converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F')
// to its numeric value. The input is not validated: any other rune is
// treated like a decimal digit and yields int(b) - '0'.
func hexToInt(b rune) int {
	switch {
	case 'A' <= b && b <= 'F':
		return int(b-'A') + 10
	case 'a' <= b && b <= 'f':
		return int(b-'a') + 10
	default:
		return int(b - '0')
	}
}
// hexRunesToInt interprets b as a big-endian sequence of hexadecimal digits
// and returns the resulting value. Each digit is converted with hexToInt and
// shifted into place; an empty slice yields 0.
func hexRunesToInt(b []rune) int {
	total := 0
	shift := uint(len(b)) * 4
	for _, digit := range b {
		shift -= 4
		total += hexToInt(digit) << shift
	}
	return total
}
func (s *Scanner) scanDoubleQuote(ctx *Context) (*token.Token, error) {
ctx.addOriginBuf('"')
srcpos := s.pos()
startIndex := ctx.idx + 1
src := ctx.src
size := len(src)
value := []rune{}
isFirstLineChar := false
isNewLine := false
for idx := startIndex; idx < size; idx++ {
if !isNewLine {
s.progressColumn(ctx, 1)
} else {
isNewLine = false
}
c := src[idx]
ctx.addOriginBuf(c)
if s.isNewLineChar(c) {
notSpaceIdx := -1
for i := len(value) - 1; i >= 0; i-- {
if value[i] == ' ' {
continue
}
notSpaceIdx = i
break
}
if len(value) > notSpaceIdx {
value = value[:notSpaceIdx+1]
}
if isFirstLineChar {
value = append(value, '\n')
} else {
value = append(value, ' ')
}
isFirstLineChar = true
isNewLine = true
s.progressLine(ctx)
if idx+1 < size {
if err := s.validateDocumentSeparatorMarker(ctx, src[idx+1:]); err != nil {
return nil, err
}
}
continue
} else if isFirstLineChar && c == ' ' {
continue
} else if isFirstLineChar && c == '\t' {
if s.lastDelimColumn >= s.column {
return nil, ErrInvalidToken(
token.Invalid(
"tab character cannot be used for indentation in double-quoted text",
string(ctx.obuf), s.pos(),
),
)
}
continue
} else if c == '\\' {
isFirstLineChar = false
if idx+1 >= size {
value = append(value, c)
continue
}
nextChar := src[idx+1]
progress := 0
switch nextChar {
case '0':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x00)
case 'a':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x07)
case 'b':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x08)
case 't':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x09)
case 'n':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x0A)
case 'v':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x0B)
case 'f':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x0C)
case 'r':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x0D)
case 'e':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x1B)
case ' ':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x20)
case '"':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x22)
case '/':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x2F)
case '\\':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x5C)
case 'N':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x85)
case '_':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0xA0)
case 'L':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x2028)
case 'P':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, 0x2029)
case 'x':
if idx+3 >= size {
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, nextChar)
} else {
progress = 3
codeNum := hexRunesToInt(src[idx+2 : idx+progress+1])
value = append(value, rune(codeNum))
}
case 'u':
2024-12-02 11:30:11 +09:00
// \u0000 style must have 5 characters at least.
if idx+5 >= size {
2024-12-02 11:30:11 +09:00
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-16 character",
string(ctx.obuf), s.pos(),
),
)
}
2024-12-02 11:30:11 +09:00
progress = 5
codeNum := hexRunesToInt(src[idx+2 : idx+6])
// handle surrogate pairs.
if codeNum >= 0xD800 && codeNum <= 0xDBFF {
high := codeNum
// \u0000\u0000 style must have 11 characters at least.
if idx+11 >= size {
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-16 surrogate pair",
string(ctx.obuf), s.pos(),
),
)
}
if src[idx+6] != '\\' || src[idx+7] != 'u' {
return nil, ErrInvalidToken(
token.Invalid(
"found unexpected character after high surrogate for UTF-16 surrogate pair",
string(ctx.obuf), s.pos(),
),
)
}
low := hexRunesToInt(src[idx+8 : idx+12])
if low < 0xDC00 || low > 0xDFFF {
return nil, ErrInvalidToken(
token.Invalid(
"found unexpected low surrogate after high surrogate",
string(ctx.obuf), s.pos(),
),
)
}
codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
progress += 6
}
value = append(value, rune(codeNum))
case 'U':
2024-12-02 11:30:11 +09:00
// \U00000000 style must have 9 characters at least.
if idx+9 >= size {
2024-12-02 11:30:11 +09:00
return nil, ErrInvalidToken(
token.Invalid(
"not enough length for escaped UTF-32 character",
string(ctx.obuf), s.pos(),
),
)
}
2024-12-02 11:30:11 +09:00
progress = 9
codeNum := hexRunesToInt(src[idx+2 : idx+10])
value = append(value, rune(codeNum))
case '\n':
isFirstLineChar = true
isNewLine = true
ctx.addOriginBuf(nextChar)
s.progressColumn(ctx, 1)
s.progressLine(ctx)
idx++
continue
case '\t':
progress = 1
ctx.addOriginBuf(nextChar)
value = append(value, nextChar)
default:
s.progressColumn(ctx, 1)
return nil, ErrInvalidToken(
token.Invalid(
fmt.Sprintf("found unknown escape character %q", nextChar),
string(ctx.obuf), s.pos(),
),
)
2019-10-16 18:19:48 +09:00
}
idx += progress
s.progressColumn(ctx, progress)
continue
} else if c == '\t' {
var (
foundNotSpaceChar bool
progress int
)
for i := idx + 1; i < size; i++ {
if src[i] == ' ' || src[i] == '\t' {
progress++
continue
}
if s.isNewLineChar(src[i]) {
break
}
foundNotSpaceChar = true
}
if foundNotSpaceChar {
value = append(value, c)
if src[idx+1] != '"' {
s.progressColumn(ctx, 1)
}
} else {
idx += progress
s.progressColumn(ctx, progress)
}
continue
} else if c != '"' {
value = append(value, c)
isFirstLineChar = false
continue
2019-10-16 18:19:48 +09:00
}
s.progressColumn(ctx, 1)
return token.DoubleQuote(string(value), string(ctx.obuf), srcpos), nil
2019-10-16 18:19:48 +09:00
}
s.progressColumn(ctx, 1)
return nil, ErrInvalidToken(
2024-11-15 00:22:47 +09:00
token.Invalid(
"could not find end character of double-quoted text",
string(ctx.obuf), srcpos,
),
)
2019-10-16 18:19:48 +09:00
}
// validateDocumentSeparatorMarker returns an invalid-token error when src
// starts with a document separator marker (see foundDocumentSeparatorMarker),
// and nil otherwise.
func (s *Scanner) validateDocumentSeparatorMarker(ctx *Context, src []rune) error {
	if !s.foundDocumentSeparatorMarker(src) {
		return nil
	}
	invalidTk := token.Invalid("found unexpected document separator", string(ctx.obuf), s.pos())
	return ErrInvalidToken(invalidTk)
}
// foundDocumentSeparatorMarker reports whether src begins with "---" or
// "..." and that marker is either the whole of src or is followed by a
// space, tab or line break.
func (s *Scanner) foundDocumentSeparatorMarker(src []rune) bool {
	if len(src) < 3 {
		return false
	}
	// Only the marker and the character right after it matter.
	head := src
	if len(head) > 4 {
		head = head[:4]
	}
	switch strings.TrimRight(string(head), " \t\n\r") {
	case "---", "...":
		return true
	}
	return false
}
// scanQuote scans a quoted scalar starting at quote character ch. A single
// quote is handled by scanSingleQuote; any other ch is handled by
// scanDoubleQuote. Nothing is scanned (false, nil) when ctx already has
// buffered characters. On success the token is added to ctx, the context
// buffers are cleared and true is returned.
func (s *Scanner) scanQuote(ctx *Context, ch rune) (bool, error) {
	if ctx.existsBuffer() {
		return false, nil
	}
	var (
		tk  *token.Token
		err error
	)
	if ch == '\'' {
		tk, err = s.scanSingleQuote(ctx)
	} else {
		tk, err = s.scanDoubleQuote(ctx)
	}
	if err != nil {
		return false, err
	}
	ctx.addToken(tk)
	ctx.clear()
	return true, nil
}
// scanWhiteSpace handles a space character in the situations where it
// terminates or precedes a token, and reports whether it did so.
//
//   - In multi-line mode it does nothing and returns false.
//   - At the start of a line the space is consumed as indentation.
//   - After a directive the buffered token is emitted and the space consumed.
//   - After an anchor or alias the buffered token is emitted and the
//     anchor/alias flags are cleared; the space itself is not consumed here.
//   - Otherwise it returns false.
func (s *Scanner) scanWhiteSpace(ctx *Context) bool {
	if ctx.isMultiLine() {
		return false
	}
	switch {
	case s.isFirstCharAtLine:
		s.progressColumn(ctx, 1)
		ctx.addOriginBuf(' ')
	case s.isDirective:
		s.addBufferedTokenIfExists(ctx)
		s.progressColumn(ctx, 1)
		ctx.addOriginBuf(' ')
	case s.isAnchor || s.isAlias:
		s.addBufferedTokenIfExists(ctx)
		s.isAnchor = false
		s.isAlias = false
	default:
		return false
	}
	return true
}
2021-03-01 17:32:11 +09:00
// isMergeKey reports whether the current position starts a merge key: exactly
// two '<' characters, optionally followed by spaces, then a ':' that is
// itself followed by a space or a line break.
func (s *Scanner) isMergeKey(ctx *Context) bool {
	if ctx.repeatNum('<') != 2 {
		return false
	}
	src := ctx.src
	last := len(src) - 1
	for i := ctx.idx + 2; i <= last; i++ {
		switch src[i] {
		case ' ':
			// Spaces between "<<" and ':' are skipped.
		case ':':
			if i < last {
				if next := src[i+1]; next == ' ' || s.isNewLineChar(next) {
					return true
				}
			}
		default:
			return false
		}
	}
	return false
}
func (s *Scanner) scanTag(ctx *Context) (bool, error) {
if ctx.existsBuffer() || s.isDirective {
return false, nil
}
2019-10-16 18:19:48 +09:00
ctx.addOriginBuf('!')
s.progress(ctx, 1) // skip '!' character
var progress int
2019-10-16 18:19:48 +09:00
for idx, c := range ctx.src[ctx.idx:] {
progress = idx + 1
2019-10-16 18:19:48 +09:00
switch c {
2024-11-12 22:08:26 +09:00
case ' ':
ctx.addOriginBuf(c)
2019-10-16 18:19:48 +09:00
value := ctx.source(ctx.idx-1, ctx.idx+idx)
ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos()))
2024-11-12 22:08:26 +09:00
s.progressColumn(ctx, len([]rune(value)))
ctx.clear()
return true, nil
case ',':
if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
value := ctx.source(ctx.idx-1, ctx.idx+idx)
ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos()))
s.progressColumn(ctx, len([]rune(value))-1) // progress column before collect-entry for scanning it at scanFlowEntry function.
ctx.clear()
return true, nil
} else {
ctx.addOriginBuf(c)
}
2024-11-12 22:08:26 +09:00
case '\n', '\r':
ctx.addOriginBuf(c)
2024-11-12 22:08:26 +09:00
value := ctx.source(ctx.idx-1, ctx.idx+idx)
ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos()))
s.progressColumn(ctx, len([]rune(value))-1) // progress column before new-line-char for scanning new-line-char at scanNewLine function.
ctx.clear()
return true, nil
case '{', '}':
ctx.addOriginBuf(c)
s.progressColumn(ctx, progress)
invalidTk := token.Invalid(fmt.Sprintf("found invalid tag character %q", c), string(ctx.obuf), s.pos())
return false, ErrInvalidToken(invalidTk)
default:
ctx.addOriginBuf(c)
2019-10-16 18:19:48 +09:00
}
}
s.progressColumn(ctx, progress)
ctx.clear()
return true, nil
2019-10-16 18:19:48 +09:00
}
func (s *Scanner) scanComment(ctx *Context) bool {
2024-12-07 15:15:18 +09:00
if ctx.existsBuffer() {
c := ctx.previousChar()
if c != ' ' && c != '\t' && !s.isNewLineChar(c) {
return false
}
}
s.addBufferedTokenIfExists(ctx)
2019-10-16 18:19:48 +09:00
ctx.addOriginBuf('#')
s.progress(ctx, 1) // skip '#' character
2019-10-16 18:19:48 +09:00
for idx, c := range ctx.src[ctx.idx:] {
ctx.addOriginBuf(c)
2024-12-07 15:15:18 +09:00
if !s.isNewLineChar(c) {
continue
2019-10-16 18:19:48 +09:00
}
2024-12-07 15:15:18 +09:00
if ctx.previousChar() == '\\' {
continue
}
value := ctx.source(ctx.idx, ctx.idx+idx)
progress := len([]rune(value))
ctx.addToken(token.Comment(value, string(ctx.obuf), s.pos()))
s.progressColumn(ctx, progress)
s.progressLine(ctx)
ctx.clear()
return true
2019-10-16 18:19:48 +09:00
}
2023-03-01 16:59:07 +09:00
// document ends with comment.
value := string(ctx.src[ctx.idx:])
ctx.addToken(token.Comment(value, string(ctx.obuf), s.pos()))
progress := len([]rune(value))
s.progressColumn(ctx, progress)
s.progressLine(ctx)
ctx.clear()
return true
2019-10-16 18:19:48 +09:00
}
func (s *Scanner) scanMultiLine(ctx *Context, c rune) error {
state := ctx.getMultiLineState()
2019-11-08 16:48:54 +09:00
ctx.addOriginBuf(c)
2019-10-16 18:19:48 +09:00
if ctx.isEOS() {
if s.isFirstCharAtLine && c == ' ' {
state.addIndent(ctx, s.column)
} else {
ctx.addBuf(c)
}
state.updateIndentColumn(s.column)
if err := state.validateIndentColumn(); err != nil {
2024-11-15 00:22:47 +09:00
invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos())
s.progressColumn(ctx, 1)
2024-11-15 00:22:47 +09:00
return ErrInvalidToken(invalidTk)
}
2019-10-16 18:19:48 +09:00
value := ctx.bufferedSrc()
ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
ctx.clear()
2019-11-08 16:48:54 +09:00
s.progressColumn(ctx, 1)
} else if s.isNewLineChar(c) {
ctx.addBuf(c)
state.updateSpaceOnlyIndentColumn(s.column - 1)
state.updateNewLineState()
2019-10-16 18:19:48 +09:00
s.progressLine(ctx)
if ctx.next() {
if s.foundDocumentSeparatorMarker(ctx.src[ctx.idx:]) {
value := ctx.bufferedSrc()
ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
ctx.clear()
s.breakMultiLine(ctx)
}
}
2019-10-16 18:19:48 +09:00
} else if s.isFirstCharAtLine && c == ' ' {
state.addIndent(ctx, s.column)
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
} else if s.isFirstCharAtLine && c == '\t' && state.isIndentColumn(s.column) {
err := ErrInvalidToken(
2024-11-15 00:22:47 +09:00
token.Invalid(
"found a tab character where an indentation space is expected",
string(ctx.obuf), s.pos(),
),
)
s.progressColumn(ctx, 1)
return err
} else if c == '\t' && !state.isIndentColumn(s.column) {
ctx.addBufWithTab(c)
s.progressColumn(ctx, 1)
2019-10-16 18:19:48 +09:00
} else {
if err := state.validateIndentAfterSpaceOnly(s.column); err != nil {
invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos())
s.progressColumn(ctx, 1)
return ErrInvalidToken(invalidTk)
}
state.updateIndentColumn(s.column)
if err := state.validateIndentColumn(); err != nil {
2024-11-15 00:22:47 +09:00
invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos())
s.progressColumn(ctx, 1)
2024-11-15 00:22:47 +09:00
return ErrInvalidToken(invalidTk)
2019-11-08 01:04:29 +09:00
}
if col := state.lastDelimColumn(); col > 0 {
s.lastDelimColumn = col
}
state.updateNewLineInFolded(ctx, s.column)
ctx.addBufWithTab(c)
2019-10-16 18:19:48 +09:00
s.progressColumn(ctx, 1)
}
return nil
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
func (s *Scanner) scanNewLine(ctx *Context, c rune) {
if len(ctx.buf) > 0 && s.savedPos == nil {
2024-11-12 16:44:43 +09:00
bufLen := len(ctx.bufferedSrc())
2024-10-28 18:15:16 +09:00
s.savedPos = s.pos()
2024-11-12 16:44:43 +09:00
s.savedPos.Column -= bufLen
s.savedPos.Offset -= bufLen
2024-10-28 18:15:16 +09:00
}
// if the following case, origin buffer has unnecessary two spaces.
// So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too.
// ---
// a:[space][space]
// b: c
ctx.removeRightSpaceFromBuf()
2024-10-28 18:15:16 +09:00
// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
// > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
// > Outside scalar content, YAML allows any line break to be used to terminate lines.
// > -- https://yaml.org/spec/1.2/spec.html
if c == '\r' && ctx.nextChar() == '\n' {
ctx.addOriginBuf('\r')
s.progress(ctx, 1)
2024-11-12 16:44:43 +09:00
s.offset++
2024-10-28 18:15:16 +09:00
c = '\n'
}
if ctx.isEOS() {
s.addBufferedTokenIfExists(ctx)
} else if s.isAnchor || s.isAlias || s.isDirective {
2024-10-28 18:15:16 +09:00
s.addBufferedTokenIfExists(ctx)
}
if ctx.existsBuffer() && s.isFirstCharAtLine {
if ctx.buf[len(ctx.buf)-1] == ' ' {
ctx.buf[len(ctx.buf)-1] = '\n'
} else {
ctx.buf = append(ctx.buf, '\n')
}
} else {
ctx.addBuf(' ')
}
2024-10-28 18:15:16 +09:00
ctx.addOriginBuf(c)
s.progressLine(ctx)
}
// isFlowMode reports whether at least one flow sequence or flow mapping is
// currently open.
func (s *Scanner) isFlowMode() bool {
	return s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0
}
2024-10-28 18:15:16 +09:00
func (s *Scanner) scanFlowMapStart(ctx *Context) bool {
if ctx.existsBuffer() && !s.isFlowMode() {
2024-10-28 18:15:16 +09:00
return false
}
s.addBufferedTokenIfExists(ctx)
2024-10-28 18:15:16 +09:00
ctx.addOriginBuf('{')
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
s.startedFlowMapNum++
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanFlowMapEnd(ctx *Context) bool {
if s.startedFlowMapNum <= 0 {
return false
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('}')
ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos()))
s.startedFlowMapNum--
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanFlowArrayStart(ctx *Context) bool {
if ctx.existsBuffer() && !s.isFlowMode() {
2024-10-28 18:15:16 +09:00
return false
}
s.addBufferedTokenIfExists(ctx)
2024-10-28 18:15:16 +09:00
ctx.addOriginBuf('[')
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum++
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanFlowArrayEnd(ctx *Context) bool {
if ctx.existsBuffer() && s.startedFlowSequenceNum <= 0 {
2024-10-28 18:15:16 +09:00
return false
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(']')
ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum--
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanFlowEntry(ctx *Context, c rune) bool {
if s.startedFlowSequenceNum <= 0 && s.startedFlowMapNum <= 0 {
return false
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanMapDelim(ctx *Context) (bool, error) {
2024-10-28 18:15:16 +09:00
nc := ctx.nextChar()
if s.isDirective || s.isAnchor || s.isAlias {
return false, nil
}
if s.startedFlowMapNum <= 0 && nc != ' ' && nc != '\t' && !s.isNewLineChar(nc) && !ctx.isNextEOS() {
return false, nil
}
2024-12-09 22:51:20 +09:00
if s.startedFlowMapNum > 0 && nc == '/' {
// like http://
return false, nil
}
2025-01-18 13:29:55 +09:00
if s.startedFlowMapNum > 0 {
tk := ctx.lastToken()
if tk != nil && tk.Type == token.MappingValueType {
return false, nil
}
}
if strings.HasPrefix(strings.TrimPrefix(string(ctx.obuf), " "), "\t") && !strings.HasPrefix(string(ctx.buf), "\t") {
invalidTk := token.Invalid("tab character cannot use as a map key directly", string(ctx.obuf), s.pos())
s.progressColumn(ctx, 1)
return false, ErrInvalidToken(invalidTk)
2024-10-28 18:15:16 +09:00
}
// mapping value
tk := s.bufferedToken(ctx)
if tk != nil {
s.lastDelimColumn = tk.Position.Column
ctx.addToken(tk)
} else if tk := ctx.lastToken(); tk != nil {
// If the map key is quote, the buffer does not exist because it has already been cut into tokens.
// Therefore, we need to check the last token.
if tk.Indicator == token.QuotedScalarIndicator {
s.lastDelimColumn = tk.Position.Column
}
}
ctx.addToken(token.MappingValue(s.pos()))
s.progressColumn(ctx, 1)
ctx.clear()
return true, nil
2024-10-28 18:15:16 +09:00
}
func (s *Scanner) scanDocumentStart(ctx *Context) bool {
if s.indentNum != 0 {
return false
}
if s.column != 1 {
return false
}
if ctx.repeatNum('-') != 3 {
return false
}
if ctx.size > ctx.idx+3 {
c := ctx.src[ctx.idx+3]
if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
return false
}
}
2024-10-28 18:15:16 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos()))
s.progressColumn(ctx, 3)
ctx.clear()
s.clearState()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanDocumentEnd(ctx *Context) bool {
if s.indentNum != 0 {
return false
}
if s.column != 1 {
return false
}
if ctx.repeatNum('.') != 3 {
return false
}
s.addBufferedTokenIfExists(ctx)
2024-10-28 18:15:16 +09:00
ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos()))
s.progressColumn(ctx, 3)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanMergeKey(ctx *Context) bool {
if !s.isMergeKey(ctx) {
return false
}
s.lastDelimColumn = s.column
ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos()))
s.progressColumn(ctx, 2)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanRawFoldedChar(ctx *Context) bool {
if !ctx.existsBuffer() {
return false
}
if !s.isChangedToIndentStateUp() {
return false
}
ctx.setRawFolded(s.column)
2024-10-28 18:15:16 +09:00
ctx.addBuf('-')
ctx.addOriginBuf('-')
s.progressColumn(ctx, 1)
return true
}
func (s *Scanner) scanSequence(ctx *Context) (bool, error) {
2024-10-28 18:15:16 +09:00
if ctx.existsBuffer() {
return false, nil
2024-10-28 18:15:16 +09:00
}
nc := ctx.nextChar()
if nc != 0 && nc != ' ' && nc != '\t' && !s.isNewLineChar(nc) {
return false, nil
}
if strings.HasPrefix(strings.TrimPrefix(string(ctx.obuf), " "), "\t") {
invalidTk := token.Invalid("tab character cannot use as a sequence delimiter", string(ctx.obuf), s.pos())
s.progressColumn(ctx, 1)
return false, ErrInvalidToken(invalidTk)
2024-10-28 18:15:16 +09:00
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('-')
tk := token.SequenceEntry(string(ctx.obuf), s.pos())
s.lastDelimColumn = tk.Position.Column
ctx.addToken(tk)
s.progressColumn(ctx, 1)
ctx.clear()
return true, nil
2024-10-28 18:15:16 +09:00
}
func (s *Scanner) scanMultiLineHeader(ctx *Context) (bool, error) {
2024-10-28 18:15:16 +09:00
if ctx.existsBuffer() {
return false, nil
}
if err := s.scanMultiLineHeaderOption(ctx); err != nil {
2024-10-28 18:15:16 +09:00
return false, err
}
s.progressLine(ctx)
return true, nil
}
// validateMultiLineHeaderOption checks the option text that follows a block
// scalar indicator ('|' or '>').
//
// One leading and one trailing chomping indicator ('-' or '+') are stripped
// first. What remains must be empty or a single digit from 1 to 9 (the
// indentation indicator). Anything else — "0", a multi-digit number, a
// signed or zero-padded number such as "--5", "+-5", "++5", "00" or "05", or
// non-numeric text — is rejected with an "invalid header option" error that
// quotes the original option.
func (s *Scanner) validateMultiLineHeaderOption(opt string) error {
	if len(opt) == 0 {
		return nil
	}
	orgOpt := opt
	opt = strings.TrimPrefix(opt, "-")
	opt = strings.TrimPrefix(opt, "+")
	opt = strings.TrimSuffix(opt, "-")
	opt = strings.TrimSuffix(opt, "+")
	if len(opt) == 0 {
		return nil
	}
	i, err := strconv.ParseInt(opt, 10, 64)
	// ParseInt accepts a leading sign and leading zeros, so the length is
	// checked as well: the indentation indicator is exactly one digit 1-9.
	if err != nil || len(opt) != 1 || i < 1 || i > 9 {
		return fmt.Errorf("invalid header option: %q", orgOpt)
	}
	return nil
}
func (s *Scanner) scanMultiLineHeaderOption(ctx *Context) error {
2019-10-16 18:19:48 +09:00
header := ctx.currentChar()
ctx.addOriginBuf(header)
s.progress(ctx, 1) // skip '|' or '>' character
var progress int
2019-10-16 18:19:48 +09:00
for idx, c := range ctx.src[ctx.idx:] {
progress = idx
2019-10-16 18:19:48 +09:00
ctx.addOriginBuf(c)
if s.isNewLineChar(c) {
break
}
}
value := strings.TrimRight(ctx.source(ctx.idx, ctx.idx+progress), " ")
commentValueIndex := strings.Index(value, "#")
opt := value
if commentValueIndex > 0 {
opt = value[:commentValueIndex]
}
opt = strings.TrimRightFunc(opt, func(r rune) bool {
return r == ' ' || r == '\t'
})
if len(opt) != 0 {
if err := s.validateMultiLineHeaderOption(opt); err != nil {
invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos())
s.progressColumn(ctx, progress)
return ErrInvalidToken(invalidTk)
2019-10-16 18:19:48 +09:00
}
}
if s.column == 1 {
s.lastDelimColumn = 1
}
commentIndex := strings.Index(string(ctx.obuf), "#")
headerBuf := string(ctx.obuf)
if commentIndex > 0 {
headerBuf = headerBuf[:commentIndex]
}
switch header {
case '|':
ctx.addToken(token.Literal("|"+opt, headerBuf, s.pos()))
ctx.setLiteral(s.lastDelimColumn, opt)
case '>':
ctx.addToken(token.Folded(">"+opt, headerBuf, s.pos()))
ctx.setFolded(s.lastDelimColumn, opt)
}
if commentIndex > 0 {
comment := string(value[commentValueIndex+1:])
s.offset += len(headerBuf)
s.column += len(headerBuf)
ctx.addToken(token.Comment(comment, string(ctx.obuf[len(headerBuf):]), s.pos()))
}
s.indentState = IndentStateKeep
ctx.resetBuffer()
s.progressColumn(ctx, progress)
return nil
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
func (s *Scanner) scanMapKey(ctx *Context) bool {
if ctx.existsBuffer() {
return false
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
nc := ctx.nextChar()
if nc != ' ' && nc != '\t' {
2024-10-28 18:15:16 +09:00
return false
}
tk := token.MappingKey(s.pos())
s.lastDelimColumn = tk.Position.Column
ctx.addToken(tk)
2024-10-28 18:15:16 +09:00
s.progressColumn(ctx, 1)
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanDirective(ctx *Context) bool {
if ctx.existsBuffer() {
return false
}
if s.indentNum != 0 {
return false
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('%')
ctx.addToken(token.Directive(string(ctx.obuf), s.pos()))
2024-10-28 18:15:16 +09:00
s.progressColumn(ctx, 1)
ctx.clear()
s.isDirective = true
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanAnchor(ctx *Context) bool {
if ctx.existsBuffer() {
return false
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('&')
ctx.addToken(token.Anchor(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
s.isAnchor = true
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
}
func (s *Scanner) scanAlias(ctx *Context) bool {
if ctx.existsBuffer() {
return false
}
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('*')
ctx.addToken(token.Alias(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
s.isAlias = true
ctx.clear()
2024-10-28 18:15:16 +09:00
return true
2019-10-16 18:19:48 +09:00
}
2024-11-09 20:43:51 +09:00
func (s *Scanner) scanReservedChar(ctx *Context, c rune) error {
if ctx.existsBuffer() {
return nil
}
ctx.addBuf(c)
ctx.addOriginBuf(c)
2024-11-15 00:22:47 +09:00
err := ErrInvalidToken(
token.Invalid(
fmt.Sprintf("%q is a reserved character", c),
string(ctx.obuf), s.pos(),
),
)
2024-11-09 20:43:51 +09:00
s.progressColumn(ctx, 1)
ctx.clear()
return err
}
func (s *Scanner) scanTab(ctx *Context, c rune) error {
if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
// tabs character is allowed in flow mode.
return nil
}
if !s.isFirstCharAtLine {
return nil
}
ctx.addBuf(c)
ctx.addOriginBuf(c)
2024-11-15 00:22:47 +09:00
err := ErrInvalidToken(
token.Invalid("found character '\t' that cannot start any token",
string(ctx.obuf), s.pos(),
),
)
s.progressColumn(ctx, 1)
ctx.clear()
return err
}
func (s *Scanner) scan(ctx *Context) error {
2019-10-16 18:19:48 +09:00
for ctx.next() {
c := ctx.currentChar()
2024-10-28 18:15:16 +09:00
// First, change the IndentState.
// If the target character is the first character in a line, IndentState is Up/Down/Equal state.
// The second and subsequent letters are Keep.
2019-11-07 23:45:39 +09:00
s.updateIndent(ctx, c)
2024-10-28 18:15:16 +09:00
// If IndentState is down, tokens are split, so the buffer accumulated until that point needs to be cutted as a token.
if s.isChangedToIndentStateDown() {
s.addBufferedTokenIfExists(ctx)
}
if ctx.isMultiLine() {
if s.isChangedToIndentStateDown() {
if tk := ctx.lastToken(); tk != nil {
// If literal/folded content is empty, no string token is added.
// Therefore, add an empty string token.
2024-11-02 19:29:48 +09:00
// But if literal/folded token column is 1, it is invalid at down state.
if tk.Position.Column == 1 {
return ErrInvalidToken(
2024-11-15 00:22:47 +09:00
token.Invalid(
"could not find multi-line content",
2024-11-15 00:22:47 +09:00
string(ctx.obuf), s.pos(),
),
2024-11-02 19:29:48 +09:00
)
}
if tk.Type != token.StringType {
ctx.addToken(token.String("", "", s.pos()))
}
}
s.breakMultiLine(ctx)
2019-11-07 23:45:39 +09:00
} else {
if err := s.scanMultiLine(ctx, c); err != nil {
return err
}
2019-11-07 23:45:39 +09:00
continue
}
2019-10-16 18:19:48 +09:00
}
switch c {
case '{':
2024-10-28 18:15:16 +09:00
if s.scanFlowMapStart(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '}':
2024-10-28 18:15:16 +09:00
if s.scanFlowMapEnd(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '.':
2024-10-28 18:15:16 +09:00
if s.scanDocumentEnd(ctx) {
continue
2019-10-16 18:19:48 +09:00
}
case '<':
2024-10-28 18:15:16 +09:00
if s.scanMergeKey(ctx) {
continue
2019-10-16 18:19:48 +09:00
}
case '-':
2024-10-28 18:15:16 +09:00
if s.scanDocumentStart(ctx) {
continue
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
if s.scanRawFoldedChar(ctx) {
2019-10-16 18:19:48 +09:00
continue
}
scanned, err := s.scanSequence(ctx)
if err != nil {
return err
}
if scanned {
continue
}
2019-10-16 18:19:48 +09:00
case '[':
2024-10-28 18:15:16 +09:00
if s.scanFlowArrayStart(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ']':
2024-10-28 18:15:16 +09:00
if s.scanFlowArrayEnd(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ',':
2024-10-28 18:15:16 +09:00
if s.scanFlowEntry(ctx, c) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ':':
scanned, err := s.scanMapDelim(ctx)
if err != nil {
return err
}
if scanned {
continue
2019-10-16 18:19:48 +09:00
}
case '|', '>':
scanned, err := s.scanMultiLineHeader(ctx)
2024-10-28 18:15:16 +09:00
if err != nil {
return err
2024-10-28 18:15:16 +09:00
}
if scanned {
2019-10-16 18:19:48 +09:00
continue
}
case '!':
scanned, err := s.scanTag(ctx)
if err != nil {
return err
}
if scanned {
continue
2019-10-16 18:19:48 +09:00
}
case '%':
2024-10-28 18:15:16 +09:00
if s.scanDirective(ctx) {
continue
2019-10-16 18:19:48 +09:00
}
case '?':
2024-10-28 18:15:16 +09:00
if s.scanMapKey(ctx) {
continue
2019-10-16 18:19:48 +09:00
}
case '&':
2024-10-28 18:15:16 +09:00
if s.scanAnchor(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '*':
2024-10-28 18:15:16 +09:00
if s.scanAlias(ctx) {
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '#':
if s.scanComment(ctx) {
continue
}
2019-10-16 18:19:48 +09:00
case '\'', '"':
scanned, err := s.scanQuote(ctx, c)
if err != nil {
return err
}
if scanned {
continue
2019-11-07 18:08:12 +09:00
}
2019-11-09 17:20:39 +09:00
case '\r', '\n':
2019-10-16 18:19:48 +09:00
s.scanNewLine(ctx, c)
continue
case ' ':
if s.scanWhiteSpace(ctx) {
2019-10-16 18:19:48 +09:00
continue
}
2024-11-09 20:43:51 +09:00
case '@', '`':
if err := s.scanReservedChar(ctx, c); err != nil {
return err
}
case '\t':
if ctx.existsBuffer() && s.lastDelimColumn == 0 {
// tab indent for plain text (yaml-test-suite's spec-example-7-12-plain-lines).
s.indentNum++
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if s.lastDelimColumn < s.column {
s.indentNum++
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if err := s.scanTab(ctx, c); err != nil {
return err
}
2019-10-16 18:19:48 +09:00
}
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
}
s.addBufferedTokenIfExists(ctx)
return nil
2019-10-16 18:19:48 +09:00
}
2019-10-21 12:53:30 +09:00
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
func (s *Scanner) Init(text string) {
src := []rune(text)
2019-10-16 18:19:48 +09:00
s.source = src
s.sourcePos = 0
s.sourceSize = len(src)
s.line = 1
s.column = 1
s.offset = 1
s.isFirstCharAtLine = true
s.clearState()
}
func (s *Scanner) clearState() {
s.prevLineIndentNum = 0
s.lastDelimColumn = 0
2019-10-16 18:19:48 +09:00
s.indentLevel = 0
s.indentNum = 0
}
2019-10-21 12:53:30 +09:00
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
2019-10-16 18:19:48 +09:00
func (s *Scanner) Scan() (token.Tokens, error) {
if s.sourcePos >= s.sourceSize {
return nil, io.EOF
}
ctx := newContext(s.source[s.sourcePos:])
defer ctx.release()
var tokens token.Tokens
err := s.scan(ctx)
tokens = append(tokens, ctx.tokens...)
if err != nil {
var invalidTokenErr *InvalidTokenError
if errors.As(err, &invalidTokenErr) {
tokens = append(tokens, invalidTokenErr.Token)
}
return tokens, err
}
return tokens, nil
2019-10-16 18:19:48 +09:00
}