2019-10-16 18:19:48 +09:00
package scanner
import (
2024-10-28 21:24:15 +09:00
"errors"
2024-10-29 20:00:48 +09:00
"fmt"
2019-10-16 18:19:48 +09:00
"io"
2024-11-03 02:11:50 +09:00
"strconv"
2019-10-16 18:19:48 +09:00
"strings"
2024-07-16 12:30:20 +02:00
"github.com/goccy/go-yaml/token"
2019-10-16 18:19:48 +09:00
)
2019-10-21 12:53:30 +09:00
// IndentState classifies the indentation at the scanner's current position.
type IndentState int

const (
	// IndentStateEqual means the indent is equal to the previous one.
	IndentStateEqual IndentState = iota
	// IndentStateUp means the indent is deeper than the previous one.
	IndentStateUp
	// IndentStateDown means the indent is shallower than the previous one.
	IndentStateDown
	// IndentStateKeep means the current character is not an indent
	// character, i.e. it is not the first character of its line.
	IndentStateKeep
)
2019-10-21 12:53:30 +09:00
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
2019-10-16 18:19:48 +09:00
type Scanner struct {
2024-10-28 15:59:31 +09:00
source [ ] rune
sourcePos int
sourceSize int
// line number. This number starts from 1.
line int
// column number. This number starts from 1.
column int
// offset represents the offset from the beginning of the source.
offset int
// lastDelimColumn is the last column needed to compare indent is retained.
lastDelimColumn int
// indentNum indicates the number of spaces used for indentation.
indentNum int
// prevLineIndentNum indicates the number of spaces used for indentation at previous line.
prevLineIndentNum int
// indentLevel indicates the level of indent depth. This value does not match the column value.
2019-11-07 13:00:00 +09:00
indentLevel int
isFirstCharAtLine bool
isAnchor bool
2024-11-26 22:41:11 +09:00
isAlias bool
2024-11-26 11:05:45 +09:00
isDirective bool
2019-11-07 13:00:00 +09:00
startedFlowSequenceNum int
startedFlowMapNum int
indentState IndentState
savedPos * token . Position
2019-10-16 18:19:48 +09:00
}
// pos returns a freshly allocated snapshot of the scanner's current line,
// column, offset and indentation.
func (s *Scanner) pos() *token.Position {
	p := new(token.Position)
	p.Line = s.line
	p.Column = s.column
	p.Offset = s.offset
	p.IndentNum = s.indentNum
	p.IndentLevel = s.indentLevel
	return p
}
func ( s * Scanner ) bufferedToken ( ctx * Context ) * token . Token {
if s . savedPos != nil {
tk := ctx . bufferedToken ( s . savedPos )
s . savedPos = nil
return tk
}
2022-12-19 16:04:56 +10:30
line := s . line
column := s . column - len ( ctx . buf )
level := s . indentLevel
2024-12-14 20:08:27 +09:00
if ctx . isMultiLine ( ) {
2022-12-19 16:04:56 +10:30
line -= s . newLineCount ( ctx . buf )
column = strings . Index ( string ( ctx . obuf ) , string ( ctx . buf ) ) + 1
// Since we are in a literal, folded or raw folded
// we can use the indent level from the last token.
last := ctx . lastToken ( )
if last != nil { // The last token should never be nil here.
level = last . Position . IndentLevel + 1
}
}
2019-10-16 18:19:48 +09:00
return ctx . bufferedToken ( & token . Position {
2022-12-19 16:04:56 +10:30
Line : line ,
Column : column ,
Offset : s . offset - len ( ctx . buf ) ,
2019-10-16 18:19:48 +09:00
IndentNum : s . indentNum ,
2022-12-19 16:04:56 +10:30
IndentLevel : level ,
2019-10-16 18:19:48 +09:00
} )
}
func ( s * Scanner ) progressColumn ( ctx * Context , num int ) {
s . column += num
s . offset += num
2024-10-29 20:00:48 +09:00
s . progress ( ctx , num )
2019-10-16 18:19:48 +09:00
}
func ( s * Scanner ) progressLine ( ctx * Context ) {
2024-10-28 15:59:31 +09:00
s . prevLineIndentNum = s . indentNum
2019-10-16 18:19:48 +09:00
s . column = 1
s . line ++
s . offset ++
s . indentNum = 0
s . isFirstCharAtLine = true
s . isAnchor = false
2024-11-26 22:41:11 +09:00
s . isAlias = false
2024-11-26 11:05:45 +09:00
s . isDirective = false
2024-10-29 20:00:48 +09:00
s . progress ( ctx , 1 )
}
func ( s * Scanner ) progress ( ctx * Context , num int ) {
ctx . progress ( num )
s . sourcePos += num
2019-10-16 18:19:48 +09:00
}
2019-12-22 11:59:49 +09:00
// isNewLineChar reports whether c is a line feed or a carriage return.
func (s *Scanner) isNewLineChar(c rune) bool {
	return c == '\n' || c == '\r'
}
2019-12-22 17:28:30 +09:00
// newLineCount returns the number of line breaks in src. A "\r\n" pair
// counts as a single break; a lone '\r' or '\n' counts as one each.
func (s *Scanner) newLineCount(src []rune) int {
	var count int
	for i, n := 0, len(src); i < n; i++ {
		if src[i] == '\n' {
			count++
			continue
		}
		if src[i] != '\r' {
			continue
		}
		count++
		// Swallow the LF of a CRLF pair so it is not counted twice.
		if i+1 < n && src[i+1] == '\n' {
			i++
		}
	}
	return count
}
2024-10-28 15:59:31 +09:00
func ( s * Scanner ) updateIndentLevel ( ) {
if s . prevLineIndentNum < s . indentNum {
s . indentLevel ++
} else if s . prevLineIndentNum > s . indentNum {
if s . indentLevel > 0 {
s . indentLevel --
2019-10-21 14:54:26 +09:00
}
2019-10-16 18:19:48 +09:00
}
2024-10-28 15:59:31 +09:00
}
func ( s * Scanner ) updateIndentState ( ctx * Context ) {
2024-12-14 20:08:27 +09:00
if s . lastDelimColumn == 0 {
return
2022-01-11 20:37:31 +09:00
}
2024-12-14 20:08:27 +09:00
if s . lastDelimColumn < s . column {
s . indentState = IndentStateUp
} else {
// If lastDelimColumn and s.column are the same,
// treat as Down state since it is the same column as delimiter.
s . indentState = IndentStateDown
2024-10-28 17:07:44 +09:00
}
}
2022-01-11 20:37:31 +09:00
func ( s * Scanner ) updateIndent ( ctx * Context , c rune ) {
2024-11-09 13:00:12 +09:00
if s . isFirstCharAtLine && s . isNewLineChar ( c ) {
2022-01-11 20:37:31 +09:00
return
}
if s . isFirstCharAtLine && c == ' ' {
s . indentNum ++
return
}
2024-11-13 16:21:12 +09:00
if s . isFirstCharAtLine && c == '\t' {
// found tab indent.
// In this case, scanTab returns error.
return
}
2022-01-11 20:37:31 +09:00
if ! s . isFirstCharAtLine {
s . indentState = IndentStateKeep
return
2019-10-21 15:58:46 +09:00
}
2024-10-28 17:07:44 +09:00
s . updateIndentLevel ( )
2022-01-11 20:37:31 +09:00
s . updateIndentState ( ctx )
2019-11-07 23:45:39 +09:00
s . isFirstCharAtLine = false
2019-10-16 18:19:48 +09:00
}
// isChangedToIndentStateDown reports whether the most recent indent
// classification was IndentStateDown.
func (s *Scanner) isChangedToIndentStateDown() bool {
	down := s.indentState == IndentStateDown
	return down
}
// isChangedToIndentStateUp reports whether the most recent indent
// classification was IndentStateUp.
func (s *Scanner) isChangedToIndentStateUp() bool {
	up := s.indentState == IndentStateUp
	return up
}
// addBufferedTokenIfExists builds a token from the context's buffer and
// hands it to ctx.addToken.
// NOTE(review): going by the name, ctx.addToken is presumably a no-op for
// a nil token (empty buffer) — confirm against Context.
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	tk := s.bufferedToken(ctx)
	ctx.addToken(tk)
}
2024-12-14 20:08:27 +09:00
func ( s * Scanner ) breakMultiLine ( ctx * Context ) {
ctx . breakMultiLine ( )
2019-10-16 18:19:48 +09:00
}
2024-10-31 22:54:26 +09:00
func ( s * Scanner ) scanSingleQuote ( ctx * Context ) ( * token . Token , error ) {
2020-05-29 18:09:51 +09:00
ctx . addOriginBuf ( '\'' )
2021-02-18 21:18:38 +00:00
srcpos := s . pos ( )
2019-10-16 18:19:48 +09:00
startIndex := ctx . idx + 1
2020-05-29 18:09:51 +09:00
src := ctx . src
size := len ( src )
value := [ ] rune { }
2020-06-15 17:30:27 +09:00
isFirstLineChar := false
2021-03-01 15:09:40 +00:00
isNewLine := false
2024-10-29 20:00:48 +09:00
2020-05-29 18:09:51 +09:00
for idx := startIndex ; idx < size ; idx ++ {
2021-03-01 15:09:40 +00:00
if ! isNewLine {
s . progressColumn ( ctx , 1 )
} else {
isNewLine = false
}
2020-05-29 18:09:51 +09:00
c := src [ idx ]
2019-10-16 18:19:48 +09:00
ctx . addOriginBuf ( c )
2020-06-15 17:30:27 +09:00
if s . isNewLineChar ( c ) {
2024-11-17 23:58:35 +09:00
notSpaceIdx := - 1
for i := len ( value ) - 1 ; i >= 0 ; i -- {
if value [ i ] == ' ' {
continue
}
notSpaceIdx = i
break
}
if len ( value ) > notSpaceIdx {
value = value [ : notSpaceIdx + 1 ]
}
if isFirstLineChar {
value = append ( value , '\n' )
} else {
value = append ( value , ' ' )
}
2020-06-15 17:30:27 +09:00
isFirstLineChar = true
2021-03-01 15:09:40 +00:00
isNewLine = true
s . progressLine ( ctx )
2024-11-30 10:16:17 +09:00
if idx + 1 < size {
if err := s . validateDocumentSeparatorMarker ( ctx , src [ idx + 1 : ] ) ; err != nil {
return nil , err
}
}
2020-06-15 17:30:27 +09:00
continue
2024-12-15 01:29:04 +09:00
} else if isFirstLineChar && c == ' ' {
continue
} else if isFirstLineChar && c == '\t' {
if s . lastDelimColumn >= s . column {
return nil , ErrInvalidToken (
token . Invalid (
"tab character cannot be used for indentation in single-quoted text" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
}
2020-06-15 17:30:27 +09:00
continue
} else if c != '\'' {
2020-05-29 18:09:51 +09:00
value = append ( value , c )
2020-06-15 17:30:27 +09:00
isFirstLineChar = false
2020-05-29 18:09:51 +09:00
continue
2024-11-30 10:16:17 +09:00
} else if idx + 1 < len ( ctx . src ) && ctx . src [ idx + 1 ] == '\'' {
2020-05-29 18:09:51 +09:00
// '' handle as ' character
value = append ( value , c )
2020-05-29 18:23:08 +09:00
ctx . addOriginBuf ( c )
2020-05-29 18:09:51 +09:00
idx ++
2024-10-29 20:00:48 +09:00
s . progressColumn ( ctx , 1 )
2020-05-29 18:09:51 +09:00
continue
}
2021-03-01 15:09:40 +00:00
s . progressColumn ( ctx , 1 )
2024-10-31 22:54:26 +09:00
return token . SingleQuote ( string ( value ) , string ( ctx . obuf ) , srcpos ) , nil
2020-05-29 18:09:51 +09:00
}
2024-10-31 22:54:26 +09:00
s . progressColumn ( ctx , 1 )
return nil , ErrInvalidToken (
2024-11-15 00:22:47 +09:00
token . Invalid (
"could not find end character of single-quoted text" ,
string ( ctx . obuf ) , srcpos ,
) ,
2024-10-31 22:54:26 +09:00
)
2020-05-29 18:09:51 +09:00
}
2020-06-20 14:10:17 +09:00
// hexToInt converts a single hexadecimal digit to its numeric value.
// Upper- and lower-case letters A-F are accepted. Any other rune is
// treated as a decimal digit, i.e. the result is its distance from '0',
// so callers are responsible for passing only valid hex digits.
func hexToInt(b rune) int {
	switch {
	case 'A' <= b && b <= 'F':
		return int(b) - 'A' + 10
	case 'a' <= b && b <= 'f':
		return int(b) - 'a' + 10
	default:
		return int(b) - '0'
	}
}
// hexRunesToInt interprets b as a big-endian sequence of hexadecimal
// digits and returns the resulting number. An empty slice yields 0.
// Digits are decoded with hexToInt and are not validated.
func hexRunesToInt(b []rune) int {
	var total int
	for _, digit := range b {
		// Make room for the next nibble, then add it.
		total = total<<4 + hexToInt(digit)
	}
	return total
}
2024-10-31 22:54:26 +09:00
func ( s * Scanner ) scanDoubleQuote ( ctx * Context ) ( * token . Token , error ) {
2020-05-29 18:09:51 +09:00
ctx . addOriginBuf ( '"' )
2021-02-18 21:18:38 +00:00
srcpos := s . pos ( )
2020-05-29 18:09:51 +09:00
startIndex := ctx . idx + 1
src := ctx . src
size := len ( src )
value := [ ] rune { }
2020-06-15 17:30:27 +09:00
isFirstLineChar := false
2021-03-01 15:09:40 +00:00
isNewLine := false
2024-10-29 20:00:48 +09:00
2020-05-29 18:09:51 +09:00
for idx := startIndex ; idx < size ; idx ++ {
2021-03-01 15:09:40 +00:00
if ! isNewLine {
s . progressColumn ( ctx , 1 )
} else {
isNewLine = false
}
2020-05-29 18:09:51 +09:00
c := src [ idx ]
ctx . addOriginBuf ( c )
2020-06-15 17:30:27 +09:00
if s . isNewLineChar ( c ) {
2024-11-17 23:58:35 +09:00
notSpaceIdx := - 1
for i := len ( value ) - 1 ; i >= 0 ; i -- {
if value [ i ] == ' ' {
continue
2024-11-12 00:09:28 +09:00
}
2024-11-17 23:58:35 +09:00
notSpaceIdx = i
break
}
if len ( value ) > notSpaceIdx {
value = value [ : notSpaceIdx + 1 ]
}
if isFirstLineChar {
value = append ( value , '\n' )
2024-11-12 00:09:28 +09:00
} else {
value = append ( value , ' ' )
}
2020-06-15 17:30:27 +09:00
isFirstLineChar = true
2021-03-01 15:09:40 +00:00
isNewLine = true
s . progressLine ( ctx )
2024-11-30 10:16:17 +09:00
if idx + 1 < size {
if err := s . validateDocumentSeparatorMarker ( ctx , src [ idx + 1 : ] ) ; err != nil {
return nil , err
}
}
2020-06-15 17:30:27 +09:00
continue
2024-12-15 01:29:04 +09:00
} else if isFirstLineChar && c == ' ' {
continue
} else if isFirstLineChar && c == '\t' {
if s . lastDelimColumn >= s . column {
return nil , ErrInvalidToken (
token . Invalid (
"tab character cannot be used for indentation in double-quoted text" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
}
2020-06-15 17:30:27 +09:00
continue
} else if c == '\\' {
2020-06-20 14:10:17 +09:00
isFirstLineChar = false
2024-07-16 12:30:20 +02:00
if idx + 1 >= size {
value = append ( value , c )
continue
}
nextChar := src [ idx + 1 ]
progress := 0
switch nextChar {
2024-12-01 21:26:13 +09:00
case '0' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x00 )
case 'a' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x07 )
case 'b' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x08 )
case 't' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , 0x09 )
2024-07-16 12:30:20 +02:00
case 'n' :
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x0A )
case 'v' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , 0x0B )
case 'f' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , 0x0C )
2024-07-16 12:30:20 +02:00
case 'r' :
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x0D )
case 'e' :
2024-11-15 09:29:57 +09:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x1B )
case ' ' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x20 )
case '"' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x22 )
case '/' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x2F )
case '\\' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x5C )
case 'N' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0x85 )
case '_' :
2024-07-16 12:30:20 +02:00
progress = 1
ctx . addOriginBuf ( nextChar )
2024-12-01 21:26:13 +09:00
value = append ( value , 0xA0 )
case 'L' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , 0x2028 )
case 'P' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , 0x2029 )
2024-07-16 12:30:20 +02:00
case 'x' :
2024-10-29 20:00:48 +09:00
if idx + 3 >= size {
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , nextChar )
} else {
progress = 3
codeNum := hexRunesToInt ( src [ idx + 2 : idx + progress + 1 ] )
value = append ( value , rune ( codeNum ) )
2020-05-29 18:09:51 +09:00
}
2024-07-16 12:30:20 +02:00
case 'u' :
2024-12-02 11:30:11 +09:00
// \u0000 style must have 5 characters at least.
2024-10-29 20:00:48 +09:00
if idx + 5 >= size {
2024-12-02 11:30:11 +09:00
return nil , ErrInvalidToken (
token . Invalid (
"not enough length for escaped UTF-16 character" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
2024-07-16 12:30:20 +02:00
}
2024-12-02 11:30:11 +09:00
progress = 5
codeNum := hexRunesToInt ( src [ idx + 2 : idx + 6 ] )
// handle surrogate pairs.
if codeNum >= 0xD800 && codeNum <= 0xDBFF {
high := codeNum
// \u0000\u0000 style must have 11 characters at least.
if idx + 11 >= size {
return nil , ErrInvalidToken (
token . Invalid (
"not enough length for escaped UTF-16 surrogate pair" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
}
if src [ idx + 6 ] != '\\' || src [ idx + 7 ] != 'u' {
return nil , ErrInvalidToken (
token . Invalid (
"found unexpected character after high surrogate for UTF-16 surrogate pair" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
}
low := hexRunesToInt ( src [ idx + 8 : idx + 12 ] )
if low < 0xDC00 || low > 0xDFFF {
return nil , ErrInvalidToken (
token . Invalid (
"found unexpected low surrogate after high surrogate" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
}
codeNum = ( ( high - 0xD800 ) * 0x400 ) + ( low - 0xDC00 ) + 0x10000
progress += 6
}
value = append ( value , rune ( codeNum ) )
2024-07-16 12:30:20 +02:00
case 'U' :
2024-12-02 11:30:11 +09:00
// \U00000000 style must have 9 characters at least.
2024-10-29 20:00:48 +09:00
if idx + 9 >= size {
2024-12-02 11:30:11 +09:00
return nil , ErrInvalidToken (
token . Invalid (
"not enough length for escaped UTF-32 character" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
2024-07-16 12:30:20 +02:00
}
2024-12-02 11:30:11 +09:00
progress = 9
codeNum := hexRunesToInt ( src [ idx + 2 : idx + 10 ] )
value = append ( value , rune ( codeNum ) )
2024-11-12 00:09:28 +09:00
case '\n' :
isFirstLineChar = true
isNewLine = true
ctx . addOriginBuf ( nextChar )
s . progressColumn ( ctx , 1 )
s . progressLine ( ctx )
idx ++
continue
2024-11-17 23:58:35 +09:00
case '\t' :
progress = 1
ctx . addOriginBuf ( nextChar )
value = append ( value , nextChar )
2024-07-16 12:30:20 +02:00
default :
2024-11-30 10:16:17 +09:00
s . progressColumn ( ctx , 1 )
return nil , ErrInvalidToken (
token . Invalid (
fmt . Sprintf ( "found unknown escape character %q" , nextChar ) ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
2019-10-16 18:19:48 +09:00
}
2024-07-16 12:30:20 +02:00
idx += progress
s . progressColumn ( ctx , progress )
2020-05-29 18:09:51 +09:00
continue
2024-11-17 23:58:35 +09:00
} else if c == '\t' {
var (
foundNotSpaceChar bool
progress int
)
for i := idx + 1 ; i < size ; i ++ {
if src [ i ] == ' ' || src [ i ] == '\t' {
progress ++
continue
}
2024-11-28 23:34:45 +09:00
if s . isNewLineChar ( src [ i ] ) {
2024-11-17 23:58:35 +09:00
break
}
foundNotSpaceChar = true
}
if foundNotSpaceChar {
value = append ( value , c )
2024-11-28 23:34:45 +09:00
if src [ idx + 1 ] != '"' {
s . progressColumn ( ctx , 1 )
}
2024-11-17 23:58:35 +09:00
} else {
idx += progress
s . progressColumn ( ctx , progress )
}
continue
2020-05-29 18:09:51 +09:00
} else if c != '"' {
value = append ( value , c )
2020-06-15 17:30:27 +09:00
isFirstLineChar = false
2020-05-29 18:09:51 +09:00
continue
2019-10-16 18:19:48 +09:00
}
2021-03-01 15:09:40 +00:00
s . progressColumn ( ctx , 1 )
2024-10-31 22:54:26 +09:00
return token . DoubleQuote ( string ( value ) , string ( ctx . obuf ) , srcpos ) , nil
2019-10-16 18:19:48 +09:00
}
2024-10-31 22:54:26 +09:00
s . progressColumn ( ctx , 1 )
return nil , ErrInvalidToken (
2024-11-15 00:22:47 +09:00
token . Invalid (
"could not find end character of double-quoted text" ,
string ( ctx . obuf ) , srcpos ,
) ,
2024-10-31 22:54:26 +09:00
)
2019-10-16 18:19:48 +09:00
}
2024-11-30 10:16:17 +09:00
func ( s * Scanner ) validateDocumentSeparatorMarker ( ctx * Context , src [ ] rune ) error {
2024-11-30 14:47:52 +09:00
if s . foundDocumentSeparatorMarker ( src ) {
return ErrInvalidToken (
token . Invalid ( "found unexpected document separator" , string ( ctx . obuf ) , s . pos ( ) ) ,
)
}
return nil
}
func ( s * Scanner ) foundDocumentSeparatorMarker ( src [ ] rune ) bool {
2024-11-30 10:16:17 +09:00
if len ( src ) < 3 {
2024-11-30 14:47:52 +09:00
return false
2024-11-30 10:16:17 +09:00
}
var marker string
if len ( src ) == 3 {
marker = string ( src )
} else {
marker = strings . TrimRightFunc ( string ( src [ : 4 ] ) , func ( r rune ) bool {
return r == ' ' || r == '\t' || r == '\n' || r == '\r'
} )
}
2024-11-30 14:47:52 +09:00
return marker == "---" || marker == "..."
2024-11-30 10:16:17 +09:00
}
2024-10-31 22:54:26 +09:00
func ( s * Scanner ) scanQuote ( ctx * Context , ch rune ) ( bool , error ) {
2024-10-30 02:18:20 +09:00
if ctx . existsBuffer ( ) {
2024-10-31 22:54:26 +09:00
return false , nil
2024-10-30 02:18:20 +09:00
}
2020-05-29 18:09:51 +09:00
if ch == '\'' {
2024-10-31 22:54:26 +09:00
tk , err := s . scanSingleQuote ( ctx )
if err != nil {
return false , err
}
ctx . addToken ( tk )
2024-10-30 02:18:20 +09:00
} else {
2024-10-31 22:54:26 +09:00
tk , err := s . scanDoubleQuote ( ctx )
if err != nil {
return false , err
}
ctx . addToken ( tk )
2020-05-29 18:09:51 +09:00
}
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-31 22:54:26 +09:00
return true , nil
2024-10-30 02:18:20 +09:00
}
func ( s * Scanner ) scanWhiteSpace ( ctx * Context ) bool {
2024-12-14 20:08:27 +09:00
if ctx . isMultiLine ( ) {
2024-10-30 02:18:20 +09:00
return false
}
2024-12-09 18:45:26 +09:00
if ! s . isAnchor && ! s . isDirective && ! s . isAlias && ! s . isFirstCharAtLine {
2024-10-30 02:18:20 +09:00
return false
}
if s . isFirstCharAtLine {
s . progressColumn ( ctx , 1 )
ctx . addOriginBuf ( ' ' )
return true
}
2024-12-09 18:45:26 +09:00
if s . isDirective {
s . addBufferedTokenIfExists ( ctx )
s . progressColumn ( ctx , 1 )
ctx . addOriginBuf ( ' ' )
return true
}
2024-10-30 02:18:20 +09:00
s . addBufferedTokenIfExists ( ctx )
s . isAnchor = false
2024-11-26 22:41:11 +09:00
s . isAlias = false
2024-10-30 02:18:20 +09:00
return true
2020-05-29 18:09:51 +09:00
}
2021-03-01 17:32:11 +09:00
// isMergeKey reports whether the current position holds a merge key: two
// '<' characters (as counted by ctx.repeatNum), optionally followed by
// spaces, then a ':' that is itself followed by a space or a line break.
func (s *Scanner) isMergeKey(ctx *Context) bool {
	if ctx.repeatNum('<') != 2 {
		return false
	}

	src := ctx.src
	for i := ctx.idx + 2; i < len(src); i++ {
		switch src[i] {
		case ' ':
			// Spaces between "<<" and ':' are allowed.
		case ':':
			if i+1 < len(src) {
				if next := src[i+1]; next == ' ' || s.isNewLineChar(next) {
					return true
				}
			}
			// Not a key delimiter here; keep looking at the next character.
		default:
			return false
		}
	}
	return false
}
2024-12-14 22:57:55 +09:00
func ( s * Scanner ) scanTag ( ctx * Context ) ( bool , error ) {
2024-12-09 18:45:26 +09:00
if ctx . existsBuffer ( ) || s . isDirective {
2024-12-14 22:57:55 +09:00
return false , nil
2024-10-30 02:18:20 +09:00
}
2019-10-16 18:19:48 +09:00
ctx . addOriginBuf ( '!' )
2024-10-29 20:00:48 +09:00
s . progress ( ctx , 1 ) // skip '!' character
2024-10-30 02:18:20 +09:00
var progress int
2019-10-16 18:19:48 +09:00
for idx , c := range ctx . src [ ctx . idx : ] {
2024-10-29 20:00:48 +09:00
progress = idx + 1
2019-10-16 18:19:48 +09:00
switch c {
2024-11-12 22:08:26 +09:00
case ' ' :
2024-11-28 23:34:45 +09:00
ctx . addOriginBuf ( c )
2019-10-16 18:19:48 +09:00
value := ctx . source ( ctx . idx - 1 , ctx . idx + idx )
2024-10-30 02:18:20 +09:00
ctx . addToken ( token . Tag ( value , string ( ctx . obuf ) , s . pos ( ) ) )
2024-11-12 22:08:26 +09:00
s . progressColumn ( ctx , len ( [ ] rune ( value ) ) )
ctx . clear ( )
2024-12-14 22:57:55 +09:00
return true , nil
2024-11-28 23:34:45 +09:00
case ',' :
if s . startedFlowSequenceNum > 0 || s . startedFlowMapNum > 0 {
value := ctx . source ( ctx . idx - 1 , ctx . idx + idx )
ctx . addToken ( token . Tag ( value , string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , len ( [ ] rune ( value ) ) - 1 ) // progress column before collect-entry for scanning it at scanFlowEntry function.
ctx . clear ( )
2024-12-14 22:57:55 +09:00
return true , nil
2024-11-28 23:34:45 +09:00
} else {
ctx . addOriginBuf ( c )
}
2024-11-12 22:08:26 +09:00
case '\n' , '\r' :
2024-11-28 23:34:45 +09:00
ctx . addOriginBuf ( c )
2024-11-12 22:08:26 +09:00
value := ctx . source ( ctx . idx - 1 , ctx . idx + idx )
ctx . addToken ( token . Tag ( value , string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , len ( [ ] rune ( value ) ) - 1 ) // progress column before new-line-char for scanning new-line-char at scanNewLine function.
ctx . clear ( )
2024-12-14 22:57:55 +09:00
return true , nil
case '{' , '}' :
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , progress )
invalidTk := token . Invalid ( fmt . Sprintf ( "found invalid tag character %q" , c ) , string ( ctx . obuf ) , s . pos ( ) )
return false , ErrInvalidToken ( invalidTk )
2024-11-28 23:34:45 +09:00
default :
ctx . addOriginBuf ( c )
2019-10-16 18:19:48 +09:00
}
}
2024-10-29 20:00:48 +09:00
s . progressColumn ( ctx , progress )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-12-14 22:57:55 +09:00
return true , nil
2019-10-16 18:19:48 +09:00
}
2024-10-30 02:18:20 +09:00
func ( s * Scanner ) scanComment ( ctx * Context ) bool {
2024-12-07 15:15:18 +09:00
if ctx . existsBuffer ( ) {
c := ctx . previousChar ( )
if c != ' ' && c != '\t' && ! s . isNewLineChar ( c ) {
return false
}
2024-10-30 02:18:20 +09:00
}
s . addBufferedTokenIfExists ( ctx )
2019-10-16 18:19:48 +09:00
ctx . addOriginBuf ( '#' )
2024-10-29 20:00:48 +09:00
s . progress ( ctx , 1 ) // skip '#' character
2019-10-16 18:19:48 +09:00
for idx , c := range ctx . src [ ctx . idx : ] {
ctx . addOriginBuf ( c )
2024-12-07 15:15:18 +09:00
if ! s . isNewLineChar ( c ) {
continue
2019-10-16 18:19:48 +09:00
}
2024-12-07 15:15:18 +09:00
if ctx . previousChar ( ) == '\\' {
continue
}
value := ctx . source ( ctx . idx , ctx . idx + idx )
progress := len ( [ ] rune ( value ) )
ctx . addToken ( token . Comment ( value , string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , progress )
s . progressLine ( ctx )
ctx . clear ( )
return true
2019-10-16 18:19:48 +09:00
}
2023-03-01 16:59:07 +09:00
// document ends with comment.
value := string ( ctx . src [ ctx . idx : ] )
2024-10-30 02:18:20 +09:00
ctx . addToken ( token . Comment ( value , string ( ctx . obuf ) , s . pos ( ) ) )
2024-10-29 20:00:48 +09:00
progress := len ( [ ] rune ( value ) )
s . progressColumn ( ctx , progress )
s . progressLine ( ctx )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
return true
2019-10-16 18:19:48 +09:00
}
2024-12-14 20:08:27 +09:00
func ( s * Scanner ) scanMultiLine ( ctx * Context , c rune ) error {
state := ctx . getMultiLineState ( )
2019-11-08 16:48:54 +09:00
ctx . addOriginBuf ( c )
2019-10-16 18:19:48 +09:00
if ctx . isEOS ( ) {
2024-11-17 23:58:35 +09:00
if s . isFirstCharAtLine && c == ' ' {
2024-12-14 20:08:27 +09:00
state . addIndent ( ctx , s . column )
2024-11-17 23:58:35 +09:00
} else {
ctx . addBuf ( c )
}
2024-12-14 20:08:27 +09:00
state . updateIndentColumn ( s . column )
if err := state . validateIndentColumn ( ) ; err != nil {
2024-11-15 00:22:47 +09:00
invalidTk := token . Invalid ( err . Error ( ) , string ( ctx . obuf ) , s . pos ( ) )
2024-11-03 02:11:50 +09:00
s . progressColumn ( ctx , 1 )
2024-11-15 00:22:47 +09:00
return ErrInvalidToken ( invalidTk )
2024-11-03 02:11:50 +09:00
}
2019-10-16 18:19:48 +09:00
value := ctx . bufferedSrc ( )
2020-06-01 12:54:23 +09:00
ctx . addToken ( token . String ( string ( value ) , string ( ctx . obuf ) , s . pos ( ) ) )
2024-11-17 23:58:35 +09:00
ctx . clear ( )
2019-11-08 16:48:54 +09:00
s . progressColumn ( ctx , 1 )
2019-12-22 11:59:49 +09:00
} else if s . isNewLineChar ( c ) {
2024-11-09 13:00:12 +09:00
ctx . addBuf ( c )
2024-12-14 20:08:27 +09:00
state . updateSpaceOnlyIndentColumn ( s . column - 1 )
state . updateNewLineState ( )
2019-10-16 18:19:48 +09:00
s . progressLine ( ctx )
2024-11-30 14:47:52 +09:00
if ctx . next ( ) {
if s . foundDocumentSeparatorMarker ( ctx . src [ ctx . idx : ] ) {
value := ctx . bufferedSrc ( )
ctx . addToken ( token . String ( string ( value ) , string ( ctx . obuf ) , s . pos ( ) ) )
ctx . clear ( )
2024-12-14 20:08:27 +09:00
s . breakMultiLine ( ctx )
2024-11-30 14:47:52 +09:00
}
}
2019-10-16 18:19:48 +09:00
} else if s . isFirstCharAtLine && c == ' ' {
2024-12-14 20:08:27 +09:00
state . addIndent ( ctx , s . column )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
2024-12-14 20:08:27 +09:00
} else if s . isFirstCharAtLine && c == '\t' && state . isIndentColumn ( s . column ) {
2024-11-13 16:21:12 +09:00
err := ErrInvalidToken (
2024-11-15 00:22:47 +09:00
token . Invalid (
"found a tab character where an indentation space is expected" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
2024-11-13 16:21:12 +09:00
)
s . progressColumn ( ctx , 1 )
return err
2024-12-14 20:08:27 +09:00
} else if c == '\t' && ! state . isIndentColumn ( s . column ) {
ctx . addBufWithTab ( c )
s . progressColumn ( ctx , 1 )
2019-10-16 18:19:48 +09:00
} else {
2024-12-14 20:08:27 +09:00
if err := state . validateIndentAfterSpaceOnly ( s . column ) ; err != nil {
2024-12-07 14:37:58 +09:00
invalidTk := token . Invalid ( err . Error ( ) , string ( ctx . obuf ) , s . pos ( ) )
s . progressColumn ( ctx , 1 )
return ErrInvalidToken ( invalidTk )
}
2024-12-14 20:08:27 +09:00
state . updateIndentColumn ( s . column )
if err := state . validateIndentColumn ( ) ; err != nil {
2024-11-15 00:22:47 +09:00
invalidTk := token . Invalid ( err . Error ( ) , string ( ctx . obuf ) , s . pos ( ) )
2024-11-03 02:11:50 +09:00
s . progressColumn ( ctx , 1 )
2024-11-15 00:22:47 +09:00
return ErrInvalidToken ( invalidTk )
2019-11-08 01:04:29 +09:00
}
2024-12-14 20:08:27 +09:00
if col := state . lastDelimColumn ( ) ; col > 0 {
s . lastDelimColumn = col
}
state . updateNewLineInFolded ( ctx , s . column )
2024-11-26 22:41:11 +09:00
ctx . addBufWithTab ( c )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
}
2024-11-03 02:11:50 +09:00
return nil
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
func ( s * Scanner ) scanNewLine ( ctx * Context , c rune ) {
if len ( ctx . buf ) > 0 && s . savedPos == nil {
2024-11-12 16:44:43 +09:00
bufLen := len ( ctx . bufferedSrc ( ) )
2024-10-28 18:15:16 +09:00
s . savedPos = s . pos ( )
2024-11-12 16:44:43 +09:00
s . savedPos . Column -= bufLen
s . savedPos . Offset -= bufLen
2024-10-28 18:15:16 +09:00
}
// if the following case, origin buffer has unnecessary two spaces.
// So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too.
// ---
// a:[space][space]
// b: c
2024-11-13 13:09:30 +09:00
ctx . removeRightSpaceFromBuf ( )
2024-10-28 18:15:16 +09:00
// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
// > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
// > Outside scalar content, YAML allows any line break to be used to terminate lines.
// > -- https://yaml.org/spec/1.2/spec.html
if c == '\r' && ctx . nextChar ( ) == '\n' {
ctx . addOriginBuf ( '\r' )
2024-10-29 20:00:48 +09:00
s . progress ( ctx , 1 )
2024-11-12 16:44:43 +09:00
s . offset ++
2024-10-28 18:15:16 +09:00
c = '\n'
}
if ctx . isEOS ( ) {
s . addBufferedTokenIfExists ( ctx )
2024-11-30 10:16:17 +09:00
} else if s . isAnchor || s . isAlias || s . isDirective {
2024-10-28 18:15:16 +09:00
s . addBufferedTokenIfExists ( ctx )
}
2024-11-09 13:00:12 +09:00
if ctx . existsBuffer ( ) && s . isFirstCharAtLine {
if ctx . buf [ len ( ctx . buf ) - 1 ] == ' ' {
ctx . buf [ len ( ctx . buf ) - 1 ] = '\n'
} else {
ctx . buf = append ( ctx . buf , '\n' )
}
} else {
ctx . addBuf ( ' ' )
}
2024-10-28 18:15:16 +09:00
ctx . addOriginBuf ( c )
s . progressLine ( ctx )
}
2024-11-01 15:03:27 +09:00
// isFlowMode reports whether the scanner is inside at least one open flow
// sequence ('[') or flow mapping ('{').
func (s *Scanner) isFlowMode() bool {
	return s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0
}
2024-10-28 18:15:16 +09:00
func ( s * Scanner ) scanFlowMapStart ( ctx * Context ) bool {
2024-11-01 15:03:27 +09:00
if ctx . existsBuffer ( ) && ! s . isFlowMode ( ) {
2024-10-28 18:15:16 +09:00
return false
}
2024-11-01 15:03:27 +09:00
s . addBufferedTokenIfExists ( ctx )
2024-10-28 18:15:16 +09:00
ctx . addOriginBuf ( '{' )
ctx . addToken ( token . MappingStart ( string ( ctx . obuf ) , s . pos ( ) ) )
s . startedFlowMapNum ++
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanFlowMapEnd ( ctx * Context ) bool {
if s . startedFlowMapNum <= 0 {
return false
}
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( '}' )
ctx . addToken ( token . MappingEnd ( string ( ctx . obuf ) , s . pos ( ) ) )
s . startedFlowMapNum --
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanFlowArrayStart ( ctx * Context ) bool {
2024-11-01 15:03:27 +09:00
if ctx . existsBuffer ( ) && ! s . isFlowMode ( ) {
2024-10-28 18:15:16 +09:00
return false
}
2024-11-01 15:03:27 +09:00
s . addBufferedTokenIfExists ( ctx )
2024-10-28 18:15:16 +09:00
ctx . addOriginBuf ( '[' )
ctx . addToken ( token . SequenceStart ( string ( ctx . obuf ) , s . pos ( ) ) )
s . startedFlowSequenceNum ++
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanFlowArrayEnd ( ctx * Context ) bool {
2024-10-31 15:35:43 +09:00
if ctx . existsBuffer ( ) && s . startedFlowSequenceNum <= 0 {
2024-10-28 18:15:16 +09:00
return false
}
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( ']' )
ctx . addToken ( token . SequenceEnd ( string ( ctx . obuf ) , s . pos ( ) ) )
s . startedFlowSequenceNum --
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanFlowEntry ( ctx * Context , c rune ) bool {
if s . startedFlowSequenceNum <= 0 && s . startedFlowMapNum <= 0 {
return false
}
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
ctx . addToken ( token . CollectEntry ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
2024-11-26 22:41:11 +09:00
func ( s * Scanner ) scanMapDelim ( ctx * Context ) ( bool , error ) {
2024-10-28 18:15:16 +09:00
nc := ctx . nextChar ( )
2024-11-26 22:41:11 +09:00
if s . isDirective || s . isAnchor || s . isAlias {
return false , nil
2024-11-26 11:05:45 +09:00
}
2024-11-13 16:21:12 +09:00
if s . startedFlowMapNum <= 0 && nc != ' ' && nc != '\t' && ! s . isNewLineChar ( nc ) && ! ctx . isNextEOS ( ) {
2024-11-26 22:41:11 +09:00
return false , nil
}
2024-12-09 22:51:20 +09:00
if s . startedFlowMapNum > 0 && nc == '/' {
// like http://
return false , nil
}
2025-01-18 13:29:55 +09:00
if s . startedFlowMapNum > 0 {
tk := ctx . lastToken ( )
if tk != nil && tk . Type == token . MappingValueType {
return false , nil
}
}
2024-11-26 22:41:11 +09:00
if strings . HasPrefix ( strings . TrimPrefix ( string ( ctx . obuf ) , " " ) , "\t" ) && ! strings . HasPrefix ( string ( ctx . buf ) , "\t" ) {
invalidTk := token . Invalid ( "tab character cannot use as a map key directly" , string ( ctx . obuf ) , s . pos ( ) )
s . progressColumn ( ctx , 1 )
return false , ErrInvalidToken ( invalidTk )
2024-10-28 18:15:16 +09:00
}
// mapping value
tk := s . bufferedToken ( ctx )
if tk != nil {
s . lastDelimColumn = tk . Position . Column
ctx . addToken ( tk )
} else if tk := ctx . lastToken ( ) ; tk != nil {
// If the map key is quote, the buffer does not exist because it has already been cut into tokens.
// Therefore, we need to check the last token.
if tk . Indicator == token . QuotedScalarIndicator {
s . lastDelimColumn = tk . Position . Column
}
}
ctx . addToken ( token . MappingValue ( s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-11-26 22:41:11 +09:00
return true , nil
2024-10-28 18:15:16 +09:00
}
func ( s * Scanner ) scanDocumentStart ( ctx * Context ) bool {
if s . indentNum != 0 {
return false
}
if s . column != 1 {
return false
}
if ctx . repeatNum ( '-' ) != 3 {
return false
}
2024-11-26 11:05:45 +09:00
if ctx . size > ctx . idx + 3 {
c := ctx . src [ ctx . idx + 3 ]
if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
return false
}
}
2024-10-28 18:15:16 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addToken ( token . DocumentHeader ( string ( ctx . obuf ) + "---" , s . pos ( ) ) )
s . progressColumn ( ctx , 3 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-11-28 23:34:45 +09:00
s . clearState ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanDocumentEnd ( ctx * Context ) bool {
if s . indentNum != 0 {
return false
}
if s . column != 1 {
return false
}
if ctx . repeatNum ( '.' ) != 3 {
return false
}
2024-11-30 10:16:17 +09:00
s . addBufferedTokenIfExists ( ctx )
2024-10-28 18:15:16 +09:00
ctx . addToken ( token . DocumentEnd ( string ( ctx . obuf ) + "..." , s . pos ( ) ) )
s . progressColumn ( ctx , 3 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanMergeKey ( ctx * Context ) bool {
if ! s . isMergeKey ( ctx ) {
return false
}
s . lastDelimColumn = s . column
ctx . addToken ( token . MergeKey ( string ( ctx . obuf ) + "<<" , s . pos ( ) ) )
2024-10-29 20:00:48 +09:00
s . progressColumn ( ctx , 2 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanRawFoldedChar ( ctx * Context ) bool {
if ! ctx . existsBuffer ( ) {
return false
}
if ! s . isChangedToIndentStateUp ( ) {
return false
}
2024-12-14 20:08:27 +09:00
ctx . setRawFolded ( s . column )
2024-10-28 18:15:16 +09:00
ctx . addBuf ( '-' )
ctx . addOriginBuf ( '-' )
s . progressColumn ( ctx , 1 )
return true
}
2024-11-26 22:41:11 +09:00
func ( s * Scanner ) scanSequence ( ctx * Context ) ( bool , error ) {
2024-10-28 18:15:16 +09:00
if ctx . existsBuffer ( ) {
2024-11-26 22:41:11 +09:00
return false , nil
2024-10-28 18:15:16 +09:00
}
nc := ctx . nextChar ( )
2024-11-26 22:41:11 +09:00
if nc != 0 && nc != ' ' && nc != '\t' && ! s . isNewLineChar ( nc ) {
return false , nil
}
if strings . HasPrefix ( strings . TrimPrefix ( string ( ctx . obuf ) , " " ) , "\t" ) {
invalidTk := token . Invalid ( "tab character cannot use as a sequence delimiter" , string ( ctx . obuf ) , s . pos ( ) )
s . progressColumn ( ctx , 1 )
return false , ErrInvalidToken ( invalidTk )
2024-10-28 18:15:16 +09:00
}
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( '-' )
tk := token . SequenceEntry ( string ( ctx . obuf ) , s . pos ( ) )
s . lastDelimColumn = tk . Position . Column
ctx . addToken ( tk )
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-11-26 22:41:11 +09:00
return true , nil
2024-10-28 18:15:16 +09:00
}
2024-12-14 20:08:27 +09:00
func ( s * Scanner ) scanMultiLineHeader ( ctx * Context ) ( bool , error ) {
2024-10-28 18:15:16 +09:00
if ctx . existsBuffer ( ) {
return false , nil
}
2024-12-14 20:08:27 +09:00
if err := s . scanMultiLineHeaderOption ( ctx ) ; err != nil {
2024-10-28 18:15:16 +09:00
return false , err
}
s . progressLine ( ctx )
return true , nil
}
2024-12-14 20:08:27 +09:00
func ( s * Scanner ) validateMultiLineHeaderOption ( opt string ) error {
2024-11-03 02:11:50 +09:00
if len ( opt ) == 0 {
return nil
}
2024-12-10 00:20:00 +09:00
orgOpt := opt
2024-11-05 15:25:58 +09:00
opt = strings . TrimPrefix ( opt , "-" )
opt = strings . TrimPrefix ( opt , "+" )
opt = strings . TrimSuffix ( opt , "-" )
opt = strings . TrimSuffix ( opt , "+" )
2024-11-03 02:11:50 +09:00
if len ( opt ) == 0 {
return nil
}
2024-12-10 00:20:00 +09:00
if opt == "0" {
return fmt . Errorf ( "invalid header option: %q" , orgOpt )
}
i , err := strconv . ParseInt ( opt , 10 , 64 )
if err != nil {
return fmt . Errorf ( "invalid header option: %q" , orgOpt )
}
if i > 9 {
return fmt . Errorf ( "invalid header option: %q" , orgOpt )
2024-11-03 02:11:50 +09:00
}
return nil
}
2024-12-14 20:08:27 +09:00
func ( s * Scanner ) scanMultiLineHeaderOption ( ctx * Context ) error {
2019-10-16 18:19:48 +09:00
header := ctx . currentChar ( )
ctx . addOriginBuf ( header )
2024-10-29 20:00:48 +09:00
s . progress ( ctx , 1 ) // skip '|' or '>' character
2024-12-10 00:20:00 +09:00
var progress int
2019-10-16 18:19:48 +09:00
for idx , c := range ctx . src [ ctx . idx : ] {
2024-12-10 00:20:00 +09:00
progress = idx
2019-10-16 18:19:48 +09:00
ctx . addOriginBuf ( c )
2024-12-10 00:20:00 +09:00
if s . isNewLineChar ( c ) {
break
}
}
value := strings . TrimRight ( ctx . source ( ctx . idx , ctx . idx + progress ) , " " )
commentValueIndex := strings . Index ( value , "#" )
opt := value
if commentValueIndex > 0 {
opt = value [ : commentValueIndex ]
}
opt = strings . TrimRightFunc ( opt , func ( r rune ) bool {
return r == ' ' || r == '\t'
} )
if len ( opt ) != 0 {
2024-12-14 20:08:27 +09:00
if err := s . validateMultiLineHeaderOption ( opt ) ; err != nil {
2024-12-10 00:20:00 +09:00
invalidTk := token . Invalid ( err . Error ( ) , string ( ctx . obuf ) , s . pos ( ) )
2024-11-03 02:11:50 +09:00
s . progressColumn ( ctx , progress )
2024-12-10 00:20:00 +09:00
return ErrInvalidToken ( invalidTk )
2019-10-16 18:19:48 +09:00
}
}
2024-12-10 00:20:00 +09:00
if s . column == 1 {
s . lastDelimColumn = 1
}
commentIndex := strings . Index ( string ( ctx . obuf ) , "#" )
headerBuf := string ( ctx . obuf )
if commentIndex > 0 {
headerBuf = headerBuf [ : commentIndex ]
}
switch header {
case '|' :
ctx . addToken ( token . Literal ( "|" + opt , headerBuf , s . pos ( ) ) )
2024-12-14 20:08:27 +09:00
ctx . setLiteral ( s . lastDelimColumn , opt )
2024-12-10 00:20:00 +09:00
case '>' :
ctx . addToken ( token . Folded ( ">" + opt , headerBuf , s . pos ( ) ) )
2024-12-14 20:08:27 +09:00
ctx . setFolded ( s . lastDelimColumn , opt )
2024-12-10 00:20:00 +09:00
}
if commentIndex > 0 {
comment := string ( value [ commentValueIndex + 1 : ] )
s . offset += len ( headerBuf )
s . column += len ( headerBuf )
ctx . addToken ( token . Comment ( comment , string ( ctx . obuf [ len ( headerBuf ) : ] ) , s . pos ( ) ) )
}
s . indentState = IndentStateKeep
ctx . resetBuffer ( )
s . progressColumn ( ctx , progress )
return nil
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
func ( s * Scanner ) scanMapKey ( ctx * Context ) bool {
if ctx . existsBuffer ( ) {
return false
2019-10-16 18:19:48 +09:00
}
2019-12-11 17:17:05 +09:00
2024-10-28 18:15:16 +09:00
nc := ctx . nextChar ( )
2024-11-26 22:41:11 +09:00
if nc != ' ' && nc != '\t' {
2024-10-28 18:15:16 +09:00
return false
2019-12-11 17:17:05 +09:00
}
2024-11-28 23:34:45 +09:00
tk := token . MappingKey ( s . pos ( ) )
s . lastDelimColumn = tk . Position . Column
ctx . addToken ( tk )
2024-10-28 18:15:16 +09:00
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanDirective ( ctx * Context ) bool {
if ctx . existsBuffer ( ) {
return false
}
if s . indentNum != 0 {
return false
2024-07-16 06:17:13 -04:00
}
2024-12-09 18:45:26 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( '%' )
ctx . addToken ( token . Directive ( string ( ctx . obuf ) , s . pos ( ) ) )
2024-10-28 18:15:16 +09:00
s . progressColumn ( ctx , 1 )
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-11-26 11:05:45 +09:00
s . isDirective = true
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanAnchor ( ctx * Context ) bool {
if ctx . existsBuffer ( ) {
return false
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( '&' )
ctx . addToken ( token . Anchor ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
s . isAnchor = true
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
}
func ( s * Scanner ) scanAlias ( ctx * Context ) bool {
if ctx . existsBuffer ( ) {
return false
}
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( '*' )
ctx . addToken ( token . Alias ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
2024-11-26 22:41:11 +09:00
s . isAlias = true
2024-10-30 02:18:20 +09:00
ctx . clear ( )
2024-10-28 18:15:16 +09:00
return true
2019-10-16 18:19:48 +09:00
}
2024-11-09 20:43:51 +09:00
func ( s * Scanner ) scanReservedChar ( ctx * Context , c rune ) error {
if ctx . existsBuffer ( ) {
return nil
}
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
2024-11-15 00:22:47 +09:00
err := ErrInvalidToken (
token . Invalid (
fmt . Sprintf ( "%q is a reserved character" , c ) ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
2024-11-09 20:43:51 +09:00
s . progressColumn ( ctx , 1 )
ctx . clear ( )
return err
}
2024-11-13 16:21:12 +09:00
func ( s * Scanner ) scanTab ( ctx * Context , c rune ) error {
2024-11-26 22:41:11 +09:00
if s . startedFlowSequenceNum > 0 || s . startedFlowMapNum > 0 {
// tabs character is allowed in flow mode.
return nil
}
2024-11-13 16:21:12 +09:00
if ! s . isFirstCharAtLine {
return nil
}
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
2024-11-15 00:22:47 +09:00
err := ErrInvalidToken (
token . Invalid ( "found character '\t' that cannot start any token" ,
string ( ctx . obuf ) , s . pos ( ) ,
) ,
)
2024-11-13 16:21:12 +09:00
s . progressColumn ( ctx , 1 )
ctx . clear ( )
return err
}
2024-10-29 20:00:48 +09:00
func ( s * Scanner ) scan ( ctx * Context ) error {
2019-10-16 18:19:48 +09:00
for ctx . next ( ) {
c := ctx . currentChar ( )
2024-10-28 18:15:16 +09:00
// First, change the IndentState.
// If the target character is the first character in a line, IndentState is Up/Down/Equal state.
// The second and subsequent letters are Keep.
2019-11-07 23:45:39 +09:00
s . updateIndent ( ctx , c )
2024-10-28 18:15:16 +09:00
// If IndentState is down, tokens are split, so the buffer accumulated until that point needs to be cutted as a token.
2024-10-28 17:07:44 +09:00
if s . isChangedToIndentStateDown ( ) {
s . addBufferedTokenIfExists ( ctx )
}
2024-12-14 20:08:27 +09:00
if ctx . isMultiLine ( ) {
2024-10-28 17:07:44 +09:00
if s . isChangedToIndentStateDown ( ) {
2024-11-01 15:03:27 +09:00
if tk := ctx . lastToken ( ) ; tk != nil {
// If literal/folded content is empty, no string token is added.
// Therefore, add an empty string token.
2024-11-02 19:29:48 +09:00
// But if literal/folded token column is 1, it is invalid at down state.
if tk . Position . Column == 1 {
return ErrInvalidToken (
2024-11-15 00:22:47 +09:00
token . Invalid (
2024-12-14 22:57:55 +09:00
"could not find multi-line content" ,
2024-11-15 00:22:47 +09:00
string ( ctx . obuf ) , s . pos ( ) ,
) ,
2024-11-02 19:29:48 +09:00
)
}
2024-11-01 15:03:27 +09:00
if tk . Type != token . StringType {
ctx . addToken ( token . String ( "" , "" , s . pos ( ) ) )
}
}
2024-12-14 20:08:27 +09:00
s . breakMultiLine ( ctx )
2019-11-07 23:45:39 +09:00
} else {
2024-12-14 20:08:27 +09:00
if err := s . scanMultiLine ( ctx , c ) ; err != nil {
2024-11-03 02:11:50 +09:00
return err
}
2019-11-07 23:45:39 +09:00
continue
}
2019-10-16 18:19:48 +09:00
}
switch c {
case '{' :
2024-10-28 18:15:16 +09:00
if s . scanFlowMapStart ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '}' :
2024-10-28 18:15:16 +09:00
if s . scanFlowMapEnd ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '.' :
2024-10-28 18:15:16 +09:00
if s . scanDocumentEnd ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '<' :
2024-10-28 18:15:16 +09:00
if s . scanMergeKey ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '-' :
2024-10-28 18:15:16 +09:00
if s . scanDocumentStart ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
2024-10-28 18:15:16 +09:00
if s . scanRawFoldedChar ( ctx ) {
2019-10-16 18:19:48 +09:00
continue
}
2024-11-26 22:41:11 +09:00
scanned , err := s . scanSequence ( ctx )
if err != nil {
return err
}
if scanned {
2019-12-04 23:58:06 +09:00
continue
}
2019-10-16 18:19:48 +09:00
case '[' :
2024-10-28 18:15:16 +09:00
if s . scanFlowArrayStart ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ']' :
2024-10-28 18:15:16 +09:00
if s . scanFlowArrayEnd ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ',' :
2024-10-28 18:15:16 +09:00
if s . scanFlowEntry ( ctx , c ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case ':' :
2024-11-26 22:41:11 +09:00
scanned , err := s . scanMapDelim ( ctx )
if err != nil {
return err
}
if scanned {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '|' , '>' :
2024-12-14 20:08:27 +09:00
scanned , err := s . scanMultiLineHeader ( ctx )
2024-10-28 18:15:16 +09:00
if err != nil {
2024-10-29 20:00:48 +09:00
return err
2024-10-28 18:15:16 +09:00
}
if scanned {
2019-10-16 18:19:48 +09:00
continue
}
case '!' :
2024-12-14 22:57:55 +09:00
scanned , err := s . scanTag ( ctx )
if err != nil {
return err
}
if scanned {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '%' :
2024-10-28 18:15:16 +09:00
if s . scanDirective ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '?' :
2024-10-28 18:15:16 +09:00
if s . scanMapKey ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-10-16 18:19:48 +09:00
}
case '&' :
2024-10-28 18:15:16 +09:00
if s . scanAnchor ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '*' :
2024-10-28 18:15:16 +09:00
if s . scanAlias ( ctx ) {
2024-10-30 02:18:20 +09:00
continue
2019-11-06 19:28:47 +09:00
}
2019-10-16 18:19:48 +09:00
case '#' :
2024-10-30 02:18:20 +09:00
if s . scanComment ( ctx ) {
continue
2019-11-12 19:50:58 +09:00
}
2019-10-16 18:19:48 +09:00
case '\'' , '"' :
2024-10-31 22:54:26 +09:00
scanned , err := s . scanQuote ( ctx , c )
if err != nil {
return err
}
if scanned {
2024-10-30 02:18:20 +09:00
continue
2019-11-07 18:08:12 +09:00
}
2019-11-09 17:20:39 +09:00
case '\r' , '\n' :
2019-10-16 18:19:48 +09:00
s . scanNewLine ( ctx , c )
continue
case ' ' :
2024-10-30 02:18:20 +09:00
if s . scanWhiteSpace ( ctx ) {
2019-10-16 18:19:48 +09:00
continue
}
2024-11-09 20:43:51 +09:00
case '@' , '`' :
if err := s . scanReservedChar ( ctx , c ) ; err != nil {
return err
}
2024-11-13 16:21:12 +09:00
case '\t' :
2024-11-30 14:47:52 +09:00
if ctx . existsBuffer ( ) && s . lastDelimColumn == 0 {
// tab indent for plain text (yaml-test-suite's spec-example-7-12-plain-lines).
s . indentNum ++
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
continue
}
2024-12-15 01:29:04 +09:00
if s . lastDelimColumn < s . column {
s . indentNum ++
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
continue
}
2024-11-13 16:21:12 +09:00
if err := s . scanTab ( ctx , c ) ; err != nil {
return err
}
2019-10-16 18:19:48 +09:00
}
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
}
2019-10-23 03:21:42 +09:00
s . addBufferedTokenIfExists ( ctx )
2024-10-29 20:00:48 +09:00
return nil
2019-10-16 18:19:48 +09:00
}
2019-10-21 12:53:30 +09:00
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
2019-12-29 11:37:20 +09:00
func ( s * Scanner ) Init ( text string ) {
src := [ ] rune ( text )
2019-10-16 18:19:48 +09:00
s . source = src
s . sourcePos = 0
s . sourceSize = len ( src )
s . line = 1
s . column = 1
s . offset = 1
2024-11-28 23:34:45 +09:00
s . isFirstCharAtLine = true
s . clearState ( )
}
func ( s * Scanner ) clearState ( ) {
2024-10-28 15:59:31 +09:00
s . prevLineIndentNum = 0
s . lastDelimColumn = 0
2019-10-16 18:19:48 +09:00
s . indentLevel = 0
s . indentNum = 0
}
2019-10-21 12:53:30 +09:00
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
2019-10-16 18:19:48 +09:00
func ( s * Scanner ) Scan ( ) ( token . Tokens , error ) {
if s . sourcePos >= s . sourceSize {
return nil , io . EOF
}
ctx := newContext ( s . source [ s . sourcePos : ] )
2019-12-29 11:47:34 +09:00
defer ctx . release ( )
2024-10-29 20:00:48 +09:00
2019-12-29 11:47:34 +09:00
var tokens token . Tokens
2024-10-29 20:00:48 +09:00
err := s . scan ( ctx )
2019-12-29 11:47:34 +09:00
tokens = append ( tokens , ctx . tokens ... )
2024-10-29 20:00:48 +09:00
if err != nil {
var invalidTokenErr * InvalidTokenError
if errors . As ( err , & invalidTokenErr ) {
tokens = append ( tokens , invalidTokenErr . Token )
}
return tokens , err
}
2019-12-29 11:47:34 +09:00
return tokens , nil
2019-10-16 18:19:48 +09:00
}