2019-10-16 18:19:48 +09:00
package scanner
import (
"io"
"strings"
"github.com/goccy/go-yaml/token"
"golang.org/x/xerrors"
)
2019-10-21 12:53:30 +09:00
// IndentState describes how the indentation of the character being scanned
// relates to the indentation that was recorded previously.
type IndentState int

// Possible values of IndentState.
const (
	// IndentStateEqual means the indentation is the same as the previous one.
	IndentStateEqual IndentState = iota
	// IndentStateUp means the indentation is deeper than the previous one.
	IndentStateUp
	// IndentStateDown means the indentation is shallower than the previous one.
	IndentStateDown
	// IndentStateKeep means the indentation was not re-evaluated for the
	// current character (it is not the first character of its line).
	IndentStateKeep
)
2019-10-21 12:53:30 +09:00
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
2019-10-16 18:19:48 +09:00
type Scanner struct {
2019-11-07 18:01:45 +09:00
source [ ] rune
2019-11-07 13:00:00 +09:00
sourcePos int
sourceSize int
line int
column int
offset int
prevIndentLevel int
prevIndentNum int
prevIndentColumn int
2019-11-08 01:04:29 +09:00
docStartColumn int
2019-11-07 13:00:00 +09:00
indentLevel int
indentNum int
isFirstCharAtLine bool
isAnchor bool
startedFlowSequenceNum int
startedFlowMapNum int
indentState IndentState
savedPos * token . Position
2019-10-16 18:19:48 +09:00
}
// pos returns a freshly allocated snapshot of the scanner's current position
// and indentation.
func (s *Scanner) pos() *token.Position {
	current := new(token.Position)
	current.Line = s.line
	current.Column = s.column
	current.Offset = s.offset
	current.IndentNum = s.indentNum
	current.IndentLevel = s.indentLevel
	return current
}
func ( s * Scanner ) bufferedToken ( ctx * Context ) * token . Token {
if s . savedPos != nil {
tk := ctx . bufferedToken ( s . savedPos )
s . savedPos = nil
return tk
}
2022-12-19 16:04:56 +10:30
line := s . line
column := s . column - len ( ctx . buf )
level := s . indentLevel
if ctx . isSaveIndentMode ( ) {
line -= s . newLineCount ( ctx . buf )
column = strings . Index ( string ( ctx . obuf ) , string ( ctx . buf ) ) + 1
// Since we are in a literal, folded or raw folded
// we can use the indent level from the last token.
last := ctx . lastToken ( )
if last != nil { // The last token should never be nil here.
level = last . Position . IndentLevel + 1
}
}
2019-10-16 18:19:48 +09:00
return ctx . bufferedToken ( & token . Position {
2022-12-19 16:04:56 +10:30
Line : line ,
Column : column ,
Offset : s . offset - len ( ctx . buf ) ,
2019-10-16 18:19:48 +09:00
IndentNum : s . indentNum ,
2022-12-19 16:04:56 +10:30
IndentLevel : level ,
2019-10-16 18:19:48 +09:00
} )
}
// progressColumn advances ctx by num and moves the scanner's column and
// offset forward by the same amount.
func (s *Scanner) progressColumn(ctx *Context, num int) {
	ctx.progress(num)
	s.offset += num
	s.column += num
}
// progressLine advances ctx by one character and moves the scanner to the
// first column of the next line, resetting the per-line state (indent count,
// first-character flag and anchor flag).
func (s *Scanner) progressLine(ctx *Context) {
	ctx.progress(1)
	s.offset++
	s.line++
	s.column = 1
	s.indentNum = 0
	s.isAnchor = false
	s.isFirstCharAtLine = true
}
2019-11-07 23:54:32 +09:00
func ( s * Scanner ) isNeededKeepPreviousIndentNum ( ctx * Context , c rune ) bool {
if ! s . isChangedToIndentStateUp ( ) {
return false
}
if ctx . isDocument ( ) {
return true
}
2019-12-28 22:37:50 +09:00
if c == '-' && ctx . existsBuffer ( ) {
2019-11-07 23:54:32 +09:00
return true
}
return false
}
2019-12-22 11:59:49 +09:00
// isNewLineChar reports whether c is a line feed or a carriage return.
func (s *Scanner) isNewLineChar(c rune) bool {
	switch c {
	case '\n', '\r':
		return true
	default:
		return false
	}
}
2019-12-22 17:28:30 +09:00
// newLineCount returns the number of line breaks in src. A "\r\n" pair is
// counted as a single line break.
func (s *Scanner) newLineCount(src []rune) int {
	total := 0
	var prev rune
	for _, r := range src {
		// Every '\r' is a break; a '\n' is a break unless it completes a
		// "\r\n" pair that was already counted.
		if r == '\r' || (r == '\n' && prev != '\r') {
			total++
		}
		prev = r
	}
	return total
}
2022-01-11 20:37:31 +09:00
func ( s * Scanner ) updateIndentState ( ctx * Context ) {
indentNumBasedIndentState := s . indentState
2019-10-16 18:19:48 +09:00
if s . prevIndentNum < s . indentNum {
s . indentLevel = s . prevIndentLevel + 1
2022-01-11 20:37:31 +09:00
indentNumBasedIndentState = IndentStateUp
2019-10-16 18:19:48 +09:00
} else if s . prevIndentNum == s . indentNum {
s . indentLevel = s . prevIndentLevel
2022-01-11 20:37:31 +09:00
indentNumBasedIndentState = IndentStateEqual
2019-10-16 18:19:48 +09:00
} else {
2022-01-11 20:37:31 +09:00
indentNumBasedIndentState = IndentStateDown
2019-10-21 14:54:26 +09:00
if s . prevIndentLevel > 0 {
s . indentLevel = s . prevIndentLevel - 1
}
2019-10-16 18:19:48 +09:00
}
2019-10-21 15:58:46 +09:00
2019-10-23 20:22:14 +09:00
if s . prevIndentColumn > 0 {
if s . prevIndentColumn < s . column {
2019-10-21 15:58:46 +09:00
s . indentState = IndentStateUp
2022-01-11 20:37:31 +09:00
} else if s . prevIndentColumn != s . column || indentNumBasedIndentState != IndentStateEqual {
// The following case ( current position is 'd' ), some variables becomes like here
// - prevIndentColumn: 1 of 'a'
2022-01-11 20:43:39 +09:00
// - indentNumBasedIndentState: IndentStateDown because d's indentNum(1) is less than c's indentNum(3).
2022-01-11 20:37:31 +09:00
// Therefore, s.prevIndentColumn(1) == s.column(1) is true, but we want to treat this as IndentStateDown.
2022-01-11 20:43:39 +09:00
// So, we look also current indentState value by the above prevIndentNum based logic, and determins finally indentState.
2022-01-11 20:37:31 +09:00
// ---
// a:
// b
// c
// d: e
// ^
2019-10-21 15:58:46 +09:00
s . indentState = IndentStateDown
2022-01-11 20:37:31 +09:00
} else {
s . indentState = IndentStateEqual
2019-10-21 15:58:46 +09:00
}
2022-01-11 20:37:31 +09:00
} else {
s . indentState = indentNumBasedIndentState
}
}
func ( s * Scanner ) updateIndent ( ctx * Context , c rune ) {
if s . isFirstCharAtLine && s . isNewLineChar ( c ) && ctx . isDocument ( ) {
return
}
if s . isFirstCharAtLine && c == ' ' {
s . indentNum ++
return
}
if ! s . isFirstCharAtLine {
s . indentState = IndentStateKeep
return
2019-10-21 15:58:46 +09:00
}
2022-01-11 20:37:31 +09:00
s . updateIndentState ( ctx )
2019-11-07 23:45:39 +09:00
s . isFirstCharAtLine = false
2019-11-07 23:54:32 +09:00
if s . isNeededKeepPreviousIndentNum ( ctx , c ) {
2019-11-07 23:45:39 +09:00
return
}
2022-01-11 20:37:31 +09:00
if s . indentState != IndentStateUp {
s . prevIndentColumn = 0
}
2019-10-16 18:19:48 +09:00
s . prevIndentNum = s . indentNum
s . prevIndentLevel = s . indentLevel
}
// isChangedToIndentStateDown reports whether the current indent state is
// IndentStateDown.
func (s *Scanner) isChangedToIndentStateDown() bool {
	return IndentStateDown == s.indentState
}
// isChangedToIndentStateUp reports whether the current indent state is
// IndentStateUp.
func (s *Scanner) isChangedToIndentStateUp() bool {
	return IndentStateUp == s.indentState
}
// isChangedToIndentStateEqual reports whether the current indent state is
// IndentStateEqual.
func (s *Scanner) isChangedToIndentStateEqual() bool {
	return IndentStateEqual == s.indentState
}
// addBufferedTokenIfExists builds a token from the buffered text via
// bufferedToken and hands the result to ctx.addToken.
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	tk := s.bufferedToken(ctx)
	ctx.addToken(tk)
}
func ( s * Scanner ) breakLiteral ( ctx * Context ) {
2019-11-08 01:04:29 +09:00
s . docStartColumn = 0
2019-10-16 18:19:48 +09:00
ctx . breakLiteral ( )
}
2020-05-29 18:09:51 +09:00
func ( s * Scanner ) scanSingleQuote ( ctx * Context ) ( tk * token . Token , pos int ) {
ctx . addOriginBuf ( '\'' )
2021-02-18 21:18:38 +00:00
srcpos := s . pos ( )
2019-10-16 18:19:48 +09:00
startIndex := ctx . idx + 1
2020-05-29 18:09:51 +09:00
src := ctx . src
size := len ( src )
value := [ ] rune { }
2020-06-15 17:30:27 +09:00
isFirstLineChar := false
2021-03-01 15:09:40 +00:00
isNewLine := false
2020-05-29 18:09:51 +09:00
for idx := startIndex ; idx < size ; idx ++ {
2021-03-01 15:09:40 +00:00
if ! isNewLine {
s . progressColumn ( ctx , 1 )
} else {
isNewLine = false
}
2020-05-29 18:09:51 +09:00
c := src [ idx ]
2019-10-16 18:19:48 +09:00
pos = idx + 1
ctx . addOriginBuf ( c )
2020-06-15 17:30:27 +09:00
if s . isNewLineChar ( c ) {
value = append ( value , ' ' )
isFirstLineChar = true
2021-03-01 15:09:40 +00:00
isNewLine = true
s . progressLine ( ctx )
2020-06-15 17:30:27 +09:00
continue
} else if c == ' ' && isFirstLineChar {
continue
} else if c != '\'' {
2020-05-29 18:09:51 +09:00
value = append ( value , c )
2020-06-15 17:30:27 +09:00
isFirstLineChar = false
2020-05-29 18:09:51 +09:00
continue
}
if idx + 1 < len ( ctx . src ) && ctx . src [ idx + 1 ] == '\'' {
// '' handle as ' character
value = append ( value , c )
2020-05-29 18:23:08 +09:00
ctx . addOriginBuf ( c )
2020-05-29 18:09:51 +09:00
idx ++
continue
}
2021-03-01 15:09:40 +00:00
s . progressColumn ( ctx , 1 )
2021-02-18 21:18:38 +00:00
tk = token . SingleQuote ( string ( value ) , string ( ctx . obuf ) , srcpos )
2020-06-02 13:06:46 +09:00
pos = idx - startIndex + 1
2020-05-29 18:09:51 +09:00
return
}
return
}
2020-06-20 14:10:17 +09:00
// hexToInt converts one hexadecimal digit to its numeric value. Upper- and
// lower-case letters A-F are accepted; every other rune is treated as a
// decimal digit, i.e. its distance from '0' is returned without validation.
func hexToInt(b rune) int {
	switch {
	case 'A' <= b && b <= 'F':
		return 10 + int(b) - 'A'
	case 'a' <= b && b <= 'f':
		return 10 + int(b) - 'a'
	default:
		return int(b) - '0'
	}
}
// hexRunesToInt interprets b as a big-endian sequence of hexadecimal digits
// and returns the resulting number. Digits are converted with hexToInt, so
// no validation is performed. An empty slice yields 0.
func hexRunesToInt(b []rune) int {
	total := 0
	for _, digit := range b {
		// Shift the digits accumulated so far one hex place to the left and
		// add the new one.
		total = total<<4 + hexToInt(digit)
	}
	return total
}
2020-05-29 18:09:51 +09:00
func ( s * Scanner ) scanDoubleQuote ( ctx * Context ) ( tk * token . Token , pos int ) {
ctx . addOriginBuf ( '"' )
2021-02-18 21:18:38 +00:00
srcpos := s . pos ( )
2020-05-29 18:09:51 +09:00
startIndex := ctx . idx + 1
src := ctx . src
size := len ( src )
value := [ ] rune { }
2020-06-15 17:30:27 +09:00
isFirstLineChar := false
2021-03-01 15:09:40 +00:00
isNewLine := false
2020-05-29 18:09:51 +09:00
for idx := startIndex ; idx < size ; idx ++ {
2021-03-01 15:09:40 +00:00
if ! isNewLine {
s . progressColumn ( ctx , 1 )
} else {
isNewLine = false
}
2020-05-29 18:09:51 +09:00
c := src [ idx ]
pos = idx + 1
ctx . addOriginBuf ( c )
2020-06-15 17:30:27 +09:00
if s . isNewLineChar ( c ) {
value = append ( value , ' ' )
isFirstLineChar = true
2021-03-01 15:09:40 +00:00
isNewLine = true
s . progressLine ( ctx )
2020-06-15 17:30:27 +09:00
continue
} else if c == ' ' && isFirstLineChar {
continue
} else if c == '\\' {
2020-06-20 14:10:17 +09:00
isFirstLineChar = false
2020-05-29 18:09:51 +09:00
if idx + 1 < size {
nextChar := src [ idx + 1 ]
switch nextChar {
2020-06-20 14:10:17 +09:00
case 'b' :
ctx . addOriginBuf ( nextChar )
value = append ( value , '\b' )
idx ++
continue
case 'e' :
ctx . addOriginBuf ( nextChar )
value = append ( value , '\x1B' )
idx ++
continue
case 'f' :
ctx . addOriginBuf ( nextChar )
value = append ( value , '\f' )
idx ++
continue
2020-06-02 13:06:46 +09:00
case 'n' :
ctx . addOriginBuf ( nextChar )
value = append ( value , '\n' )
idx ++
continue
2020-06-20 14:10:17 +09:00
case 'v' :
ctx . addOriginBuf ( nextChar )
value = append ( value , '\v' )
idx ++
continue
case 'L' : // LS (#x2028)
ctx . addOriginBuf ( nextChar )
value = append ( value , [ ] rune { '\xE2' , '\x80' , '\xA8' } ... )
idx ++
continue
case 'N' : // NEL (#x85)
ctx . addOriginBuf ( nextChar )
value = append ( value , [ ] rune { '\xC2' , '\x85' } ... )
idx ++
continue
case 'P' : // PS (#x2029)
ctx . addOriginBuf ( nextChar )
value = append ( value , [ ] rune { '\xE2' , '\x80' , '\xA9' } ... )
idx ++
continue
case '_' : // #xA0
ctx . addOriginBuf ( nextChar )
value = append ( value , [ ] rune { '\xC2' , '\xA0' } ... )
idx ++
continue
2020-05-29 18:09:51 +09:00
case '"' :
ctx . addOriginBuf ( nextChar )
value = append ( value , nextChar )
idx ++
continue
2020-06-20 14:10:17 +09:00
case 'x' :
if idx + 3 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\x")
return
}
codeNum := hexRunesToInt ( src [ idx + 2 : idx + 4 ] )
value = append ( value , rune ( codeNum ) )
idx += 3
continue
case 'u' :
if idx + 5 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\u")
return
}
codeNum := hexRunesToInt ( src [ idx + 2 : idx + 6 ] )
value = append ( value , rune ( codeNum ) )
idx += 5
continue
case 'U' :
if idx + 9 >= size {
// TODO: need to return error
//err = xerrors.New("invalid escape character \\U")
return
}
codeNum := hexRunesToInt ( src [ idx + 2 : idx + 10 ] )
value = append ( value , rune ( codeNum ) )
idx += 9
continue
2020-05-29 18:09:51 +09:00
case '\\' :
ctx . addOriginBuf ( nextChar )
idx ++
}
2019-10-16 18:19:48 +09:00
}
2020-05-29 18:09:51 +09:00
value = append ( value , c )
continue
} else if c != '"' {
value = append ( value , c )
2020-06-15 17:30:27 +09:00
isFirstLineChar = false
2020-05-29 18:09:51 +09:00
continue
2019-10-16 18:19:48 +09:00
}
2021-03-01 15:09:40 +00:00
s . progressColumn ( ctx , 1 )
2021-02-18 21:18:38 +00:00
tk = token . DoubleQuote ( string ( value ) , string ( ctx . obuf ) , srcpos )
2020-06-02 13:06:46 +09:00
pos = idx - startIndex + 1
2020-05-29 18:09:51 +09:00
return
2019-10-16 18:19:48 +09:00
}
return
}
2020-05-29 18:09:51 +09:00
// scanQuote scans a quoted scalar: single-quoted when ch is '\'', otherwise
// double-quoted. It returns whatever the selected scanner returns.
func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) {
	switch ch {
	case '\'':
		tk, pos = s.scanSingleQuote(ctx)
	default:
		tk, pos = s.scanDoubleQuote(ctx)
	}
	return tk, pos
}
2021-03-01 17:32:11 +09:00
// isMergeKey reports whether the scanner is looking at a merge key: exactly
// two '<' characters, optional spaces, then a ':' that is followed by a space
// or a line break.
func (s *Scanner) isMergeKey(ctx *Context) bool {
	if ctx.repeatNum('<') != 2 {
		return false
	}
	rest := ctx.src
	last := len(rest) - 1
	for i := ctx.idx + 2; i <= last; i++ {
		switch rest[i] {
		case ' ':
			// spaces between "<<" and ':' are allowed
		case ':':
			if i < last {
				if next := rest[i+1]; next == ' ' || s.isNewLineChar(next) {
					return true
				}
			}
			// otherwise keep looking at the following character
		default:
			return false
		}
	}
	return false
}
2019-10-16 18:19:48 +09:00
func ( s * Scanner ) scanTag ( ctx * Context ) ( tk * token . Token , pos int ) {
ctx . addOriginBuf ( '!' )
ctx . progress ( 1 ) // skip '!' character
for idx , c := range ctx . src [ ctx . idx : ] {
pos = idx + 1
ctx . addOriginBuf ( c )
switch c {
2019-12-22 11:59:49 +09:00
case ' ' , '\n' , '\r' :
2019-10-16 18:19:48 +09:00
value := ctx . source ( ctx . idx - 1 , ctx . idx + idx )
tk = token . Tag ( value , string ( ctx . obuf ) , s . pos ( ) )
2019-11-07 19:16:19 +09:00
pos = len ( [ ] rune ( value ) )
2019-10-16 18:19:48 +09:00
return
}
}
return
}
func ( s * Scanner ) scanComment ( ctx * Context ) ( tk * token . Token , pos int ) {
ctx . addOriginBuf ( '#' )
ctx . progress ( 1 ) // skip '#' character
for idx , c := range ctx . src [ ctx . idx : ] {
pos = idx + 1
ctx . addOriginBuf ( c )
switch c {
2019-12-22 11:59:49 +09:00
case '\n' , '\r' :
2019-10-16 18:19:48 +09:00
if ctx . previousChar ( ) == '\\' {
continue
}
value := ctx . source ( ctx . idx , ctx . idx + idx )
tk = token . Comment ( value , string ( ctx . obuf ) , s . pos ( ) )
2019-11-07 19:16:19 +09:00
pos = len ( [ ] rune ( value ) ) + 1
2019-10-16 18:19:48 +09:00
return
}
}
return
}
2021-07-19 18:48:09 +09:00
// trimCommentFromLiteralOpt strips a trailing "# comment" from the option
// part of a block scalar header (the text after '|' or '>').
//
// If text contains no '#', it is returned unchanged. Otherwise the '#' must
// be preceded by at least one space; the result is the text before the
// comment with all spaces separating it from the '#' removed. A '#' at the
// very start of text, or directly attached to an indicator (e.g. "-#x"), is
// reported as an invalid header rather than silently discarding the
// character in front of it.
func trimCommentFromLiteralOpt(text string) (string, error) {
	idx := strings.Index(text, "#")
	if idx < 0 {
		return text, nil
	}
	if idx == 0 || text[idx-1] != ' ' {
		return "", xerrors.New("invalid literal header")
	}
	return strings.TrimRight(text[:idx], " "), nil
}
2019-10-16 18:19:48 +09:00
func ( s * Scanner ) scanLiteral ( ctx * Context , c rune ) {
2019-11-08 16:48:54 +09:00
ctx . addOriginBuf ( c )
2019-10-16 18:19:48 +09:00
if ctx . isEOS ( ) {
2020-06-15 20:16:51 +09:00
if ctx . isLiteral {
2020-03-07 12:03:06 +09:00
ctx . addBuf ( c )
}
2019-10-16 18:19:48 +09:00
value := ctx . bufferedSrc ( )
2020-06-01 12:54:23 +09:00
ctx . addToken ( token . String ( string ( value ) , string ( ctx . obuf ) , s . pos ( ) ) )
2019-11-07 23:45:39 +09:00
ctx . resetBuffer ( )
2019-11-08 16:48:54 +09:00
s . progressColumn ( ctx , 1 )
2019-12-22 11:59:49 +09:00
} else if s . isNewLineChar ( c ) {
2019-10-16 18:19:48 +09:00
if ctx . isLiteral {
ctx . addBuf ( c )
} else {
ctx . addBuf ( ' ' )
}
s . progressLine ( ctx )
} else if s . isFirstCharAtLine && c == ' ' {
2019-11-08 01:04:29 +09:00
if 0 < s . docStartColumn && s . docStartColumn <= s . column {
ctx . addBuf ( c )
}
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
} else {
2019-11-08 01:04:29 +09:00
if s . docStartColumn == 0 {
s . docStartColumn = s . column
}
2019-10-16 18:19:48 +09:00
ctx . addBuf ( c )
s . progressColumn ( ctx , 1 )
}
}
func ( s * Scanner ) scanLiteralHeader ( ctx * Context ) ( pos int , err error ) {
header := ctx . currentChar ( )
ctx . addOriginBuf ( header )
2020-06-17 11:39:08 +09:00
ctx . progress ( 1 ) // skip '|' or '>' character
2019-10-16 18:19:48 +09:00
for idx , c := range ctx . src [ ctx . idx : ] {
pos = idx
ctx . addOriginBuf ( c )
switch c {
2019-12-22 11:59:49 +09:00
case '\n' , '\r' :
2019-10-16 18:19:48 +09:00
value := ctx . source ( ctx . idx , ctx . idx + idx )
opt := strings . TrimRight ( value , " " )
2021-07-19 18:48:09 +09:00
orgOptLen := len ( opt )
opt , err = trimCommentFromLiteralOpt ( opt )
if err != nil {
return
}
2019-10-16 18:19:48 +09:00
switch opt {
case "" , "+" , "-" ,
"0" , "1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" :
2021-07-19 18:48:09 +09:00
hasComment := len ( opt ) < orgOptLen
2019-10-16 18:19:48 +09:00
if header == '|' {
2021-07-19 18:48:09 +09:00
if hasComment {
commentLen := orgOptLen - len ( opt )
headerPos := strings . Index ( string ( ctx . obuf ) , "|" )
litBuf := ctx . obuf [ : len ( ctx . obuf ) - commentLen - headerPos ]
commentBuf := ctx . obuf [ len ( litBuf ) : ]
ctx . addToken ( token . Literal ( "|" + opt , string ( litBuf ) , s . pos ( ) ) )
s . column += len ( litBuf )
s . offset += len ( litBuf )
commentHeader := strings . Index ( value , "#" )
ctx . addToken ( token . Comment ( string ( value [ commentHeader + 1 : ] ) , string ( commentBuf ) , s . pos ( ) ) )
} else {
ctx . addToken ( token . Literal ( "|" + opt , string ( ctx . obuf ) , s . pos ( ) ) )
}
2019-10-16 18:19:48 +09:00
ctx . isLiteral = true
} else if header == '>' {
2021-07-19 18:48:09 +09:00
if hasComment {
commentLen := orgOptLen - len ( opt )
headerPos := strings . Index ( string ( ctx . obuf ) , ">" )
foldedBuf := ctx . obuf [ : len ( ctx . obuf ) - commentLen - headerPos ]
commentBuf := ctx . obuf [ len ( foldedBuf ) : ]
ctx . addToken ( token . Folded ( ">" + opt , string ( foldedBuf ) , s . pos ( ) ) )
s . column += len ( foldedBuf )
s . offset += len ( foldedBuf )
commentHeader := strings . Index ( value , "#" )
ctx . addToken ( token . Comment ( string ( value [ commentHeader + 1 : ] ) , string ( commentBuf ) , s . pos ( ) ) )
} else {
ctx . addToken ( token . Folded ( ">" + opt , string ( ctx . obuf ) , s . pos ( ) ) )
}
2019-10-16 18:19:48 +09:00
ctx . isFolded = true
}
2019-11-08 16:48:54 +09:00
s . indentState = IndentStateKeep
2019-10-16 18:19:48 +09:00
ctx . resetBuffer ( )
ctx . literalOpt = opt
return
}
break
}
}
err = xerrors . New ( "invalid literal header" )
return
}
func ( s * Scanner ) scanNewLine ( ctx * Context , c rune ) {
if len ( ctx . buf ) > 0 && s . savedPos == nil {
s . savedPos = s . pos ( )
2019-12-28 22:37:50 +09:00
s . savedPos . Column -= len ( ctx . bufferedSrc ( ) )
2019-10-16 18:19:48 +09:00
}
2019-12-11 17:17:05 +09:00
// if the following case, origin buffer has unnecessary two spaces.
// So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too.
// ---
// a:[space][space]
// b: c
removedNum := ctx . removeRightSpaceFromBuf ( )
if removedNum > 0 {
s . column -= removedNum
s . offset -= removedNum
2019-12-28 22:20:45 +09:00
if s . savedPos != nil {
s . savedPos . Column -= removedNum
}
2019-12-11 17:17:05 +09:00
}
2019-10-16 18:19:48 +09:00
if ctx . isEOS ( ) {
s . addBufferedTokenIfExists ( ctx )
2019-10-17 15:44:55 +09:00
} else if s . isAnchor {
s . addBufferedTokenIfExists ( ctx )
2019-10-16 18:19:48 +09:00
}
ctx . addBuf ( ' ' )
ctx . addOriginBuf ( c )
2019-12-04 23:58:06 +09:00
ctx . isSingleLine = false
2019-10-16 18:19:48 +09:00
s . progressLine ( ctx )
}
// scan consumes characters from ctx until one or more tokens are complete (or
// the input of ctx is exhausted) and returns the position value that Scan adds
// to s.sourcePos.
//
// For every character the indentation bookkeeping is refreshed first. While
// ctx is in document mode the character is handed to scanLiteral unless the
// indentation dropped back to (or below) the previous level, which ends the
// block scalar. Outside document mode a change of indentation may flush the
// buffered text as a token. The character is then dispatched on: indicator
// characters add their token and return, while everything else is appended to
// the value and origin buffers.
func ( s * Scanner ) scan ( ctx * Context ) ( pos int ) {
for ctx . next ( ) {
pos = ctx . nextPos ( )
c := ctx . currentChar ( )
2019-11-07 23:45:39 +09:00
s . updateIndent ( ctx , c )
// Block scalar content is handled by scanLiteral until the indentation
// returns to the level recorded before it; then the scalar is flushed.
if ctx . isDocument ( ) {
if s . isChangedToIndentStateEqual ( ) ||
s . isChangedToIndentStateDown ( ) {
s . addBufferedTokenIfExists ( ctx )
s . breakLiteral ( ctx )
} else {
s . scanLiteral ( ctx , c )
continue
}
} else if s . isChangedToIndentStateDown ( ) {
2019-10-16 18:19:48 +09:00
s . addBufferedTokenIfExists ( ctx )
2019-10-25 15:03:08 +09:00
} else if s . isChangedToIndentStateEqual ( ) {
2019-12-22 11:59:49 +09:00
// if first character is new line character, buffer expect to raw folded literal
2019-12-22 17:28:30 +09:00
if len ( ctx . obuf ) > 0 && s . newLineCount ( ctx . obuf ) <= 1 {
2019-10-25 15:03:08 +09:00
// doesn't raw folded literal
s . addBufferedTokenIfExists ( ctx )
}
2019-10-16 18:19:48 +09:00
}
// Dispatch on the character. A case whose condition does not hold falls
// out of the switch, and the character is buffered as plain text below.
switch c {
case '{' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-06 19:28:47 +09:00
ctx . addOriginBuf ( c )
ctx . addToken ( token . MappingStart ( string ( ctx . obuf ) , s . pos ( ) ) )
2019-11-07 13:00:00 +09:00
s . startedFlowMapNum ++
2019-11-06 19:28:47 +09:00
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case '}' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) || s . startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
ctx . addToken ( s . bufferedToken ( ctx ) )
ctx . addOriginBuf ( c )
ctx . addToken ( token . MappingEnd ( string ( ctx . obuf ) , s . pos ( ) ) )
2019-11-07 13:00:00 +09:00
s . startedFlowMapNum --
2019-11-06 19:28:47 +09:00
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case '.' :
2021-01-18 13:33:07 -06:00
// "..." at the very start of a line is a document end marker.
if s . indentNum == 0 && s . column == 1 && ctx . repeatNum ( '.' ) == 3 {
2021-03-01 19:15:44 +09:00
ctx . addToken ( token . DocumentEnd ( string ( ctx . obuf ) + "..." , s . pos ( ) ) )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 3 )
pos += 2
return
}
case '<' :
2021-03-01 17:32:11 +09:00
if s . isMergeKey ( ctx ) {
2019-10-23 20:22:14 +09:00
s . prevIndentColumn = s . column
2019-10-17 01:58:10 +09:00
ctx . addToken ( token . MergeKey ( string ( ctx . obuf ) + "<<" , s . pos ( ) ) )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
pos ++
return
}
case '-' :
2021-01-18 13:33:07 -06:00
// "---" at the very start of a line is a document header.
if s . indentNum == 0 && s . column == 1 && ctx . repeatNum ( '-' ) == 3 {
2019-10-16 18:19:48 +09:00
s . addBufferedTokenIfExists ( ctx )
2021-03-01 19:15:44 +09:00
ctx . addToken ( token . DocumentHeader ( string ( ctx . obuf ) + "---" , s . pos ( ) ) )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 3 )
pos += 2
return
}
2019-12-28 22:37:50 +09:00
if ctx . existsBuffer ( ) && s . isChangedToIndentStateUp ( ) {
2019-10-16 18:19:48 +09:00
// raw folded
ctx . isRawFolded = true
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
continue
}
2020-03-07 12:03:06 +09:00
if ctx . existsBuffer ( ) {
2019-12-04 23:58:06 +09:00
// '-' is literal
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
continue
}
2019-10-16 18:19:48 +09:00
nc := ctx . nextChar ( )
2020-03-07 20:17:54 +09:00
if nc == ' ' || s . isNewLineChar ( nc ) {
2019-10-16 18:19:48 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
2019-10-23 20:22:14 +09:00
tk := token . SequenceEntry ( string ( ctx . obuf ) , s . pos ( ) )
s . prevIndentColumn = tk . Position . Column
ctx . addToken ( tk )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
return
}
case '[' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-06 19:28:47 +09:00
ctx . addOriginBuf ( c )
ctx . addToken ( token . SequenceStart ( string ( ctx . obuf ) , s . pos ( ) ) )
2019-11-07 13:00:00 +09:00
s . startedFlowSequenceNum ++
2019-11-06 19:28:47 +09:00
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case ']' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) || s . startedFlowSequenceNum > 0 {
2019-11-06 19:28:47 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
ctx . addToken ( token . SequenceEnd ( string ( ctx . obuf ) , s . pos ( ) ) )
2019-11-07 13:00:00 +09:00
s . startedFlowSequenceNum --
2019-11-06 19:28:47 +09:00
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case ',' :
2019-11-07 13:00:00 +09:00
// ',' is an entry separator only inside a flow sequence or flow mapping.
if s . startedFlowSequenceNum > 0 || s . startedFlowMapNum > 0 {
2019-11-06 19:28:47 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
ctx . addToken ( token . CollectEntry ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case ':' :
nc := ctx . nextChar ( )
2020-07-02 12:51:30 +09:00
if s . startedFlowMapNum > 0 || nc == ' ' || s . isNewLineChar ( nc ) || ctx . isNextEOS ( ) {
2019-10-16 18:19:48 +09:00
// mapping value
2019-10-17 15:44:55 +09:00
tk := s . bufferedToken ( ctx )
if tk != nil {
2019-10-23 20:22:14 +09:00
s . prevIndentColumn = tk . Position . Column
2019-10-17 15:44:55 +09:00
ctx . addToken ( tk )
2022-12-02 04:02:53 +09:00
} else if tk := ctx . lastToken ( ) ; tk != nil {
// If the map key is quote, the buffer does not exist because it has already been cut into tokens.
// Therefore, we need to check the last token.
if tk . Indicator == token . QuotedScalarIndicator {
s . prevIndentColumn = tk . Position . Column
}
2019-10-17 15:44:55 +09:00
}
2019-10-16 18:19:48 +09:00
ctx . addToken ( token . MappingValue ( s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
return
}
case '|' , '>' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-10-16 18:19:48 +09:00
progress , err := s . scanLiteralHeader ( ctx )
if err != nil {
// TODO: returns syntax error object
return
}
s . progressColumn ( ctx , progress )
s . progressLine ( ctx )
continue
}
case '!' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-06 19:28:47 +09:00
// NOTE: the local variable named token shadows the token package inside this block.
token , progress := s . scanTag ( ctx )
ctx . addToken ( token )
s . progressColumn ( ctx , progress )
2019-12-22 11:59:49 +09:00
if c := ctx . previousChar ( ) ; s . isNewLineChar ( c ) {
2019-11-06 19:28:47 +09:00
s . progressLine ( ctx )
}
pos += progress
return
2019-10-16 18:19:48 +09:00
}
case '%' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) && s . indentNum == 0 {
2021-03-01 19:15:44 +09:00
ctx . addToken ( token . Directive ( string ( ctx . obuf ) + "%" , s . pos ( ) ) )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
return
}
case '?' :
nc := ctx . nextChar ( )
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) && nc == ' ' {
2020-07-02 17:22:04 +09:00
ctx . addToken ( token . MappingKey ( s . pos ( ) ) )
2019-10-16 18:19:48 +09:00
s . progressColumn ( ctx , 1 )
return
}
case '&' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-06 19:28:47 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
ctx . addToken ( token . Anchor ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
s . isAnchor = true
return
}
2019-10-16 18:19:48 +09:00
case '*' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-06 19:28:47 +09:00
s . addBufferedTokenIfExists ( ctx )
ctx . addOriginBuf ( c )
ctx . addToken ( token . Alias ( string ( ctx . obuf ) , s . pos ( ) ) )
s . progressColumn ( ctx , 1 )
return
}
2019-10-16 18:19:48 +09:00
case '#' :
2019-12-28 22:37:50 +09:00
// '#' starts a comment only at the start of a value or after a space.
if ! ctx . existsBuffer ( ) || ctx . previousChar ( ) == ' ' {
2019-11-12 19:50:58 +09:00
s . addBufferedTokenIfExists ( ctx )
token , progress := s . scanComment ( ctx )
ctx . addToken ( token )
s . progressColumn ( ctx , progress )
s . progressLine ( ctx )
pos += progress
return
}
2019-10-16 18:19:48 +09:00
case '\'' , '"' :
2019-12-28 22:37:50 +09:00
if ! ctx . existsBuffer ( ) {
2019-11-07 18:08:12 +09:00
token , progress := s . scanQuote ( ctx , c )
ctx . addToken ( token )
pos += progress
2022-12-02 17:03:39 +09:00
// If the non-whitespace character immediately following the quote is ':', the quote should be treated as a map key.
2022-12-02 04:02:53 +09:00
// Therefore, do not return and continue processing as a normal map key.
2022-12-02 17:03:39 +09:00
if ctx . currentCharWithSkipWhitespace ( ) == ':' {
2022-12-02 04:02:53 +09:00
continue
}
2019-11-07 18:08:12 +09:00
return
}
2019-11-09 17:20:39 +09:00
case '\r' , '\n' :
// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
// > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
// > Outside scalar content, YAML allows any line break to be used to terminate lines.
// > -- https://yaml.org/spec/1.2/spec.html
if c == '\r' && ctx . nextChar ( ) == '\n' {
ctx . addOriginBuf ( '\r' )
ctx . progress ( 1 )
c = '\n'
}
2019-10-16 18:19:48 +09:00
s . scanNewLine ( ctx , c )
continue
case ' ' :
// A space is buffered as content in save-indent mode, or in the middle of a line when no anchor name is pending.
if ctx . isSaveIndentMode ( ) || ( ! s . isAnchor && ! s . isFirstCharAtLine ) {
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
continue
}
// Leading indentation is recorded in the origin buffer only.
if s . isFirstCharAtLine {
s . progressColumn ( ctx , 1 )
ctx . addOriginBuf ( c )
continue
}
// Remaining case: the space terminates an anchor name.
s . addBufferedTokenIfExists ( ctx )
2021-03-01 18:49:34 +09:00
pos -- // to rescan white space at next scanning for adding white space to next buffer.
2019-10-16 18:19:48 +09:00
s . isAnchor = false
return
}
// Not handled as an indicator: the character is part of a plain value.
ctx . addBuf ( c )
ctx . addOriginBuf ( c )
s . progressColumn ( ctx , 1 )
}
2019-10-23 03:21:42 +09:00
// Input of ctx exhausted: flush whatever is still buffered.
s . addBufferedTokenIfExists ( ctx )
2019-10-16 18:19:48 +09:00
return
}
2019-10-21 12:53:30 +09:00
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
2019-12-29 11:37:20 +09:00
func ( s * Scanner ) Init ( text string ) {
src := [ ] rune ( text )
2019-10-16 18:19:48 +09:00
s . source = src
s . sourcePos = 0
s . sourceSize = len ( src )
s . line = 1
s . column = 1
s . offset = 1
s . prevIndentLevel = 0
s . prevIndentNum = 0
2019-10-23 20:22:14 +09:00
s . prevIndentColumn = 0
2019-10-16 18:19:48 +09:00
s . indentLevel = 0
s . indentNum = 0
s . isFirstCharAtLine = true
}
2019-10-21 12:53:30 +09:00
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
2019-10-16 18:19:48 +09:00
func ( s * Scanner ) Scan ( ) ( token . Tokens , error ) {
if s . sourcePos >= s . sourceSize {
return nil , io . EOF
}
ctx := newContext ( s . source [ s . sourcePos : ] )
2019-12-29 11:47:34 +09:00
defer ctx . release ( )
2019-10-16 18:19:48 +09:00
progress := s . scan ( ctx )
s . sourcePos += progress
2019-12-29 11:47:34 +09:00
var tokens token . Tokens
tokens = append ( tokens , ctx . tokens ... )
return tokens , nil
2019-10-16 18:19:48 +09:00
}