package scanner import ( "errors" "fmt" "io" "strconv" "strings" "github.com/goccy/go-yaml/token" ) // IndentState state for indent type IndentState int const ( // IndentStateEqual equals previous indent IndentStateEqual IndentState = iota // IndentStateUp more indent than previous IndentStateUp // IndentStateDown less indent than previous IndentStateDown // IndentStateKeep uses not indent token IndentStateKeep ) // Scanner holds the scanner's internal state while processing a given text. // It can be allocated as part of another data structure but must be initialized via Init before use. type Scanner struct { source []rune sourcePos int sourceSize int // line number. This number starts from 1. line int // column number. This number starts from 1. column int // offset represents the offset from the beginning of the source. offset int // lastDelimColumn is the last column needed to compare indent is retained. lastDelimColumn int // indentNum indicates the number of spaces used for indentation. indentNum int // prevLineIndentNum indicates the number of spaces used for indentation at previous line. prevLineIndentNum int // indentLevel indicates the level of indent depth. This value does not match the column value. indentLevel int isFirstCharAtLine bool isAnchor bool isAlias bool isDirective bool startedFlowSequenceNum int startedFlowMapNum int indentState IndentState savedPos *token.Position } func (s *Scanner) pos() *token.Position { return &token.Position{ Line: s.line, Column: s.column, Offset: s.offset, IndentNum: s.indentNum, IndentLevel: s.indentLevel, } } func (s *Scanner) bufferedToken(ctx *Context) *token.Token { if s.savedPos != nil { tk := ctx.bufferedToken(s.savedPos) s.savedPos = nil return tk } line := s.line column := s.column - len(ctx.buf) level := s.indentLevel if ctx.isMultiLine() { line -= s.newLineCount(ctx.buf) column = strings.Index(string(ctx.obuf), string(ctx.buf)) + 1 // Since we are in a literal, folded or raw folded // we can use the indent level from the last token. last := ctx.lastToken() if last != nil { // The last token should never be nil here. level = last.Position.IndentLevel + 1 } } return ctx.bufferedToken(&token.Position{ Line: line, Column: column, Offset: s.offset - len(ctx.buf), IndentNum: s.indentNum, IndentLevel: level, }) } func (s *Scanner) progressColumn(ctx *Context, num int) { s.column += num s.offset += num s.progress(ctx, num) } func (s *Scanner) progressLine(ctx *Context) { s.prevLineIndentNum = s.indentNum s.column = 1 s.line++ s.offset++ s.indentNum = 0 s.isFirstCharAtLine = true s.isAnchor = false s.isAlias = false s.isDirective = false s.progress(ctx, 1) } func (s *Scanner) progress(ctx *Context, num int) { ctx.progress(num) s.sourcePos += num } func (s *Scanner) isNewLineChar(c rune) bool { if c == '\n' { return true } if c == '\r' { return true } return false } func (s *Scanner) newLineCount(src []rune) int { size := len(src) cnt := 0 for i := 0; i < size; i++ { c := src[i] switch c { case '\r': if i+1 < size && src[i+1] == '\n' { i++ } cnt++ case '\n': cnt++ } } return cnt } func (s *Scanner) updateIndentLevel() { if s.prevLineIndentNum < s.indentNum { s.indentLevel++ } else if s.prevLineIndentNum > s.indentNum { if s.indentLevel > 0 { s.indentLevel-- } } } func (s *Scanner) updateIndentState(ctx *Context) { if s.lastDelimColumn == 0 { return } if s.lastDelimColumn < s.column { s.indentState = IndentStateUp } else { // If lastDelimColumn and s.column are the same, // treat as Down state since it is the same column as delimiter. s.indentState = IndentStateDown } } func (s *Scanner) updateIndent(ctx *Context, c rune) { if s.isFirstCharAtLine && s.isNewLineChar(c) { return } if s.isFirstCharAtLine && c == ' ' { s.indentNum++ return } if s.isFirstCharAtLine && c == '\t' { // found tab indent. // In this case, scanTab returns error. return } if !s.isFirstCharAtLine { s.indentState = IndentStateKeep return } s.updateIndentLevel() s.updateIndentState(ctx) s.isFirstCharAtLine = false } func (s *Scanner) isChangedToIndentStateDown() bool { return s.indentState == IndentStateDown } func (s *Scanner) isChangedToIndentStateUp() bool { return s.indentState == IndentStateUp } func (s *Scanner) addBufferedTokenIfExists(ctx *Context) { ctx.addToken(s.bufferedToken(ctx)) } func (s *Scanner) breakMultiLine(ctx *Context) { ctx.breakMultiLine() } func (s *Scanner) scanSingleQuote(ctx *Context) (*token.Token, error) { ctx.addOriginBuf('\'') srcpos := s.pos() startIndex := ctx.idx + 1 src := ctx.src size := len(src) value := []rune{} isFirstLineChar := false isNewLine := false for idx := startIndex; idx < size; idx++ { if !isNewLine { s.progressColumn(ctx, 1) } else { isNewLine = false } c := src[idx] ctx.addOriginBuf(c) if s.isNewLineChar(c) { notSpaceIdx := -1 for i := len(value) - 1; i >= 0; i-- { if value[i] == ' ' { continue } notSpaceIdx = i break } if len(value) > notSpaceIdx { value = value[:notSpaceIdx+1] } if isFirstLineChar { value = append(value, '\n') } else { value = append(value, ' ') } isFirstLineChar = true isNewLine = true s.progressLine(ctx) if idx+1 < size { if err := s.validateDocumentSeparatorMarker(ctx, src[idx+1:]); err != nil { return nil, err } } continue } else if isFirstLineChar && c == ' ' { continue } else if isFirstLineChar && c == '\t' { if s.lastDelimColumn >= s.column { return nil, ErrInvalidToken( token.Invalid( "tab character cannot be used for indentation in single-quoted text", string(ctx.obuf), s.pos(), ), ) } continue } else if c != '\'' { value = append(value, c) isFirstLineChar = false continue } else if idx+1 < len(ctx.src) && ctx.src[idx+1] == '\'' { // '' handle as ' character value = append(value, c) ctx.addOriginBuf(c) idx++ s.progressColumn(ctx, 1) continue } s.progressColumn(ctx, 1) return token.SingleQuote(string(value), string(ctx.obuf), srcpos), nil } s.progressColumn(ctx, 1) return nil, ErrInvalidToken( token.Invalid( "could not find end character of single-quoted text", string(ctx.obuf), srcpos, ), ) } func hexToInt(b rune) int { if b >= 'A' && b <= 'F' { return int(b) - 'A' + 10 } if b >= 'a' && b <= 'f' { return int(b) - 'a' + 10 } return int(b) - '0' } func hexRunesToInt(b []rune) int { sum := 0 for i := 0; i < len(b); i++ { sum += hexToInt(b[i]) << (uint(len(b)-i-1) * 4) } return sum } func (s *Scanner) scanDoubleQuote(ctx *Context) (*token.Token, error) { ctx.addOriginBuf('"') srcpos := s.pos() startIndex := ctx.idx + 1 src := ctx.src size := len(src) value := []rune{} isFirstLineChar := false isNewLine := false for idx := startIndex; idx < size; idx++ { if !isNewLine { s.progressColumn(ctx, 1) } else { isNewLine = false } c := src[idx] ctx.addOriginBuf(c) if s.isNewLineChar(c) { notSpaceIdx := -1 for i := len(value) - 1; i >= 0; i-- { if value[i] == ' ' { continue } notSpaceIdx = i break } if len(value) > notSpaceIdx { value = value[:notSpaceIdx+1] } if isFirstLineChar { value = append(value, '\n') } else { value = append(value, ' ') } isFirstLineChar = true isNewLine = true s.progressLine(ctx) if idx+1 < size { if err := s.validateDocumentSeparatorMarker(ctx, src[idx+1:]); err != nil { return nil, err } } continue } else if isFirstLineChar && c == ' ' { continue } else if isFirstLineChar && c == '\t' { if s.lastDelimColumn >= s.column { return nil, ErrInvalidToken( token.Invalid( "tab character cannot be used for indentation in double-quoted text", string(ctx.obuf), s.pos(), ), ) } continue } else if c == '\\' { isFirstLineChar = false if idx+1 >= size { value = append(value, c) continue } nextChar := src[idx+1] progress := 0 switch nextChar { case '0': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x00) case 'a': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x07) case 'b': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x08) case 't': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x09) case 'n': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x0A) case 'v': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x0B) case 'f': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x0C) case 'r': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x0D) case 'e': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x1B) case ' ': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x20) case '"': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x22) case '/': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x2F) case '\\': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x5C) case 'N': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x85) case '_': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0xA0) case 'L': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x2028) case 'P': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, 0x2029) case 'x': if idx+3 >= size { progress = 1 ctx.addOriginBuf(nextChar) value = append(value, nextChar) } else { progress = 3 codeNum := hexRunesToInt(src[idx+2 : idx+progress+1]) value = append(value, rune(codeNum)) } case 'u': // \u0000 style must have 5 characters at least. if idx+5 >= size { return nil, ErrInvalidToken( token.Invalid( "not enough length for escaped UTF-16 character", string(ctx.obuf), s.pos(), ), ) } progress = 5 codeNum := hexRunesToInt(src[idx+2 : idx+6]) // handle surrogate pairs. if codeNum >= 0xD800 && codeNum <= 0xDBFF { high := codeNum // \u0000\u0000 style must have 11 characters at least. if idx+11 >= size { return nil, ErrInvalidToken( token.Invalid( "not enough length for escaped UTF-16 surrogate pair", string(ctx.obuf), s.pos(), ), ) } if src[idx+6] != '\\' || src[idx+7] != 'u' { return nil, ErrInvalidToken( token.Invalid( "found unexpected character after high surrogate for UTF-16 surrogate pair", string(ctx.obuf), s.pos(), ), ) } low := hexRunesToInt(src[idx+8 : idx+12]) if low < 0xDC00 || low > 0xDFFF { return nil, ErrInvalidToken( token.Invalid( "found unexpected low surrogate after high surrogate", string(ctx.obuf), s.pos(), ), ) } codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000 progress += 6 } value = append(value, rune(codeNum)) case 'U': // \U00000000 style must have 9 characters at least. if idx+9 >= size { return nil, ErrInvalidToken( token.Invalid( "not enough length for escaped UTF-32 character", string(ctx.obuf), s.pos(), ), ) } progress = 9 codeNum := hexRunesToInt(src[idx+2 : idx+10]) value = append(value, rune(codeNum)) case '\n': isFirstLineChar = true isNewLine = true ctx.addOriginBuf(nextChar) s.progressColumn(ctx, 1) s.progressLine(ctx) idx++ continue case '\t': progress = 1 ctx.addOriginBuf(nextChar) value = append(value, nextChar) default: s.progressColumn(ctx, 1) return nil, ErrInvalidToken( token.Invalid( fmt.Sprintf("found unknown escape character %q", nextChar), string(ctx.obuf), s.pos(), ), ) } idx += progress s.progressColumn(ctx, progress) continue } else if c == '\t' { var ( foundNotSpaceChar bool progress int ) for i := idx + 1; i < size; i++ { if src[i] == ' ' || src[i] == '\t' { progress++ continue } if s.isNewLineChar(src[i]) { break } foundNotSpaceChar = true } if foundNotSpaceChar { value = append(value, c) if src[idx+1] != '"' { s.progressColumn(ctx, 1) } } else { idx += progress s.progressColumn(ctx, progress) } continue } else if c != '"' { value = append(value, c) isFirstLineChar = false continue } s.progressColumn(ctx, 1) return token.DoubleQuote(string(value), string(ctx.obuf), srcpos), nil } s.progressColumn(ctx, 1) return nil, ErrInvalidToken( token.Invalid( "could not find end character of double-quoted text", string(ctx.obuf), srcpos, ), ) } func (s *Scanner) validateDocumentSeparatorMarker(ctx *Context, src []rune) error { if s.foundDocumentSeparatorMarker(src) { return ErrInvalidToken( token.Invalid("found unexpected document separator", string(ctx.obuf), s.pos()), ) } return nil } func (s *Scanner) foundDocumentSeparatorMarker(src []rune) bool { if len(src) < 3 { return false } var marker string if len(src) == 3 { marker = string(src) } else { marker = strings.TrimRightFunc(string(src[:4]), func(r rune) bool { return r == ' ' || r == '\t' || r == '\n' || r == '\r' }) } return marker == "---" || marker == "..." } func (s *Scanner) scanQuote(ctx *Context, ch rune) (bool, error) { if ctx.existsBuffer() { return false, nil } if ch == '\'' { tk, err := s.scanSingleQuote(ctx) if err != nil { return false, err } ctx.addToken(tk) } else { tk, err := s.scanDoubleQuote(ctx) if err != nil { return false, err } ctx.addToken(tk) } ctx.clear() return true, nil } func (s *Scanner) scanWhiteSpace(ctx *Context) bool { if ctx.isMultiLine() { return false } if !s.isAnchor && !s.isDirective && !s.isAlias && !s.isFirstCharAtLine { return false } if s.isFirstCharAtLine { s.progressColumn(ctx, 1) ctx.addOriginBuf(' ') return true } if s.isDirective { s.addBufferedTokenIfExists(ctx) s.progressColumn(ctx, 1) ctx.addOriginBuf(' ') return true } s.addBufferedTokenIfExists(ctx) s.isAnchor = false s.isAlias = false return true } func (s *Scanner) isMergeKey(ctx *Context) bool { if ctx.repeatNum('<') != 2 { return false } src := ctx.src size := len(src) for idx := ctx.idx + 2; idx < size; idx++ { c := src[idx] if c == ' ' { continue } if c != ':' { return false } if idx+1 < size { nc := src[idx+1] if nc == ' ' || s.isNewLineChar(nc) { return true } } } return false } func (s *Scanner) scanTag(ctx *Context) (bool, error) { if ctx.existsBuffer() || s.isDirective { return false, nil } ctx.addOriginBuf('!') s.progress(ctx, 1) // skip '!' character var progress int for idx, c := range ctx.src[ctx.idx:] { progress = idx + 1 switch c { case ' ': ctx.addOriginBuf(c) value := ctx.source(ctx.idx-1, ctx.idx+idx) ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos())) s.progressColumn(ctx, len([]rune(value))) ctx.clear() return true, nil case ',': if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 { value := ctx.source(ctx.idx-1, ctx.idx+idx) ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos())) s.progressColumn(ctx, len([]rune(value))-1) // progress column before collect-entry for scanning it at scanFlowEntry function. ctx.clear() return true, nil } else { ctx.addOriginBuf(c) } case '\n', '\r': ctx.addOriginBuf(c) value := ctx.source(ctx.idx-1, ctx.idx+idx) ctx.addToken(token.Tag(value, string(ctx.obuf), s.pos())) s.progressColumn(ctx, len([]rune(value))-1) // progress column before new-line-char for scanning new-line-char at scanNewLine function. ctx.clear() return true, nil case '{', '}': ctx.addOriginBuf(c) s.progressColumn(ctx, progress) invalidTk := token.Invalid(fmt.Sprintf("found invalid tag character %q", c), string(ctx.obuf), s.pos()) return false, ErrInvalidToken(invalidTk) default: ctx.addOriginBuf(c) } } s.progressColumn(ctx, progress) ctx.clear() return true, nil } func (s *Scanner) scanComment(ctx *Context) bool { if ctx.existsBuffer() { c := ctx.previousChar() if c != ' ' && c != '\t' && !s.isNewLineChar(c) { return false } } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('#') s.progress(ctx, 1) // skip '#' character for idx, c := range ctx.src[ctx.idx:] { ctx.addOriginBuf(c) if !s.isNewLineChar(c) { continue } if ctx.previousChar() == '\\' { continue } value := ctx.source(ctx.idx, ctx.idx+idx) progress := len([]rune(value)) ctx.addToken(token.Comment(value, string(ctx.obuf), s.pos())) s.progressColumn(ctx, progress) s.progressLine(ctx) ctx.clear() return true } // document ends with comment. value := string(ctx.src[ctx.idx:]) ctx.addToken(token.Comment(value, string(ctx.obuf), s.pos())) progress := len([]rune(value)) s.progressColumn(ctx, progress) s.progressLine(ctx) ctx.clear() return true } func (s *Scanner) scanMultiLine(ctx *Context, c rune) error { state := ctx.getMultiLineState() ctx.addOriginBuf(c) if ctx.isEOS() { if s.isFirstCharAtLine && c == ' ' { state.addIndent(ctx, s.column) } else { ctx.addBuf(c) } state.updateIndentColumn(s.column) if err := state.validateIndentColumn(); err != nil { invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return ErrInvalidToken(invalidTk) } value := ctx.bufferedSrc() ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos())) ctx.clear() s.progressColumn(ctx, 1) } else if s.isNewLineChar(c) { ctx.addBuf(c) state.updateSpaceOnlyIndentColumn(s.column - 1) state.updateNewLineState() s.progressLine(ctx) if ctx.next() { if s.foundDocumentSeparatorMarker(ctx.src[ctx.idx:]) { value := ctx.bufferedSrc() ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos())) ctx.clear() s.breakMultiLine(ctx) } } } else if s.isFirstCharAtLine && c == ' ' { state.addIndent(ctx, s.column) s.progressColumn(ctx, 1) } else if s.isFirstCharAtLine && c == '\t' && state.isIndentColumn(s.column) { err := ErrInvalidToken( token.Invalid( "found a tab character where an indentation space is expected", string(ctx.obuf), s.pos(), ), ) s.progressColumn(ctx, 1) return err } else if c == '\t' && !state.isIndentColumn(s.column) { ctx.addBufWithTab(c) s.progressColumn(ctx, 1) } else { if err := state.validateIndentAfterSpaceOnly(s.column); err != nil { invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return ErrInvalidToken(invalidTk) } state.updateIndentColumn(s.column) if err := state.validateIndentColumn(); err != nil { invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return ErrInvalidToken(invalidTk) } if col := state.lastDelimColumn(); col > 0 { s.lastDelimColumn = col } state.updateNewLineInFolded(ctx, s.column) ctx.addBufWithTab(c) s.progressColumn(ctx, 1) } return nil } func (s *Scanner) scanNewLine(ctx *Context, c rune) { if len(ctx.buf) > 0 && s.savedPos == nil { bufLen := len(ctx.bufferedSrc()) s.savedPos = s.pos() s.savedPos.Column -= bufLen s.savedPos.Offset -= bufLen } // if the following case, origin buffer has unnecessary two spaces. // So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too. // --- // a:[space][space] // b: c ctx.removeRightSpaceFromBuf() // There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec. // > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character. // > Outside scalar content, YAML allows any line break to be used to terminate lines. // > -- https://yaml.org/spec/1.2/spec.html if c == '\r' && ctx.nextChar() == '\n' { ctx.addOriginBuf('\r') s.progress(ctx, 1) s.offset++ c = '\n' } if ctx.isEOS() { s.addBufferedTokenIfExists(ctx) } else if s.isAnchor || s.isAlias || s.isDirective { s.addBufferedTokenIfExists(ctx) } if ctx.existsBuffer() && s.isFirstCharAtLine { if ctx.buf[len(ctx.buf)-1] == ' ' { ctx.buf[len(ctx.buf)-1] = '\n' } else { ctx.buf = append(ctx.buf, '\n') } } else { ctx.addBuf(' ') } ctx.addOriginBuf(c) s.progressLine(ctx) } func (s *Scanner) isFlowMode() bool { if s.startedFlowSequenceNum > 0 { return true } if s.startedFlowMapNum > 0 { return true } return false } func (s *Scanner) scanFlowMapStart(ctx *Context) bool { if ctx.existsBuffer() && !s.isFlowMode() { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('{') ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos())) s.startedFlowMapNum++ s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanFlowMapEnd(ctx *Context) bool { if s.startedFlowMapNum <= 0 { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('}') ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos())) s.startedFlowMapNum-- s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanFlowArrayStart(ctx *Context) bool { if ctx.existsBuffer() && !s.isFlowMode() { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('[') ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos())) s.startedFlowSequenceNum++ s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanFlowArrayEnd(ctx *Context) bool { if ctx.existsBuffer() && s.startedFlowSequenceNum <= 0 { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf(']') ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos())) s.startedFlowSequenceNum-- s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanFlowEntry(ctx *Context, c rune) bool { if s.startedFlowSequenceNum <= 0 && s.startedFlowMapNum <= 0 { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf(c) ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos())) s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanMapDelim(ctx *Context) (bool, error) { nc := ctx.nextChar() if s.isDirective || s.isAnchor || s.isAlias { return false, nil } if s.startedFlowMapNum <= 0 && nc != ' ' && nc != '\t' && !s.isNewLineChar(nc) && !ctx.isNextEOS() { return false, nil } if s.startedFlowMapNum > 0 && nc == '/' { // like http:// return false, nil } if s.startedFlowMapNum > 0 { tk := ctx.lastToken() if tk != nil && tk.Type == token.MappingValueType { return false, nil } } if strings.HasPrefix(strings.TrimPrefix(string(ctx.obuf), " "), "\t") && !strings.HasPrefix(string(ctx.buf), "\t") { invalidTk := token.Invalid("tab character cannot use as a map key directly", string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return false, ErrInvalidToken(invalidTk) } // mapping value tk := s.bufferedToken(ctx) if tk != nil { s.lastDelimColumn = tk.Position.Column ctx.addToken(tk) } else if tk := ctx.lastToken(); tk != nil { // If the map key is quote, the buffer does not exist because it has already been cut into tokens. // Therefore, we need to check the last token. if tk.Indicator == token.QuotedScalarIndicator { s.lastDelimColumn = tk.Position.Column } } ctx.addToken(token.MappingValue(s.pos())) s.progressColumn(ctx, 1) ctx.clear() return true, nil } func (s *Scanner) scanDocumentStart(ctx *Context) bool { if s.indentNum != 0 { return false } if s.column != 1 { return false } if ctx.repeatNum('-') != 3 { return false } if ctx.size > ctx.idx+3 { c := ctx.src[ctx.idx+3] if c != ' ' && c != '\t' && c != '\n' && c != '\r' { return false } } s.addBufferedTokenIfExists(ctx) ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos())) s.progressColumn(ctx, 3) ctx.clear() s.clearState() return true } func (s *Scanner) scanDocumentEnd(ctx *Context) bool { if s.indentNum != 0 { return false } if s.column != 1 { return false } if ctx.repeatNum('.') != 3 { return false } s.addBufferedTokenIfExists(ctx) ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos())) s.progressColumn(ctx, 3) ctx.clear() return true } func (s *Scanner) scanMergeKey(ctx *Context) bool { if !s.isMergeKey(ctx) { return false } s.lastDelimColumn = s.column ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos())) s.progressColumn(ctx, 2) ctx.clear() return true } func (s *Scanner) scanRawFoldedChar(ctx *Context) bool { if !ctx.existsBuffer() { return false } if !s.isChangedToIndentStateUp() { return false } ctx.setRawFolded(s.column) ctx.addBuf('-') ctx.addOriginBuf('-') s.progressColumn(ctx, 1) return true } func (s *Scanner) scanSequence(ctx *Context) (bool, error) { if ctx.existsBuffer() { return false, nil } nc := ctx.nextChar() if nc != 0 && nc != ' ' && nc != '\t' && !s.isNewLineChar(nc) { return false, nil } if strings.HasPrefix(strings.TrimPrefix(string(ctx.obuf), " "), "\t") { invalidTk := token.Invalid("tab character cannot use as a sequence delimiter", string(ctx.obuf), s.pos()) s.progressColumn(ctx, 1) return false, ErrInvalidToken(invalidTk) } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('-') tk := token.SequenceEntry(string(ctx.obuf), s.pos()) s.lastDelimColumn = tk.Position.Column ctx.addToken(tk) s.progressColumn(ctx, 1) ctx.clear() return true, nil } func (s *Scanner) scanMultiLineHeader(ctx *Context) (bool, error) { if ctx.existsBuffer() { return false, nil } if err := s.scanMultiLineHeaderOption(ctx); err != nil { return false, err } s.progressLine(ctx) return true, nil } func (s *Scanner) validateMultiLineHeaderOption(opt string) error { if len(opt) == 0 { return nil } orgOpt := opt opt = strings.TrimPrefix(opt, "-") opt = strings.TrimPrefix(opt, "+") opt = strings.TrimSuffix(opt, "-") opt = strings.TrimSuffix(opt, "+") if len(opt) == 0 { return nil } if opt == "0" { return fmt.Errorf("invalid header option: %q", orgOpt) } i, err := strconv.ParseInt(opt, 10, 64) if err != nil { return fmt.Errorf("invalid header option: %q", orgOpt) } if i > 9 { return fmt.Errorf("invalid header option: %q", orgOpt) } return nil } func (s *Scanner) scanMultiLineHeaderOption(ctx *Context) error { header := ctx.currentChar() ctx.addOriginBuf(header) s.progress(ctx, 1) // skip '|' or '>' character var progress int for idx, c := range ctx.src[ctx.idx:] { progress = idx ctx.addOriginBuf(c) if s.isNewLineChar(c) { break } } value := strings.TrimRight(ctx.source(ctx.idx, ctx.idx+progress), " ") commentValueIndex := strings.Index(value, "#") opt := value if commentValueIndex > 0 { opt = value[:commentValueIndex] } opt = strings.TrimRightFunc(opt, func(r rune) bool { return r == ' ' || r == '\t' }) if len(opt) != 0 { if err := s.validateMultiLineHeaderOption(opt); err != nil { invalidTk := token.Invalid(err.Error(), string(ctx.obuf), s.pos()) s.progressColumn(ctx, progress) return ErrInvalidToken(invalidTk) } } if s.column == 1 { s.lastDelimColumn = 1 } commentIndex := strings.Index(string(ctx.obuf), "#") headerBuf := string(ctx.obuf) if commentIndex > 0 { headerBuf = headerBuf[:commentIndex] } switch header { case '|': ctx.addToken(token.Literal("|"+opt, headerBuf, s.pos())) ctx.setLiteral(s.lastDelimColumn, opt) case '>': ctx.addToken(token.Folded(">"+opt, headerBuf, s.pos())) ctx.setFolded(s.lastDelimColumn, opt) } if commentIndex > 0 { comment := string(value[commentValueIndex+1:]) s.offset += len(headerBuf) s.column += len(headerBuf) ctx.addToken(token.Comment(comment, string(ctx.obuf[len(headerBuf):]), s.pos())) } s.indentState = IndentStateKeep ctx.resetBuffer() s.progressColumn(ctx, progress) return nil } func (s *Scanner) scanMapKey(ctx *Context) bool { if ctx.existsBuffer() { return false } nc := ctx.nextChar() if nc != ' ' && nc != '\t' { return false } tk := token.MappingKey(s.pos()) s.lastDelimColumn = tk.Position.Column ctx.addToken(tk) s.progressColumn(ctx, 1) ctx.clear() return true } func (s *Scanner) scanDirective(ctx *Context) bool { if ctx.existsBuffer() { return false } if s.indentNum != 0 { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('%') ctx.addToken(token.Directive(string(ctx.obuf), s.pos())) s.progressColumn(ctx, 1) ctx.clear() s.isDirective = true return true } func (s *Scanner) scanAnchor(ctx *Context) bool { if ctx.existsBuffer() { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('&') ctx.addToken(token.Anchor(string(ctx.obuf), s.pos())) s.progressColumn(ctx, 1) s.isAnchor = true ctx.clear() return true } func (s *Scanner) scanAlias(ctx *Context) bool { if ctx.existsBuffer() { return false } s.addBufferedTokenIfExists(ctx) ctx.addOriginBuf('*') ctx.addToken(token.Alias(string(ctx.obuf), s.pos())) s.progressColumn(ctx, 1) s.isAlias = true ctx.clear() return true } func (s *Scanner) scanReservedChar(ctx *Context, c rune) error { if ctx.existsBuffer() { return nil } ctx.addBuf(c) ctx.addOriginBuf(c) err := ErrInvalidToken( token.Invalid( fmt.Sprintf("%q is a reserved character", c), string(ctx.obuf), s.pos(), ), ) s.progressColumn(ctx, 1) ctx.clear() return err } func (s *Scanner) scanTab(ctx *Context, c rune) error { if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 { // tabs character is allowed in flow mode. return nil } if !s.isFirstCharAtLine { return nil } ctx.addBuf(c) ctx.addOriginBuf(c) err := ErrInvalidToken( token.Invalid("found character '\t' that cannot start any token", string(ctx.obuf), s.pos(), ), ) s.progressColumn(ctx, 1) ctx.clear() return err } func (s *Scanner) scan(ctx *Context) error { for ctx.next() { c := ctx.currentChar() // First, change the IndentState. // If the target character is the first character in a line, IndentState is Up/Down/Equal state. // The second and subsequent letters are Keep. s.updateIndent(ctx, c) // If IndentState is down, tokens are split, so the buffer accumulated until that point needs to be cutted as a token. if s.isChangedToIndentStateDown() { s.addBufferedTokenIfExists(ctx) } if ctx.isMultiLine() { if s.isChangedToIndentStateDown() { if tk := ctx.lastToken(); tk != nil { // If literal/folded content is empty, no string token is added. // Therefore, add an empty string token. // But if literal/folded token column is 1, it is invalid at down state. if tk.Position.Column == 1 { return ErrInvalidToken( token.Invalid( "could not find multi-line content", string(ctx.obuf), s.pos(), ), ) } if tk.Type != token.StringType { ctx.addToken(token.String("", "", s.pos())) } } s.breakMultiLine(ctx) } else { if err := s.scanMultiLine(ctx, c); err != nil { return err } continue } } switch c { case '{': if s.scanFlowMapStart(ctx) { continue } case '}': if s.scanFlowMapEnd(ctx) { continue } case '.': if s.scanDocumentEnd(ctx) { continue } case '<': if s.scanMergeKey(ctx) { continue } case '-': if s.scanDocumentStart(ctx) { continue } if s.scanRawFoldedChar(ctx) { continue } scanned, err := s.scanSequence(ctx) if err != nil { return err } if scanned { continue } case '[': if s.scanFlowArrayStart(ctx) { continue } case ']': if s.scanFlowArrayEnd(ctx) { continue } case ',': if s.scanFlowEntry(ctx, c) { continue } case ':': scanned, err := s.scanMapDelim(ctx) if err != nil { return err } if scanned { continue } case '|', '>': scanned, err := s.scanMultiLineHeader(ctx) if err != nil { return err } if scanned { continue } case '!': scanned, err := s.scanTag(ctx) if err != nil { return err } if scanned { continue } case '%': if s.scanDirective(ctx) { continue } case '?': if s.scanMapKey(ctx) { continue } case '&': if s.scanAnchor(ctx) { continue } case '*': if s.scanAlias(ctx) { continue } case '#': if s.scanComment(ctx) { continue } case '\'', '"': scanned, err := s.scanQuote(ctx, c) if err != nil { return err } if scanned { continue } case '\r', '\n': s.scanNewLine(ctx, c) continue case ' ': if s.scanWhiteSpace(ctx) { continue } case '@', '`': if err := s.scanReservedChar(ctx, c); err != nil { return err } case '\t': if ctx.existsBuffer() && s.lastDelimColumn == 0 { // tab indent for plain text (yaml-test-suite's spec-example-7-12-plain-lines). s.indentNum++ ctx.addOriginBuf(c) s.progressColumn(ctx, 1) continue } if s.lastDelimColumn < s.column { s.indentNum++ ctx.addOriginBuf(c) s.progressColumn(ctx, 1) continue } if err := s.scanTab(ctx, c); err != nil { return err } } ctx.addBuf(c) ctx.addOriginBuf(c) s.progressColumn(ctx, 1) } s.addBufferedTokenIfExists(ctx) return nil } // Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src. func (s *Scanner) Init(text string) { src := []rune(text) s.source = src s.sourcePos = 0 s.sourceSize = len(src) s.line = 1 s.column = 1 s.offset = 1 s.isFirstCharAtLine = true s.clearState() } func (s *Scanner) clearState() { s.prevLineIndentNum = 0 s.lastDelimColumn = 0 s.indentLevel = 0 s.indentNum = 0 } // Scan scans the next token and returns the token collection. The source end is indicated by io.EOF. func (s *Scanner) Scan() (token.Tokens, error) { if s.sourcePos >= s.sourceSize { return nil, io.EOF } ctx := newContext(s.source[s.sourcePos:]) defer ctx.release() var tokens token.Tokens err := s.scan(ctx) tokens = append(tokens, ctx.tokens...) if err != nil { var invalidTokenErr *InvalidTokenError if errors.As(err, &invalidTokenErr) { tokens = append(tokens, invalidTokenErr.Token) } return tokens, err } return tokens, nil }