mirror of https://github.com/golang/go.git
Instead of saving all pragmas and processing them after parsing is finished, process them immediately during scanning like the current lexer does. This is a bit unfortunate because it means we can't use syntax.ParseFile to concurrently parse files yet, but it fixes how we report syntax errors in the presence of //line pragmas.

While here, add a bunch more gcCompat entries to syntax/parser.go to get "go build -toolexec='toolstash -cmp' std cmd" passing. There are still a few remaining cases only triggered building unit tests, but this seems like a nice checkpoint.

Change-Id: Iaf3bbcf2849857a460496f31eea228e0c585ce13
Reviewed-on: https://go-review.googlesource.com/28226
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
1251 lines
22 KiB
Go
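As context for the change described above, the two kinds of comment directives this file handles look like the following in Go source (a minimal sketch; the file name, line number, and function below are invented purely for illustration):

//line parse.y:15
// After a //line directive, positions on subsequent lines are reported
// as coming from the named file and line (here parse.y:15, 16, ...).

//go:nosplit
func exampleOnly() {} // a //go: pragma is accumulated by the lexer and applied to the next function declared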
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gc

import (
	"bufio"
	"bytes"
	"cmd/compile/internal/syntax"
	"cmd/internal/obj"
	"fmt"
	"io"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

const (
	EOF = -1
	BOM = 0xFEFF
)

// lexlineno is the line number _after_ the most recently read rune.
// In particular, it's advanced (or rewound) as newlines are read (or unread).
var lexlineno int32

// lineno is the line number at the start of the most recently lexed token.
var lineno int32

var lexbuf bytes.Buffer
var strbuf bytes.Buffer
var litbuf string // LLITERAL value for use in syntax error messages

func isSpace(c rune) bool {
	return c == ' ' || c == '\t' || c == '\n' || c == '\r'
}

func isLetter(c rune) bool {
	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
}

func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

func isQuoted(s string) bool {
	return len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"'
}

func plan9quote(s string) string {
	if s == "" {
		return "''"
	}
	for _, c := range s {
		if c <= ' ' || c == '\'' {
			return "'" + strings.Replace(s, "'", "''", -1) + "'"
		}
	}
	return s
}

type Pragma syntax.Pragma

const (
	Nointerface Pragma = 1 << iota
	Noescape          // func parameters don't escape
	Norace            // func must not have race detector annotations
	Nosplit           // func should not execute on separate stack
	Noinline          // func should not be inlined
	Systemstack       // func must run on system stack
	Nowritebarrier    // emit compiler error instead of write barrier
	Nowritebarrierrec // error on write barrier in this or recursive callees
	CgoUnsafeArgs     // treat a pointer to one arg as a pointer to them all
	UintptrEscapes    // pointers converted to uintptr escape
)

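// PragmaValue returns the Pragma bit for the //go: directive verb,
// or 0 if the verb is not recognized.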
func PragmaValue(verb string) Pragma {
	switch verb {
	case "go:nointerface":
		if obj.Fieldtrack_enabled != 0 {
			return Nointerface
		}
	case "go:noescape":
		return Noescape
	case "go:norace":
		return Norace
	case "go:nosplit":
		return Nosplit
	case "go:noinline":
		return Noinline
	case "go:systemstack":
		if !compiling_runtime {
			Yyerror("//go:systemstack only allowed in runtime")
		}
		return Systemstack
	case "go:nowritebarrier":
		if !compiling_runtime {
			Yyerror("//go:nowritebarrier only allowed in runtime")
		}
		return Nowritebarrier
	case "go:nowritebarrierrec":
		if !compiling_runtime {
			Yyerror("//go:nowritebarrierrec only allowed in runtime")
		}
		return Nowritebarrierrec | Nowritebarrier // implies Nowritebarrier
	case "go:cgo_unsafe_args":
		return CgoUnsafeArgs
	case "go:uintptrescapes":
		// For the next function declared in the file
		// any uintptr arguments may be pointer values
		// converted to uintptr. This directive
		// ensures that the referenced allocated
		// object, if any, is retained and not moved
		// until the call completes, even though from
		// the types alone it would appear that the
		// object is no longer needed during the
		// call. The conversion to uintptr must appear
		// in the argument list.
		// Used in syscall/dll_windows.go.
		return UintptrEscapes
	}
	return 0
}

type lexer struct {
	// source
	bin *bufio.Reader
	prevlineno int32 // line no. of most recently read character

	nlsemi bool // if set, '\n' and EOF translate to ';'

	// pragma flags
	// accumulated by lexer; reset by parser
	pragma Pragma

	// current token
	tok int32
	sym_ *Sym // valid if tok == LNAME
	val Val // valid if tok == LLITERAL
	op Op // valid if tok == LOPER, LASOP, or LINCOP, or prec > 0
	prec OpPrec // operator precedence; 0 if not a binary operator
}

type OpPrec int

const (
	// Precedences of binary operators (must be > 0).
	PCOMM OpPrec = 1 + iota
	POROR
	PANDAND
	PCMP
	PADD
	PMUL
)

const (
	// The value of single-char tokens is just their character's Unicode value.
	// They are all below utf8.RuneSelf. Shift other tokens up to avoid conflicts.

	// names and literals
	LNAME = utf8.RuneSelf + iota
	LLITERAL

	// operator-based operations
	LOPER
	LASOP
	LINCOP

	// miscellaneous
	LCOLAS
	LCOMM
	LDDD

	// keywords
	LBREAK
	LCASE
	LCHAN
	LCONST
	LCONTINUE
	LDEFAULT
	LDEFER
	LELSE
	LFALL
	LFOR
	LFUNC
	LGO
	LGOTO
	LIF
	LIMPORT
	LINTERFACE
	LMAP
	LPACKAGE
	LRANGE
	LRETURN
	LSELECT
	LSTRUCT
	LSWITCH
	LTYPE
	LVAR

	LIGNORE
)

var lexn = map[rune]string{
	LNAME: "NAME",
	LLITERAL: "LITERAL",

	LOPER: "OPER",
	LASOP: "ASOP",
	LINCOP: "INCOP",

	LCOLAS: "COLAS",
	LCOMM: "COMM",
	LDDD: "DDD",

	LBREAK: "BREAK",
	LCASE: "CASE",
	LCHAN: "CHAN",
	LCONST: "CONST",
	LCONTINUE: "CONTINUE",
	LDEFAULT: "DEFAULT",
	LDEFER: "DEFER",
	LELSE: "ELSE",
	LFALL: "FALL",
	LFOR: "FOR",
	LFUNC: "FUNC",
	LGO: "GO",
	LGOTO: "GOTO",
	LIF: "IF",
	LIMPORT: "IMPORT",
	LINTERFACE: "INTERFACE",
	LMAP: "MAP",
	LPACKAGE: "PACKAGE",
	LRANGE: "RANGE",
	LRETURN: "RETURN",
	LSELECT: "SELECT",
	LSTRUCT: "STRUCT",
	LSWITCH: "SWITCH",
	LTYPE: "TYPE",
	LVAR: "VAR",

	// LIGNORE is never escaping lexer.next
}

func lexname(lex rune) string {
	if s, ok := lexn[lex]; ok {
		return s
	}
	return fmt.Sprintf("LEX-%d", lex)
}

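// next scans the next token, leaving the result in l.tok and, depending on
// the token, in l.sym_, l.val, l.op, and l.prec. If l.nlsemi was set, a
// newline or EOF is returned as an implicit semicolon.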
func (l *lexer) next() {
	nlsemi := l.nlsemi
	l.nlsemi = false
	l.prec = 0

l0:
	// skip white space
	c := l.getr()
	for isSpace(c) {
		if c == '\n' && nlsemi {
			if Debug['x'] != 0 {
				fmt.Printf("lex: implicit semi\n")
			}
			// Insert implicit semicolon on previous line,
			// before the newline character.
			lineno = lexlineno - 1
			l.tok = ';'
			return
		}
		c = l.getr()
	}

	// start of token
	lineno = lexlineno

	// identifiers and keywords
	// (for better error messages consume all chars >= utf8.RuneSelf for identifiers)
	if isLetter(c) || c >= utf8.RuneSelf {
		l.ident(c)
		if l.tok == LIGNORE {
			goto l0
		}
		return
	}
	// c < utf8.RuneSelf

	var c1 rune
	var op Op
	var prec OpPrec

	switch c {
	case EOF:
		l.ungetr()
		// Treat EOF as "end of line" for the purposes
		// of inserting a semicolon.
		if nlsemi {
			if Debug['x'] != 0 {
				fmt.Printf("lex: implicit semi\n")
			}
			l.tok = ';'
			return
		}
		l.tok = -1
		return

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		l.number(c)
		return

	case '.':
		c1 = l.getr()
		if isDigit(c1) {
			l.ungetr()
			l.number('.')
			return
		}

		if c1 == '.' {
			p, err := l.bin.Peek(1)
			if err == nil && p[0] == '.' {
				l.getr()
				c = LDDD
				goto lx
			}

			l.ungetr()
			c1 = '.'
		}

	case '"':
		l.stdString()
		return

	case '`':
		l.rawString()
		return

	case '\'':
		l.rune()
		return

	case '/':
		c1 = l.getr()
		if c1 == '*' {
			c = l.getr()
			for {
				if c == '*' {
					c = l.getr()
					if c == '/' {
						break
					}
					continue
				}
				if c == EOF {
					Yyerror("eof in comment")
					errorexit()
				}
				c = l.getr()
			}

			// A comment containing newlines acts like a newline.
			if lexlineno > lineno && nlsemi {
				if Debug['x'] != 0 {
					fmt.Printf("lex: implicit semi\n")
				}
				l.tok = ';'
				return
			}
			goto l0
		}

		if c1 == '/' {
			c = l.getlinepragma()
			for {
				if c == '\n' || c == EOF {
					l.ungetr()
					goto l0
				}

				c = l.getr()
			}
		}

		op = ODIV
		prec = PMUL
		goto binop1

	case ':':
		c1 = l.getr()
		if c1 == '=' {
			c = LCOLAS
			goto lx
		}

	case '*':
		op = OMUL
		prec = PMUL
		goto binop

	case '%':
		op = OMOD
		prec = PMUL
		goto binop

	case '+':
		op = OADD
		goto incop

	case '-':
		op = OSUB
		goto incop

	case '>':
		c = LOPER
		c1 = l.getr()
		if c1 == '>' {
			op = ORSH
			prec = PMUL
			goto binop
		}

		l.prec = PCMP
		if c1 == '=' {
			l.op = OGE
			goto lx
		}
		l.op = OGT

	case '<':
		c = LOPER
		c1 = l.getr()
		if c1 == '<' {
			op = OLSH
			prec = PMUL
			goto binop
		}

		if c1 == '-' {
			c = LCOMM
			// Not a binary operator, but parsed as one
			// so we can give a good error message when used
			// in an expression context.
			l.prec = PCOMM
			l.op = OSEND
			goto lx
		}

		l.prec = PCMP
		if c1 == '=' {
			l.op = OLE
			goto lx
		}
		l.op = OLT

	case '=':
		c1 = l.getr()
		if c1 == '=' {
			c = LOPER
			l.prec = PCMP
			l.op = OEQ
			goto lx
		}

	case '!':
		c1 = l.getr()
		if c1 == '=' {
			c = LOPER
			l.prec = PCMP
			l.op = ONE
			goto lx
		}

	case '&':
		c1 = l.getr()
		if c1 == '&' {
			c = LOPER
			l.prec = PANDAND
			l.op = OANDAND
			goto lx
		}

		if c1 == '^' {
			c = LOPER
			op = OANDNOT
			prec = PMUL
			goto binop
		}

		op = OAND
		prec = PMUL
		goto binop1

	case '|':
		c1 = l.getr()
		if c1 == '|' {
			c = LOPER
			l.prec = POROR
			l.op = OOROR
			goto lx
		}

		op = OOR
		prec = PADD
		goto binop1

	case '^':
		op = OXOR
		prec = PADD
		goto binop

	case '(', '[', '{', ',', ';':
		goto lx

	case ')', ']', '}':
		l.nlsemi = true
		goto lx

	default:
		// anything else is illegal
		Yyerror("syntax error: illegal character %#U", c)
		goto l0
	}

	l.ungetr()

lx:
	if Debug['x'] != 0 {
		if c >= utf8.RuneSelf {
			fmt.Printf("%v lex: TOKEN %s\n", linestr(lineno), lexname(c))
		} else {
			fmt.Printf("%v lex: TOKEN '%c'\n", linestr(lineno), c)
		}
	}

	l.tok = c
	return

incop:
	c1 = l.getr()
	if c1 == c {
		l.nlsemi = true
		l.op = op
		c = LINCOP
		goto lx
	}
	prec = PADD
	goto binop1

binop:
	c1 = l.getr()
binop1:
	if c1 != '=' {
		l.ungetr()
		l.op = op
		l.prec = prec
		goto lx
	}

	l.op = op
	if Debug['x'] != 0 {
		fmt.Printf("lex: TOKEN ASOP %s=\n", goopnames[op])
	}
	l.tok = LASOP
}

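// ident scans an identifier or keyword whose first character is c.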
func (l *lexer) ident(c rune) {
	cp := &lexbuf
	cp.Reset()

	// accelerate common case (7bit ASCII)
	for isLetter(c) || isDigit(c) {
		cp.WriteByte(byte(c))
		c = l.getr()
	}

	// general case
	for {
		if c >= utf8.RuneSelf {
			if unicode.IsLetter(c) || c == '_' || unicode.IsDigit(c) {
				if cp.Len() == 0 && unicode.IsDigit(c) {
					Yyerror("identifier cannot begin with digit %#U", c)
				}
			} else {
				Yyerror("invalid identifier character %#U", c)
			}
			cp.WriteRune(c)
		} else if isLetter(c) || isDigit(c) {
			cp.WriteByte(byte(c))
		} else {
			break
		}
		c = l.getr()
	}

	cp = nil
	l.ungetr()

	name := lexbuf.Bytes()

	if len(name) >= 2 {
		if tok, ok := keywords[string(name)]; ok {
			if Debug['x'] != 0 {
				fmt.Printf("lex: %s\n", lexname(tok))
			}
			switch tok {
			case LBREAK, LCONTINUE, LFALL, LRETURN:
				l.nlsemi = true
			}
			l.tok = tok
			return
		}
	}

	s := LookupBytes(name)
	if Debug['x'] != 0 {
		fmt.Printf("lex: ident %s\n", s)
	}
	l.sym_ = s
	l.nlsemi = true
	l.tok = LNAME
}

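// keywords maps keyword spellings to their token values.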
var keywords = map[string]int32{
	"break": LBREAK,
	"case": LCASE,
	"chan": LCHAN,
	"const": LCONST,
	"continue": LCONTINUE,
	"default": LDEFAULT,
	"defer": LDEFER,
	"else": LELSE,
	"fallthrough": LFALL,
	"for": LFOR,
	"func": LFUNC,
	"go": LGO,
	"goto": LGOTO,
	"if": LIF,
	"import": LIMPORT,
	"interface": LINTERFACE,
	"map": LMAP,
	"package": LPACKAGE,
	"range": LRANGE,
	"return": LRETURN,
	"select": LSELECT,
	"struct": LSTRUCT,
	"switch": LSWITCH,
	"type": LTYPE,
	"var": LVAR,

	// 💩
	"notwithstanding": LIGNORE,
	"thetruthofthematter": LIGNORE,
	"despiteallobjections": LIGNORE,
	"whereas": LIGNORE,
	"insofaras": LIGNORE,
}

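// number scans a numeric literal (integer, floating-point, or imaginary)
// starting with c and leaves its value in l.val.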
func (l *lexer) number(c rune) {
	cp := &lexbuf
	cp.Reset()

	// parse mantissa before decimal point or exponent
	isInt := false
	malformedOctal := false
	if c != '.' {
		if c != '0' {
			// decimal or float
			for isDigit(c) {
				cp.WriteByte(byte(c))
				c = l.getr()
			}

		} else {
			// c == 0
			cp.WriteByte('0')
			c = l.getr()
			if c == 'x' || c == 'X' {
				isInt = true // must be int
				cp.WriteByte(byte(c))
				c = l.getr()
				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
					cp.WriteByte(byte(c))
					c = l.getr()
				}
				if lexbuf.Len() == 2 {
					Yyerror("malformed hex constant")
				}
			} else {
				// decimal 0, octal, or float
				for isDigit(c) {
					if c > '7' {
						malformedOctal = true
					}
					cp.WriteByte(byte(c))
					c = l.getr()
				}
			}
		}
	}

	// unless we have a hex number, parse fractional part or exponent, if any
	var str string
	if !isInt {
		isInt = true // assume int unless proven otherwise

		// fraction
		if c == '.' {
			isInt = false
			cp.WriteByte('.')
			c = l.getr()
			for isDigit(c) {
				cp.WriteByte(byte(c))
				c = l.getr()
			}
		}

		// exponent
		if c == 'e' || c == 'E' {
			isInt = false
			cp.WriteByte(byte(c))
			c = l.getr()
			if c == '+' || c == '-' {
				cp.WriteByte(byte(c))
				c = l.getr()
			}
			if !isDigit(c) {
				Yyerror("malformed floating point constant exponent")
			}
			for isDigit(c) {
				cp.WriteByte(byte(c))
				c = l.getr()
			}
		}

		// imaginary constant
		if c == 'i' {
			str = lexbuf.String()
			x := new(Mpcplx)
			x.Real.SetFloat64(0.0)
			x.Imag.SetString(str)
			if x.Imag.Val.IsInf() {
				Yyerror("overflow in imaginary constant")
				x.Imag.SetFloat64(0.0)
			}
			l.val.U = x

			if Debug['x'] != 0 {
				fmt.Printf("lex: imaginary literal\n")
			}
			goto done
		}
	}

	l.ungetr()

	if isInt {
		if malformedOctal {
			Yyerror("malformed octal constant")
		}

		str = lexbuf.String()
		x := new(Mpint)
		x.SetString(str)
		if x.Ovf {
			Yyerror("overflow in constant")
			x.SetInt64(0)
		}
		l.val.U = x

		if Debug['x'] != 0 {
			fmt.Printf("lex: integer literal\n")
		}

	} else { // float

		str = lexbuf.String()
		x := newMpflt()
		x.SetString(str)
		if x.Val.IsInf() {
			Yyerror("overflow in float constant")
			x.SetFloat64(0.0)
		}
		l.val.U = x

		if Debug['x'] != 0 {
			fmt.Printf("lex: floating literal\n")
		}
	}

done:
	litbuf = "" // lazily initialized in (*parser).syntax_error
	l.nlsemi = true
	l.tok = LLITERAL
}

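// stdString scans an interpreted (double-quoted) string literal;
// the opening '"' has already been consumed.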
func (l *lexer) stdString() {
	lexbuf.Reset()
	lexbuf.WriteString(`"<string>"`)

	cp := &strbuf
	cp.Reset()

	for {
		r, b, ok := l.onechar('"')
		if !ok {
			break
		}
		if r == 0 {
			cp.WriteByte(b)
		} else {
			cp.WriteRune(r)
		}
	}

	l.val.U = internString(cp.Bytes())
	if Debug['x'] != 0 {
		fmt.Printf("lex: string literal\n")
	}
	litbuf = "string literal"
	l.nlsemi = true
	l.tok = LLITERAL
}

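// rawString scans a raw (back-quoted) string literal; carriage returns
// are discarded and the opening '`' has already been consumed.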
func (l *lexer) rawString() {
	lexbuf.Reset()
	lexbuf.WriteString("`<string>`")

	cp := &strbuf
	cp.Reset()

	for {
		c := l.getr()
		if c == '\r' {
			continue
		}
		if c == EOF {
			Yyerror("eof in string")
			break
		}
		if c == '`' {
			break
		}
		cp.WriteRune(c)
	}

	l.val.U = internString(cp.Bytes())
	if Debug['x'] != 0 {
		fmt.Printf("lex: string literal\n")
	}
	litbuf = "string literal"
	l.nlsemi = true
	l.tok = LLITERAL
}

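// rune scans a rune (character) literal; the opening '\'' has already
// been consumed.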
func (l *lexer) rune() {
	r, b, ok := l.onechar('\'')
	if !ok {
		Yyerror("empty character literal or unescaped ' in character literal")
		r = '\''
	}
	if r == 0 {
		r = rune(b)
	}

	if c := l.getr(); c != '\'' {
		Yyerror("missing '")
		l.ungetr()
	}

	x := new(Mpint)
	l.val.U = x
	x.SetInt64(int64(r))
	x.Rune = true
	if Debug['x'] != 0 {
		fmt.Printf("lex: codepoint literal\n")
	}
	litbuf = "rune literal"
	l.nlsemi = true
	l.tok = LLITERAL
}

var internedStrings = map[string]string{}

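// internString returns a string with the contents of b, reusing a
// previously interned copy when possible to avoid duplicate allocations.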
func internString(b []byte) string {
	s, ok := internedStrings[string(b)] // string(b) here doesn't allocate
	if !ok {
		s = string(b)
		internedStrings[s] = s
	}
	return s
}

// read and interpret syntax that looks like
// //line parse.y:15
// as a discontinuity in sequential line numbers.
// the next line of input comes from parse.y:15
func (l *lexer) getlinepragma() rune {
	c := l.getr()
	if c == 'g' { // check for //go: directive
		cp := &lexbuf
		cp.Reset()
		cp.WriteByte('g') // already read
		for {
			c = l.getr()
			if c == EOF || c >= utf8.RuneSelf {
				return c
			}
			if c == '\n' {
				break
			}
			cp.WriteByte(byte(c))
		}
		cp = nil

		text := strings.TrimSuffix(lexbuf.String(), "\r")

		if strings.HasPrefix(text, "go:cgo_") {
			pragcgobuf += pragcgo(text)
		}

		verb := text
		if i := strings.Index(text, " "); i >= 0 {
			verb = verb[:i]
		}

		switch verb {
		case "go:linkname":
			if !imported_unsafe {
				Yyerror("//go:linkname only allowed in Go files that import \"unsafe\"")
			}
			f := strings.Fields(text)
			if len(f) != 3 {
				Yyerror("usage: //go:linkname localname linkname")
				break
			}
			Lookup(f[1]).Linkname = f[2]
		default:
			l.pragma |= PragmaValue(verb)
		}
		return c
	}

	// check for //line directive
	if c != 'l' {
		return c
	}
	for i := 1; i < 5; i++ {
		c = l.getr()
		if c != rune("line "[i]) {
			return c
		}
	}

	cp := &lexbuf
	cp.Reset()
	linep := 0
	for {
		c = l.getr()
		if c == EOF {
			return c
		}
		if c == '\n' {
			break
		}
		if c == ' ' {
			continue
		}
		if c == ':' {
			linep = cp.Len() + 1
		}
		cp.WriteByte(byte(c))
	}
	cp = nil

	if linep == 0 {
		return c
	}
	text := strings.TrimSuffix(lexbuf.String(), "\r")
	n, err := strconv.Atoi(text[linep:])
	if err != nil {
		return c // todo: make this an error instead? it is almost certainly a bug.
	}
	if n > 1e8 {
		Yyerror("line number out of range")
		errorexit()
	}
	if n <= 0 {
		return c
	}

	linehistupdate(text[:linep-1], n)
	return c
}

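// pragcgo parses a //go:cgo_* directive (text starts with "go:", without
// the leading "//") and returns the line to record in pragcgobuf, or ""
// if the directive is malformed or not recognized.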
func pragcgo(text string) string {
	f := pragmaFields(text)

	verb := f[0][3:] // skip "go:"
	switch verb {
	case "cgo_export_static", "cgo_export_dynamic":
		switch {
		case len(f) == 2 && !isQuoted(f[1]):
			local := plan9quote(f[1])
			return fmt.Sprintln(verb, local)

		case len(f) == 3 && !isQuoted(f[1]) && !isQuoted(f[2]):
			local := plan9quote(f[1])
			remote := plan9quote(f[2])
			return fmt.Sprintln(verb, local, remote)

		default:
			Yyerror(`usage: //go:%s local [remote]`, verb)
		}
	case "cgo_import_dynamic":
		switch {
		case len(f) == 2 && !isQuoted(f[1]):
			local := plan9quote(f[1])
			return fmt.Sprintln(verb, local)

		case len(f) == 3 && !isQuoted(f[1]) && !isQuoted(f[2]):
			local := plan9quote(f[1])
			remote := plan9quote(f[2])
			return fmt.Sprintln(verb, local, remote)

		case len(f) == 4 && !isQuoted(f[1]) && !isQuoted(f[2]) && isQuoted(f[3]):
			local := plan9quote(f[1])
			remote := plan9quote(f[2])
			library := plan9quote(strings.Trim(f[3], `"`))
			return fmt.Sprintln(verb, local, remote, library)

		default:
			Yyerror(`usage: //go:cgo_import_dynamic local [remote ["library"]]`)
		}
	case "cgo_import_static":
		switch {
		case len(f) == 2 && !isQuoted(f[1]):
			local := plan9quote(f[1])
			return fmt.Sprintln(verb, local)

		default:
			Yyerror(`usage: //go:cgo_import_static local`)
		}
	case "cgo_dynamic_linker":
		switch {
		case len(f) == 2 && isQuoted(f[1]):
			path := plan9quote(strings.Trim(f[1], `"`))
			return fmt.Sprintln(verb, path)

		default:
			Yyerror(`usage: //go:cgo_dynamic_linker "path"`)
		}
	case "cgo_ldflag":
		switch {
		case len(f) == 2 && isQuoted(f[1]):
			arg := plan9quote(strings.Trim(f[1], `"`))
			return fmt.Sprintln(verb, arg)

		default:
			Yyerror(`usage: //go:cgo_ldflag "arg"`)
		}
	}
	return ""
}

// pragmaFields is similar to strings.FieldsFunc(s, isSpace)
// but does not split when inside double quoted regions and always
// splits before the start and after the end of a double quoted region.
// pragmaFields does not recognize escaped quotes. If a quote in s is not
// closed the part after the opening quote will not be returned as a field.
func pragmaFields(s string) []string {
	var a []string
	inQuote := false
	fieldStart := -1 // Set to -1 when looking for start of field.
	for i, c := range s {
		switch {
		case c == '"':
			if inQuote {
				inQuote = false
				a = append(a, s[fieldStart:i+1])
				fieldStart = -1
			} else {
				inQuote = true
				if fieldStart >= 0 {
					a = append(a, s[fieldStart:i])
				}
				fieldStart = i
			}
		case !inQuote && isSpace(c):
			if fieldStart >= 0 {
				a = append(a, s[fieldStart:i])
				fieldStart = -1
			}
		default:
			if fieldStart == -1 {
				fieldStart = i
			}
		}
	}
	if !inQuote && fieldStart >= 0 { // Last field might end at the end of the string.
		a = append(a, s[fieldStart:])
	}
	return a
}

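// getr reads and returns the next rune from the input, updating lexlineno
// as newlines are read. It returns EOF (-1) at end of input and reports
// NUL bytes, invalid UTF-8 sequences, and stray BOMs.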
func (l *lexer) getr() rune {
redo:
	l.prevlineno = lexlineno
	r, w, err := l.bin.ReadRune()
	if err != nil {
		if err != io.EOF {
			Fatalf("io error: %v", err)
		}
		return -1
	}
	switch r {
	case 0:
		yyerrorl(lexlineno, "illegal NUL byte")
	case '\n':
		lexlineno++
	case utf8.RuneError:
		if w == 1 {
			yyerrorl(lexlineno, "illegal UTF-8 sequence")
		}
	case BOM:
		yyerrorl(lexlineno, "Unicode (UTF-8) BOM in middle of file")
		goto redo
	}

	return r
}

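// ungetr unreads the most recently read rune and restores lexlineno;
// only a single rune of lookback is supported.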
func (l *lexer) ungetr() {
	l.bin.UnreadRune()
	lexlineno = l.prevlineno
}

// onechar lexes a single character within a rune or interpreted string literal,
// handling escape sequences as necessary.
func (l *lexer) onechar(quote rune) (r rune, b byte, ok bool) {
	c := l.getr()
	switch c {
	case EOF:
		Yyerror("eof in string")
		l.ungetr()
		return

	case '\n':
		Yyerror("newline in string")
		l.ungetr()
		return

	case '\\':
		break

	case quote:
		return

	default:
		return c, 0, true
	}

	c = l.getr()
	switch c {
	case 'x':
		return 0, byte(l.hexchar(2)), true

	case 'u':
		return l.unichar(4), 0, true

	case 'U':
		return l.unichar(8), 0, true

	case '0', '1', '2', '3', '4', '5', '6', '7':
		x := c - '0'
		for i := 2; i > 0; i-- {
			c = l.getr()
			if c >= '0' && c <= '7' {
				x = x*8 + c - '0'
				continue
			}

			Yyerror("non-octal character in escape sequence: %c", c)
			l.ungetr()
		}

		if x > 255 {
			Yyerror("octal escape value > 255: %d", x)
		}

		return 0, byte(x), true

	case 'a':
		c = '\a'
	case 'b':
		c = '\b'
	case 'f':
		c = '\f'
	case 'n':
		c = '\n'
	case 'r':
		c = '\r'
	case 't':
		c = '\t'
	case 'v':
		c = '\v'
	case '\\':
		c = '\\'

	default:
		if c != quote {
			Yyerror("unknown escape sequence: %c", c)
		}
	}

	return c, 0, true
}

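// unichar reads an n-digit hexadecimal escape and returns it as a rune,
// rejecting surrogate halves and values above utf8.MaxRune.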
func (l *lexer) unichar(n int) rune {
	x := l.hexchar(n)
	if x > utf8.MaxRune || 0xd800 <= x && x < 0xe000 {
		Yyerror("invalid Unicode code point in escape sequence: %#x", x)
		x = utf8.RuneError
	}
	return rune(x)
}

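// hexchar reads up to n hexadecimal digits and returns their value.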
func (l *lexer) hexchar(n int) uint32 {
	var x uint32

	for ; n > 0; n-- {
		var d uint32
		switch c := l.getr(); {
		case isDigit(c):
			d = uint32(c - '0')
		case 'a' <= c && c <= 'f':
			d = uint32(c - 'a' + 10)
		case 'A' <= c && c <= 'F':
			d = uint32(c - 'A' + 10)
		default:
			Yyerror("non-hex character in escape sequence: %c", c)
			l.ungetr()
			return x
		}
		x = x*16 + d
	}

	return x
}