cmd/compile/internal/syntax: faster and simpler source reader

This is one of several changes that were part of a larger rewrite
which I made in early 2019 after switching to the new number literal
syntax implementation. The purpose of the rewrite was to simplify
reading of source code (Unicode character by character) and speed up
the scanner but was never submitted for review due to other priorities.

Part 3 of 3:

This change contains a complete rewrite of source.go, the file that
implements reading individual Unicode characters from the source.
The new implementation is easier to use and has simpler literal
buffer management, resulting in faster scanner and thus parser
performance.

The new source.go (internal) API is centered around nextch() which
advances the scanner by one character. The scanner has been adjusted
around nextch() and now consistently does one character look-ahead
(there's no need for complicated ungetr-ing anymore). Only in one
case is backtracking needed (when finding '..' rather than '...'), and
that case is now more cleanly solved with the new rewind() function.

Measuring lines/s parsing performance by running

go test -run StdLib -fast -skip "syntax/(scanner|source)\.go"

(best of 5 runs on "quiet" MacBook Pro, 3.3GHz Dual-Core i7, 16GB RAM,
OS X 10.15.3) before and after consistently shows a 3-5% improvement
in line parsing speed:

old: parsed 1788155 lines (3969 files) in 1.255520307s (1424234 lines/s)
new: parsed 1788155 lines (3969 files) in 1.213197037s (1473919 lines/s)

(scanner.go and source.go are skipped because this CL changed those files.)

Change-Id: Ida947f4b538d42eb2d2349062c69edb6c9e5ca66
Reviewed-on: https://go-review.googlesource.com/c/go/+/221603
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
This commit is contained in:
Robert Griesemer 2020-02-26 21:31:00 -08:00
parent cc6a8bd0d7
commit 4de606b55f
4 changed files with 373 additions and 372 deletions

View file

@ -419,7 +419,7 @@ func (p *parser) fileOrNil() *File {
} }
// p.tok == _EOF // p.tok == _EOF
f.Lines = p.source.line f.Lines = p.line
return f return f
} }

View file

@ -6,9 +6,9 @@
// Go source. After initialization, consecutive calls of // Go source. After initialization, consecutive calls of
// next advance the scanner one token at a time. // next advance the scanner one token at a time.
// //
// This file, source.go, and tokens.go are self-contained // This file, source.go, tokens.go, and token_string.go are self-contained
// (go tool compile scanner.go source.go tokens.go compiles) // (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
// and thus could be made into its own package. // and thus could be made into their own package.
package syntax package syntax
@ -86,20 +86,21 @@ func (s *scanner) next() {
redo: redo:
// skip white space // skip white space
c := s.getr() s.stop()
for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' { for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
c = s.getr() s.nextch()
} }
// token start // token start
s.line, s.col = s.source.line0, s.source.col0 s.line, s.col = s.pos()
s.start()
if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) { if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
s.nextch()
s.ident() s.ident()
return return
} }
switch c { switch s.ch {
case -1: case -1:
if nlsemi { if nlsemi {
s.lit = "EOF" s.lit = "EOF"
@ -109,11 +110,12 @@ redo:
s.tok = _EOF s.tok = _EOF
case '\n': case '\n':
s.nextch()
s.lit = "newline" s.lit = "newline"
s.tok = _Semi s.tok = _Semi
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
s.number(c) s.number(false)
case '"': case '"':
s.stdString() s.stdString()
@ -125,97 +127,110 @@ redo:
s.rune() s.rune()
case '(': case '(':
s.nextch()
s.tok = _Lparen s.tok = _Lparen
case '[': case '[':
s.nextch()
s.tok = _Lbrack s.tok = _Lbrack
case '{': case '{':
s.nextch()
s.tok = _Lbrace s.tok = _Lbrace
case ',': case ',':
s.nextch()
s.tok = _Comma s.tok = _Comma
case ';': case ';':
s.nextch()
s.lit = "semicolon" s.lit = "semicolon"
s.tok = _Semi s.tok = _Semi
case ')': case ')':
s.nextch()
s.nlsemi = true s.nlsemi = true
s.tok = _Rparen s.tok = _Rparen
case ']': case ']':
s.nextch()
s.nlsemi = true s.nlsemi = true
s.tok = _Rbrack s.tok = _Rbrack
case '}': case '}':
s.nextch()
s.nlsemi = true s.nlsemi = true
s.tok = _Rbrace s.tok = _Rbrace
case ':': case ':':
if s.getr() == '=' { s.nextch()
if s.ch == '=' {
s.nextch()
s.tok = _Define s.tok = _Define
break break
} }
s.ungetr()
s.tok = _Colon s.tok = _Colon
case '.': case '.':
c = s.getr() s.nextch()
if isDecimal(c) { if isDecimal(s.ch) {
s.ungetr() s.number(true)
s.unread(1) // correct position of '.' (needed by startLit in number)
s.number('.')
break break
} }
if c == '.' { if s.ch == '.' {
c = s.getr() s.nextch()
if c == '.' { if s.ch == '.' {
s.nextch()
s.tok = _DotDotDot s.tok = _DotDotDot
break break
} }
s.unread(1) s.rewind() // now s.ch holds 1st '.'
s.nextch() // consume 1st '.' again
} }
s.ungetr()
s.tok = _Dot s.tok = _Dot
case '+': case '+':
s.nextch()
s.op, s.prec = Add, precAdd s.op, s.prec = Add, precAdd
c = s.getr() if s.ch != '+' {
if c != '+' {
goto assignop goto assignop
} }
s.nextch()
s.nlsemi = true s.nlsemi = true
s.tok = _IncOp s.tok = _IncOp
case '-': case '-':
s.nextch()
s.op, s.prec = Sub, precAdd s.op, s.prec = Sub, precAdd
c = s.getr() if s.ch != '-' {
if c != '-' {
goto assignop goto assignop
} }
s.nextch()
s.nlsemi = true s.nlsemi = true
s.tok = _IncOp s.tok = _IncOp
case '*': case '*':
s.nextch()
s.op, s.prec = Mul, precMul s.op, s.prec = Mul, precMul
// don't goto assignop - want _Star token // don't goto assignop - want _Star token
if s.getr() == '=' { if s.ch == '=' {
s.nextch()
s.tok = _AssignOp s.tok = _AssignOp
break break
} }
s.ungetr()
s.tok = _Star s.tok = _Star
case '/': case '/':
c = s.getr() s.nextch()
if c == '/' { if s.ch == '/' {
s.nextch()
s.lineComment() s.lineComment()
goto redo goto redo
} }
if c == '*' { if s.ch == '*' {
s.nextch()
s.fullComment() s.fullComment()
if s.source.line > s.line && nlsemi { if line, _ := s.pos(); line > s.line && nlsemi {
// A multi-line comment acts like a newline; // A multi-line comment acts like a newline;
// it translates to a ';' if nlsemi is set. // it translates to a ';' if nlsemi is set.
s.lit = "newline" s.lit = "newline"
@ -228,27 +243,29 @@ redo:
goto assignop goto assignop
case '%': case '%':
s.nextch()
s.op, s.prec = Rem, precMul s.op, s.prec = Rem, precMul
c = s.getr()
goto assignop goto assignop
case '&': case '&':
c = s.getr() s.nextch()
if c == '&' { if s.ch == '&' {
s.nextch()
s.op, s.prec = AndAnd, precAndAnd s.op, s.prec = AndAnd, precAndAnd
s.tok = _Operator s.tok = _Operator
break break
} }
s.op, s.prec = And, precMul s.op, s.prec = And, precMul
if c == '^' { if s.ch == '^' {
s.nextch()
s.op = AndNot s.op = AndNot
c = s.getr()
} }
goto assignop goto assignop
case '|': case '|':
c = s.getr() s.nextch()
if c == '|' { if s.ch == '|' {
s.nextch()
s.op, s.prec = OrOr, precOrOr s.op, s.prec = OrOr, precOrOr
s.tok = _Operator s.tok = _Operator
break break
@ -257,106 +274,100 @@ redo:
goto assignop goto assignop
case '^': case '^':
s.nextch()
s.op, s.prec = Xor, precAdd s.op, s.prec = Xor, precAdd
c = s.getr()
goto assignop goto assignop
case '<': case '<':
c = s.getr() s.nextch()
if c == '=' { if s.ch == '=' {
s.nextch()
s.op, s.prec = Leq, precCmp s.op, s.prec = Leq, precCmp
s.tok = _Operator s.tok = _Operator
break break
} }
if c == '<' { if s.ch == '<' {
s.nextch()
s.op, s.prec = Shl, precMul s.op, s.prec = Shl, precMul
c = s.getr()
goto assignop goto assignop
} }
if c == '-' { if s.ch == '-' {
s.nextch()
s.tok = _Arrow s.tok = _Arrow
break break
} }
s.ungetr()
s.op, s.prec = Lss, precCmp s.op, s.prec = Lss, precCmp
s.tok = _Operator s.tok = _Operator
case '>': case '>':
c = s.getr() s.nextch()
if c == '=' { if s.ch == '=' {
s.nextch()
s.op, s.prec = Geq, precCmp s.op, s.prec = Geq, precCmp
s.tok = _Operator s.tok = _Operator
break break
} }
if c == '>' { if s.ch == '>' {
s.nextch()
s.op, s.prec = Shr, precMul s.op, s.prec = Shr, precMul
c = s.getr()
goto assignop goto assignop
} }
s.ungetr()
s.op, s.prec = Gtr, precCmp s.op, s.prec = Gtr, precCmp
s.tok = _Operator s.tok = _Operator
case '=': case '=':
if s.getr() == '=' { s.nextch()
if s.ch == '=' {
s.nextch()
s.op, s.prec = Eql, precCmp s.op, s.prec = Eql, precCmp
s.tok = _Operator s.tok = _Operator
break break
} }
s.ungetr()
s.tok = _Assign s.tok = _Assign
case '!': case '!':
if s.getr() == '=' { s.nextch()
if s.ch == '=' {
s.nextch()
s.op, s.prec = Neq, precCmp s.op, s.prec = Neq, precCmp
s.tok = _Operator s.tok = _Operator
break break
} }
s.ungetr()
s.op, s.prec = Not, 0 s.op, s.prec = Not, 0
s.tok = _Operator s.tok = _Operator
default: default:
s.tok = 0 s.errorf("invalid character %#U", s.ch)
s.errorf("invalid character %#U", c) s.nextch()
goto redo goto redo
} }
return return
assignop: assignop:
if c == '=' { if s.ch == '=' {
s.nextch()
s.tok = _AssignOp s.tok = _AssignOp
return return
} }
s.ungetr()
s.tok = _Operator s.tok = _Operator
} }
func isLetter(c rune) bool {
return 'a' <= lower(c) && lower(c) <= 'z' || c == '_'
}
func (s *scanner) ident() { func (s *scanner) ident() {
s.startLit()
// accelerate common case (7bit ASCII) // accelerate common case (7bit ASCII)
c := s.getr() for isLetter(s.ch) || isDecimal(s.ch) {
for isLetter(c) || isDecimal(c) { s.nextch()
c = s.getr()
} }
// general case // general case
if c >= utf8.RuneSelf { if s.ch >= utf8.RuneSelf {
for s.isIdentRune(c, false) { for s.atIdentChar(false) {
c = s.getr() s.nextch()
} }
} }
s.ungetr()
lit := s.stopLit()
// possibly a keyword // possibly a keyword
lit := s.segment()
if len(lit) >= 2 { if len(lit) >= 2 {
if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) { if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok) s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
@ -376,16 +387,16 @@ func tokStrFast(tok token) string {
return _token_name[_token_index[tok-1]:_token_index[tok]] return _token_name[_token_index[tok-1]:_token_index[tok]]
} }
func (s *scanner) isIdentRune(c rune, first bool) bool { func (s *scanner) atIdentChar(first bool) bool {
switch { switch {
case unicode.IsLetter(c) || c == '_': case unicode.IsLetter(s.ch) || s.ch == '_':
// ok // ok
case unicode.IsDigit(c): case unicode.IsDigit(s.ch):
if first { if first {
s.errorf("identifier cannot begin with digit %#U", c) s.errorf("identifier cannot begin with digit %#U", s.ch)
} }
case c >= utf8.RuneSelf: case s.ch >= utf8.RuneSelf:
s.errorf("invalid character %#U in identifier", c) s.errorf("invalid character %#U in identifier", s.ch)
default: default:
return false return false
} }
@ -411,46 +422,45 @@ func init() {
} }
} }
func lower(c rune) rune { return ('a' - 'A') | c } // returns lower-case c iff c is ASCII letter func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
func isDecimal(c rune) bool { return '0' <= c && c <= '9' } func isLetter(ch rune) bool { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' }
func isHex(c rune) bool { return '0' <= c && c <= '9' || 'a' <= lower(c) && lower(c) <= 'f' } func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
// digits accepts the sequence { digit | '_' } starting with c0. // digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records // If base <= 10, digits accepts any decimal digit but records
// the index (relative to the literal start) of a digit >= base // the index (relative to the literal start) of a digit >= base
// in *invalid, if *invalid < 0. // in *invalid, if *invalid < 0.
// digits returns the first rune that is not part of the sequence // digits returns a bitset describing whether the sequence contained
// anymore, and a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set). // digits (bit 0 is set), or separators '_' (bit 1 is set).
func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) { func (s *scanner) digits(base int, invalid *int) (digsep int) {
c = c0
if base <= 10 { if base <= 10 {
max := rune('0' + base) max := rune('0' + base)
for isDecimal(c) || c == '_' { for isDecimal(s.ch) || s.ch == '_' {
ds := 1 ds := 1
if c == '_' { if s.ch == '_' {
ds = 2 ds = 2
} else if c >= max && *invalid < 0 { } else if s.ch >= max && *invalid < 0 {
*invalid = int(s.col0 - s.col) // record invalid rune index _, col := s.pos()
*invalid = int(col - s.col) // record invalid rune index
} }
digsep |= ds digsep |= ds
c = s.getr() s.nextch()
} }
} else { } else {
for isHex(c) || c == '_' { for isHex(s.ch) || s.ch == '_' {
ds := 1 ds := 1
if c == '_' { if s.ch == '_' {
ds = 2 ds = 2
} }
digsep |= ds digsep |= ds
c = s.getr() s.nextch()
} }
} }
return return
} }
func (s *scanner) number(c rune) { func (s *scanner) number(seenPoint bool) {
s.startLit()
s.bad = false s.bad = false
base := 10 // number base base := 10 // number base
@ -459,38 +469,39 @@ func (s *scanner) number(c rune) {
invalid := -1 // index of invalid digit in literal, or < 0 invalid := -1 // index of invalid digit in literal, or < 0
// integer part // integer part
var ds int if !seenPoint {
if c != '.' {
s.kind = IntLit s.kind = IntLit
if c == '0' { if s.ch == '0' {
c = s.getr() s.nextch()
switch lower(c) { switch lower(s.ch) {
case 'x': case 'x':
c = s.getr() s.nextch()
base, prefix = 16, 'x' base, prefix = 16, 'x'
case 'o': case 'o':
c = s.getr() s.nextch()
base, prefix = 8, 'o' base, prefix = 8, 'o'
case 'b': case 'b':
c = s.getr() s.nextch()
base, prefix = 2, 'b' base, prefix = 2, 'b'
default: default:
base, prefix = 8, '0' base, prefix = 8, '0'
digsep = 1 // leading 0 digsep = 1 // leading 0
} }
} }
c, ds = s.digits(c, base, &invalid) digsep |= s.digits(base, &invalid)
digsep |= ds if s.ch == '.' {
if prefix == 'o' || prefix == 'b' {
s.errorf("invalid radix point in %s", litname(prefix))
}
s.nextch()
seenPoint = true
}
} }
// fractional part // fractional part
if c == '.' { if seenPoint {
s.kind = FloatLit s.kind = FloatLit
if prefix == 'o' || prefix == 'b' { digsep |= s.digits(base, &invalid)
s.errorf("invalid radix point in %s", litname(prefix))
}
c, ds = s.digits(s.getr(), base, &invalid)
digsep |= ds
} }
if digsep&1 == 0 && !s.bad { if digsep&1 == 0 && !s.bad {
@ -498,23 +509,22 @@ func (s *scanner) number(c rune) {
} }
// exponent // exponent
if e := lower(c); e == 'e' || e == 'p' { if e := lower(s.ch); e == 'e' || e == 'p' {
if !s.bad { if !s.bad {
switch { switch {
case e == 'e' && prefix != 0 && prefix != '0': case e == 'e' && prefix != 0 && prefix != '0':
s.errorf("%q exponent requires decimal mantissa", c) s.errorf("%q exponent requires decimal mantissa", s.ch)
case e == 'p' && prefix != 'x': case e == 'p' && prefix != 'x':
s.errorf("%q exponent requires hexadecimal mantissa", c) s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
} }
} }
c = s.getr() s.nextch()
s.kind = FloatLit s.kind = FloatLit
if c == '+' || c == '-' { if s.ch == '+' || s.ch == '-' {
c = s.getr() s.nextch()
} }
c, ds = s.digits(c, 10, nil) digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
digsep |= ds if digsep&1 == 0 && !s.bad {
if ds&1 == 0 && !s.bad {
s.errorf("exponent has no digits") s.errorf("exponent has no digits")
} }
} else if prefix == 'x' && s.kind == FloatLit && !s.bad { } else if prefix == 'x' && s.kind == FloatLit && !s.bad {
@ -522,14 +532,13 @@ func (s *scanner) number(c rune) {
} }
// suffix 'i' // suffix 'i'
if c == 'i' { if s.ch == 'i' {
s.kind = ImagLit s.kind = ImagLit
c = s.getr() s.nextch()
} }
s.ungetr()
s.nlsemi = true s.nlsemi = true
s.lit = string(s.stopLit()) s.lit = string(s.segment())
s.tok = _Literal s.tok = _Literal
if s.kind == IntLit && invalid >= 0 && !s.bad { if s.kind == IntLit && invalid >= 0 && !s.bad {
@ -596,199 +605,195 @@ func invalidSep(x string) int {
} }
func (s *scanner) rune() { func (s *scanner) rune() {
s.startLit()
s.bad = false s.bad = false
s.nextch()
n := 0 n := 0
for ; ; n++ { for ; ; n++ {
r := s.getr() if s.ch == '\'' {
if r == '\'' { if !s.bad {
if n == 0 {
s.errorf("empty rune literal or unescaped '")
} else if n != 1 {
s.errorAtf(0, "more than one character in rune literal")
}
}
s.nextch()
break break
} }
if r == '\\' { if s.ch == '\\' {
s.nextch()
s.escape('\'') s.escape('\'')
continue continue
} }
if r == '\n' { if s.ch == '\n' {
s.ungetr() // assume newline is not part of literal
if !s.bad { if !s.bad {
s.errorf("newline in rune literal") s.errorf("newline in rune literal")
} }
break break
} }
if r < 0 { if s.ch < 0 {
if !s.bad { if !s.bad {
s.errorAtf(0, "rune literal not terminated") s.errorAtf(0, "rune literal not terminated")
} }
break break
} }
} s.nextch()
if !s.bad {
if n == 0 {
s.errorf("empty rune literal or unescaped '")
} else if n != 1 {
s.errorAtf(0, "more than one character in rune literal")
}
} }
s.nlsemi = true s.nlsemi = true
s.lit = string(s.stopLit()) s.lit = string(s.segment())
s.kind = RuneLit s.kind = RuneLit
s.tok = _Literal s.tok = _Literal
} }
func (s *scanner) stdString() { func (s *scanner) stdString() {
s.startLit()
s.bad = false s.bad = false
s.nextch()
for { for {
r := s.getr() if s.ch == '"' {
if r == '"' { s.nextch()
break break
} }
if r == '\\' { if s.ch == '\\' {
s.nextch()
s.escape('"') s.escape('"')
continue continue
} }
if r == '\n' { if s.ch == '\n' {
s.ungetr() // assume newline is not part of literal
s.errorf("newline in string") s.errorf("newline in string")
break break
} }
if r < 0 { if s.ch < 0 {
s.errorAtf(0, "string not terminated") s.errorAtf(0, "string not terminated")
break break
} }
s.nextch()
} }
s.nlsemi = true s.nlsemi = true
s.lit = string(s.stopLit()) s.lit = string(s.segment())
s.kind = StringLit s.kind = StringLit
s.tok = _Literal s.tok = _Literal
} }
func (s *scanner) rawString() { func (s *scanner) rawString() {
s.startLit()
s.bad = false s.bad = false
s.nextch()
for { for {
r := s.getr() if s.ch == '`' {
if r == '`' { s.nextch()
break break
} }
if r < 0 { if s.ch < 0 {
s.errorAtf(0, "string not terminated") s.errorAtf(0, "string not terminated")
break break
} }
s.nextch()
} }
// We leave CRs in the string since they are part of the // We leave CRs in the string since they are part of the
// literal (even though they are not part of the literal // literal (even though they are not part of the literal
// value). // value).
s.nlsemi = true s.nlsemi = true
s.lit = string(s.stopLit()) s.lit = string(s.segment())
s.kind = StringLit s.kind = StringLit
s.tok = _Literal s.tok = _Literal
} }
func (s *scanner) comment(text string) { func (s *scanner) comment(text string) {
s.errh(s.line, s.col, text) s.errorAtf(0, text)
} }
func (s *scanner) skipLine(r rune) { func (s *scanner) skipLine() {
for r >= 0 { // don't consume '\n' - needed for nlsemi logic
if r == '\n' { for s.ch >= 0 && s.ch != '\n' {
s.ungetr() // don't consume '\n' - needed for nlsemi logic s.nextch()
break
}
r = s.getr()
} }
} }
func (s *scanner) lineComment() { func (s *scanner) lineComment() {
r := s.getr() // opening has already been consumed
if s.mode&comments != 0 { if s.mode&comments != 0 {
s.startLit() s.skipLine()
s.skipLine(r) s.comment(string(s.segment()))
s.comment("//" + string(s.stopLit()))
return return
} }
// directives must start at the beginning of the line (s.col == colbase) // directives must start at the beginning of the line (s.col == colbase)
if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') { if s.mode&directives == 0 || s.col != colbase || (s.ch != 'g' && s.ch != 'l') {
s.skipLine(r) s.stop()
s.skipLine()
return return
} }
// recognize go: or line directives // recognize go: or line directives
prefix := "go:" prefix := "go:"
if r == 'l' { if s.ch == 'l' {
prefix = "line " prefix = "line "
} }
for _, m := range prefix { for _, m := range prefix {
if r != m { if s.ch != m {
s.skipLine(r) s.stop()
s.skipLine()
return return
} }
r = s.getr() s.nextch()
} }
// directive text // directive text
s.startLit() s.skipLine()
s.skipLine(r) s.comment(string(s.segment()))
s.comment("//" + prefix + string(s.stopLit()))
} }
func (s *scanner) skipComment(r rune) bool { func (s *scanner) skipComment() bool {
for r >= 0 { for s.ch >= 0 {
for r == '*' { for s.ch == '*' {
r = s.getr() s.nextch()
if r == '/' { if s.ch == '/' {
s.nextch()
return true return true
} }
} }
r = s.getr() s.nextch()
} }
s.errorAtf(0, "comment not terminated") s.errorAtf(0, "comment not terminated")
return false return false
} }
func (s *scanner) fullComment() { func (s *scanner) fullComment() {
r := s.getr() /* opening has already been consumed */
if s.mode&comments != 0 { if s.mode&comments != 0 {
s.startLit() if s.skipComment() {
if s.skipComment(r) { s.comment(string(s.segment()))
s.comment("/*" + string(s.stopLit()))
} else {
s.killLit() // not a complete comment - ignore
} }
return return
} }
if s.mode&directives == 0 || r != 'l' { if s.mode&directives == 0 || s.ch != 'l' {
s.skipComment(r) s.stop()
s.skipComment()
return return
} }
// recognize line directive // recognize line directive
const prefix = "line " const prefix = "line "
for _, m := range prefix { for _, m := range prefix {
if r != m { if s.ch != m {
s.skipComment(r) s.stop()
s.skipComment()
return return
} }
r = s.getr() s.nextch()
} }
// directive text // directive text
s.startLit() if s.skipComment() {
if s.skipComment(r) { s.comment(string(s.segment()))
s.comment("/*" + prefix + string(s.stopLit()))
} else {
s.killLit() // not a complete comment - ignore
} }
} }
@ -796,23 +801,23 @@ func (s *scanner) escape(quote rune) {
var n int var n int
var base, max uint32 var base, max uint32
c := s.getr() switch s.ch {
switch c { case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: s.nextch()
return return
case '0', '1', '2', '3', '4', '5', '6', '7': case '0', '1', '2', '3', '4', '5', '6', '7':
n, base, max = 3, 8, 255 n, base, max = 3, 8, 255
case 'x': case 'x':
c = s.getr() s.nextch()
n, base, max = 2, 16, 255 n, base, max = 2, 16, 255
case 'u': case 'u':
c = s.getr() s.nextch()
n, base, max = 4, 16, unicode.MaxRune n, base, max = 4, 16, unicode.MaxRune
case 'U': case 'U':
c = s.getr() s.nextch()
n, base, max = 8, 16, unicode.MaxRune n, base, max = 8, 16, unicode.MaxRune
default: default:
if c < 0 { if s.ch < 0 {
return // complain in caller about EOF return // complain in caller about EOF
} }
s.errorf("unknown escape") s.errorf("unknown escape")
@ -821,30 +826,27 @@ func (s *scanner) escape(quote rune) {
var x uint32 var x uint32
for i := n; i > 0; i-- { for i := n; i > 0; i-- {
if s.ch < 0 {
return // complain in caller about EOF
}
d := base d := base
switch { if isDecimal(s.ch) {
case isDecimal(c): d = uint32(s.ch) - '0'
d = uint32(c) - '0' } else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
case 'a' <= lower(c) && lower(c) <= 'f': d = uint32(lower(s.ch)) - 'a' + 10
d = uint32(lower(c)) - ('a' - 10)
} }
if d >= base { if d >= base {
if c < 0 {
return // complain in caller about EOF
}
kind := "hex" kind := "hex"
if base == 8 { if base == 8 {
kind = "octal" kind = "octal"
} }
s.errorf("invalid character %q in %s escape", c, kind) s.errorf("invalid character %q in %s escape", s.ch, kind)
s.ungetr()
return return
} }
// d < base // d < base
x = x*base + d x = x*base + d
c = s.getr() s.nextch()
} }
s.ungetr()
if x > max && base == 8 { if x > max && base == 8 {
s.errorf("octal escape value %d > 255", x) s.errorf("octal escape value %d > 255", x)

View file

@ -19,8 +19,8 @@ func errh(line, col uint, msg string) {
// Don't bother with other tests if TestSmoke doesn't pass. // Don't bother with other tests if TestSmoke doesn't pass.
func TestSmoke(t *testing.T) { func TestSmoke(t *testing.T) {
const src = "if (+foo\t+=..123/***/4.2_0e-0i'a'`raw`\"string\" ;//$" const src = "if (+foo\t+=..123/***/0.9_0e-0i'a'`raw`\"string\"..f;//$"
tokens := []token{_If, _Lparen, _Operator, _Name, _AssignOp, _Dot, _Literal, _Literal, _Literal, _Literal, _Literal, _Semi, _EOF} tokens := []token{_If, _Lparen, _Operator, _Name, _AssignOp, _Dot, _Literal, _Literal, _Literal, _Literal, _Literal, _Dot, _Dot, _Name, _Semi, _EOF}
var got scanner var got scanner
got.init(strings.NewReader(src), errh, 0) got.init(strings.NewReader(src), errh, 0)

View file

@ -3,11 +3,10 @@
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// This file implements source, a buffered rune reader // This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner: // specialized for scanning Go code: Reading
// Contiguous sequences of runes (literals) are extracted // ASCII characters, maintaining current (line, col)
// directly as []byte without the need to re-encode the // position information, and recording of the most
// runes in UTF-8 (as would be necessary with bufio.Reader). // recently read source segment are highly optimized.
//
// This file is self-contained (go tool compile source.go // This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package. // compiles) and thus could be made into its own package.
@ -18,202 +17,202 @@ import (
"unicode/utf8" "unicode/utf8"
) )
// The source buffer is accessed using three indices b (begin),
// r (read), and e (end):
//
// - If b >= 0, it points to the beginning of a segment of most
// recently read characters (typically a Go literal).
//
// - r points to the byte immediately following the most recently
// read character ch, which starts at r-chw.
//
// - e points to the byte immediately following the last byte that
// was read into the buffer.
//
// The buffer content is terminated at buf[e] with the sentinel
// character utf8.RuneSelf. This makes it possible to test for
// the common case of ASCII characters with a single 'if' (see
// nextch method).
//
// +------ content in use -------+
// v v
// buf [...read...|...segment...|ch|...unread...|s|...free...]
// ^ ^ ^ ^
// | | | |
// b r-chw r e
//
// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
type source struct {
in io.Reader
errh func(line, col uint, msg string)
buf []byte // source buffer
ioerr error // pending I/O error, or nil
b, r, e int // buffer indices (see comment above)
line, col uint // source position of ch (0-based)
ch rune // most recently read character
chw int // width of ch
}
const sentinel = utf8.RuneSelf
func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
s.in = in
s.errh = errh
if s.buf == nil {
s.buf = make([]byte, nextSize(0))
}
s.buf[0] = sentinel
s.ioerr = nil
s.b, s.r, s.e = -1, 0, 0
s.line, s.col = 0, 0
s.ch = ' '
s.chw = 0
}
// starting points for line and column numbers // starting points for line and column numbers
const linebase = 1 const linebase = 1
const colbase = 1 const colbase = 1
// max. number of bytes to unread // pos returns the (line, col) source position of s.ch.
const maxunread = 10 func (s *source) pos() (line, col uint) {
return linebase + s.line, colbase + s.col
// buf [...read...|...|...unread...|s|...free...]
// ^ ^ ^ ^
// | | | |
// suf r0 r w
type source struct {
src io.Reader
errh func(line, pos uint, msg string)
// source buffer
buf [4 << 10]byte
r0, r, w int // previous/current read and write buf positions, excluding sentinel
line0, line uint // previous/current line
col0, col uint // previous/current column (byte offsets from line start)
ioerr error // pending io error
// literal buffer
lit []byte // literal prefix
suf int // literal suffix; suf >= 0 means we are scanning a literal
}
// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
s.src = src
s.errh = errh
s.buf[0] = utf8.RuneSelf // terminate with sentinel
s.r0, s.r, s.w = 0, 0, 0
s.line0, s.line = 0, linebase
s.col0, s.col = 0, colbase
s.ioerr = nil
s.lit = s.lit[:0]
s.suf = -1
}
// ungetr sets the reading position to a previous reading
// position, usually the one of the most recently read
// rune, but possibly earlier (see unread below).
func (s *source) ungetr() {
s.r, s.line, s.col = s.r0, s.line0, s.col0
}
// unread moves the previous reading position to a position
// that is n bytes earlier in the source. The next ungetr
// call will set the reading position to that moved position.
// The "unread" runes must be single byte and not contain any
// newlines; and 0 <= n <= maxunread must hold.
func (s *source) unread(n int) {
s.r0 -= n
s.col0 -= uint(n)
} }
// error reports the error msg at source position s.pos().
func (s *source) error(msg string) { func (s *source) error(msg string) {
s.errh(s.line0, s.col0, msg) line, col := s.pos()
s.errh(line, col, msg)
} }
// getr reads and returns the next rune. // start starts a new active source segment (including s.ch).
// // As long as stop has not been called, the active segment's
// If a read or source encoding error occurs, getr // bytes (excluding s.ch) may be retrieved by calling segment.
// calls the error handler installed with init. func (s *source) start() { s.b = s.r - s.chw }
// The handler must exist. func (s *source) stop() { s.b = -1 }
// func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
// The (line, col) position passed to the error handler
// is always at the current source reading position. // rewind rewinds the scanner's read position and character s.ch
func (s *source) getr() rune { // to the start of the currently active segment, which must not
// contain any newlines (otherwise position information will be
// incorrect). Currently, rewind is only needed for handling the
// source sequence ".."; it must not be called outside an active
// segment.
func (s *source) rewind() {
// ok to verify precondition - rewind is rarely called
if s.b < 0 {
panic("no active segment")
}
s.col -= uint(s.r - s.b)
s.r = s.b
s.nextch()
}
func (s *source) nextch() {
redo: redo:
s.r0, s.line0, s.col0 = s.r, s.line, s.col s.col += uint(s.chw)
if s.ch == '\n' {
// We could avoid at least one test that is always taken in the s.line++
// for loop below by duplicating the common case code (ASCII) s.col = 0
// here since we always have at least the sentinel (utf8.RuneSelf)
// in the buffer. Measure and optimize if necessary.
// make sure we have at least one rune in buffer, or we are at EOF
for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
s.fill() // s.w-s.r < len(s.buf) => buffer is not full
} }
// common case: ASCII and enough bytes // fast common case: at least one ASCII character
// (invariant: s.buf[s.w] == utf8.RuneSelf) if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
if b := s.buf[s.r]; b < utf8.RuneSelf {
s.r++ s.r++
// TODO(gri) Optimization: Instead of adjusting s.col for each character, s.chw = 1
// remember the line offset instead and then compute the offset as needed if s.ch == 0 {
// (which is less often).
s.col++
if b == 0 {
s.error("invalid NUL character") s.error("invalid NUL character")
goto redo goto redo
} }
if b == '\n' { return
s.line++ }
s.col = colbase
} // slower general case: add more bytes to buffer if we don't have a full rune
return rune(b) for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
s.fill()
} }
// EOF // EOF
if s.r == s.w { if s.r == s.e {
if s.ioerr != io.EOF { if s.ioerr != io.EOF {
// ensure we never start with a '/' (e.g., rooted path) in the error message // ensure we never start with a '/' (e.g., rooted path) in the error message
s.error("I/O error: " + s.ioerr.Error()) s.error("I/O error: " + s.ioerr.Error())
s.ioerr = nil
} }
return -1 s.ch = -1
s.chw = 0
return
} }
// uncommon case: not ASCII s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
r, w := utf8.DecodeRune(s.buf[s.r:s.w]) s.r += s.chw
s.r += w
s.col += uint(w)
if r == utf8.RuneError && w == 1 { if s.ch == utf8.RuneError && s.chw == 1 {
s.error("invalid UTF-8 encoding") s.error("invalid UTF-8 encoding")
goto redo goto redo
} }
// BOM's are only allowed as the first character in a file // BOM's are only allowed as the first character in a file
const BOM = 0xfeff const BOM = 0xfeff
if r == BOM { if s.ch == BOM {
if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread) if s.line > 0 || s.col > 0 {
s.error("invalid BOM in the middle of the file") s.error("invalid BOM in the middle of the file")
} }
goto redo goto redo
} }
return r
} }
// fill reads more source bytes into s.buf.
// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
func (s *source) fill() { func (s *source) fill() {
// Slide unread bytes to beginning but preserve last read char // determine content to preserve
// (for one ungetr call) plus maxunread extra bytes (for one b := s.r
// unread call). if s.b >= 0 {
if s.r0 > maxunread { b = s.b
n := s.r0 - maxunread // number of bytes to slide down s.b = 0 // after buffer has grown or content has been moved down
// save literal prefix, if any
// (make sure we keep maxunread bytes and the last
// read char in the buffer)
if s.suf >= 0 {
// we have a literal
if s.suf < n {
// save literal prefix
s.lit = append(s.lit, s.buf[s.suf:n]...)
s.suf = 0
} else {
s.suf -= n
}
}
copy(s.buf[:], s.buf[n:s.w])
s.r0 = maxunread // eqv: s.r0 -= n
s.r -= n
s.w -= n
} }
content := s.buf[b:s.e]
// grow buffer or move content down
if len(content)*2 > len(s.buf) {
s.buf = make([]byte, nextSize(len(s.buf)))
copy(s.buf, content)
} else if b > 0 {
copy(s.buf, content)
}
s.r -= b
s.e -= b
// read more data: try a limited number of times // read more data: try a limited number of times
for i := 100; i > 0; i-- { for i := 0; i < 10; i++ {
n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel var n int
n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
if n < 0 { if n < 0 {
panic("negative read") // incorrect underlying io.Reader implementation panic("negative read") // incorrect underlying io.Reader implementation
} }
s.w += n if n > 0 || s.ioerr != nil {
if n > 0 || err != nil { s.e += n
s.buf[s.w] = utf8.RuneSelf // sentinel s.buf[s.e] = sentinel
if err != nil {
s.ioerr = err
}
return return
} }
// n == 0
} }
s.buf[s.w] = utf8.RuneSelf // sentinel s.buf[s.e] = sentinel
s.ioerr = io.ErrNoProgress s.ioerr = io.ErrNoProgress
} }
func (s *source) startLit() { // nextSize returns the next bigger size for a buffer of a given size.
s.suf = s.r0 func nextSize(size int) int {
s.lit = s.lit[:0] // reuse lit const min = 4 << 10 // 4K: minimum buffer size
} const max = 1 << 20 // 1M: maximum buffer size which is still doubled
if size < min {
func (s *source) stopLit() []byte { return min
lit := s.buf[s.suf:s.r]
if len(s.lit) > 0 {
lit = append(s.lit, lit...)
} }
s.killLit() if size <= max {
return lit return size << 1
} }
return size + max
func (s *source) killLit() {
s.suf = -1 // no pending literal
} }