cmd/compile/internal/syntax: various cleanups following CL 221603

1) Introduced setLit method to uniformly set the scanner state for
   literals instead of directly manipulating the scanner fields.

2) Use a local variable 'ok' to track validity of literals instead
   of relying on the side-effect of error reporters setting s.bad.
   More code but clearer because it is local and explicit.

3) s/litname/baseName/ and use this function uniformly, also for
   escapes. Consequently we now report always "hexadecimal" and
   not "hex" (in the case of invalid escapes).

4) Added TestDirectives verifying that we get the correct directive
   string (even if that string contains '%').

Verified that lines/s parsing performance is unchanged by comparing

go test -run StdLib -fast -skip "syntax/(scanner|scanner_test)\.go"

before and after (no relevant difference).

Change-Id: I143e4648fdaa31d1c365fb794a1cae4bc1c3f5ba
Reviewed-on: https://go-review.googlesource.com/c/go/+/222258
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
This commit is contained in:
Robert Griesemer 2020-03-05 14:14:54 -08:00
parent eafb4d8f8f
commit f6a0d72385
2 changed files with 126 additions and 67 deletions

View file

@ -50,16 +50,23 @@ func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mod
// errorf reports an error at the most recently read character position. // errorf reports an error at the most recently read character position.
func (s *scanner) errorf(format string, args ...interface{}) { func (s *scanner) errorf(format string, args ...interface{}) {
s.bad = true
s.error(fmt.Sprintf(format, args...)) s.error(fmt.Sprintf(format, args...))
} }
// errorAtf reports an error at a byte column offset relative to the current token start. // errorAtf reports an error at a byte column offset relative to the current token start.
func (s *scanner) errorAtf(offset int, format string, args ...interface{}) { func (s *scanner) errorAtf(offset int, format string, args ...interface{}) {
s.bad = true
s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...)) s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...))
} }
// setLit sets the scanner state for a recognized _Literal token.
func (s *scanner) setLit(kind LitKind, ok bool) {
s.nlsemi = true
s.tok = _Literal
s.lit = string(s.segment())
s.bad = !ok
s.kind = kind
}
// next advances the scanner by reading the next token. // next advances the scanner by reading the next token.
// //
// If a read, source encoding, or lexical error occurs, next calls // If a read, source encoding, or lexical error occurs, next calls
@ -461,8 +468,8 @@ func (s *scanner) digits(base int, invalid *int) (digsep int) {
} }
func (s *scanner) number(seenPoint bool) { func (s *scanner) number(seenPoint bool) {
s.bad = false ok := true
kind := IntLit
base := 10 // number base base := 10 // number base
prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
digsep := 0 // bit 0: digit present, bit 1: '_' present digsep := 0 // bit 0: digit present, bit 1: '_' present
@ -470,7 +477,6 @@ func (s *scanner) number(seenPoint bool) {
// integer part // integer part
if !seenPoint { if !seenPoint {
s.kind = IntLit
if s.ch == '0' { if s.ch == '0' {
s.nextch() s.nextch()
switch lower(s.ch) { switch lower(s.ch) {
@ -491,7 +497,8 @@ func (s *scanner) number(seenPoint bool) {
digsep |= s.digits(base, &invalid) digsep |= s.digits(base, &invalid)
if s.ch == '.' { if s.ch == '.' {
if prefix == 'o' || prefix == 'b' { if prefix == 'o' || prefix == 'b' {
s.errorf("invalid radix point in %s", litname(prefix)) s.errorf("invalid radix point in %s literal", baseName(base))
ok = false
} }
s.nextch() s.nextch()
seenPoint = true seenPoint = true
@ -500,68 +507,77 @@ func (s *scanner) number(seenPoint bool) {
// fractional part // fractional part
if seenPoint { if seenPoint {
s.kind = FloatLit kind = FloatLit
digsep |= s.digits(base, &invalid) digsep |= s.digits(base, &invalid)
} }
if digsep&1 == 0 && !s.bad { if digsep&1 == 0 && ok {
s.errorf("%s has no digits", litname(prefix)) s.errorf("%s literal has no digits", baseName(base))
ok = false
} }
// exponent // exponent
if e := lower(s.ch); e == 'e' || e == 'p' { if e := lower(s.ch); e == 'e' || e == 'p' {
if !s.bad { if ok {
switch { switch {
case e == 'e' && prefix != 0 && prefix != '0': case e == 'e' && prefix != 0 && prefix != '0':
s.errorf("%q exponent requires decimal mantissa", s.ch) s.errorf("%q exponent requires decimal mantissa", s.ch)
ok = false
case e == 'p' && prefix != 'x': case e == 'p' && prefix != 'x':
s.errorf("%q exponent requires hexadecimal mantissa", s.ch) s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
ok = false
} }
} }
s.nextch() s.nextch()
s.kind = FloatLit kind = FloatLit
if s.ch == '+' || s.ch == '-' { if s.ch == '+' || s.ch == '-' {
s.nextch() s.nextch()
} }
digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
if digsep&1 == 0 && !s.bad { if digsep&1 == 0 && ok {
s.errorf("exponent has no digits") s.errorf("exponent has no digits")
ok = false
} }
} else if prefix == 'x' && s.kind == FloatLit && !s.bad { } else if prefix == 'x' && kind == FloatLit && ok {
s.errorf("hexadecimal mantissa requires a 'p' exponent") s.errorf("hexadecimal mantissa requires a 'p' exponent")
ok = false
} }
// suffix 'i' // suffix 'i'
if s.ch == 'i' { if s.ch == 'i' {
s.kind = ImagLit kind = ImagLit
s.nextch() s.nextch()
} }
s.nlsemi = true s.setLit(kind, ok) // do this now so we can use s.lit below
s.lit = string(s.segment())
s.tok = _Literal
if s.kind == IntLit && invalid >= 0 && !s.bad { if kind == IntLit && invalid >= 0 && ok {
s.errorAtf(invalid, "invalid digit %q in %s", s.lit[invalid], litname(prefix)) s.errorAtf(invalid, "invalid digit %q in %s literal", s.lit[invalid], baseName(base))
ok = false
} }
if digsep&2 != 0 && !s.bad { if digsep&2 != 0 && ok {
if i := invalidSep(s.lit); i >= 0 { if i := invalidSep(s.lit); i >= 0 {
s.errorAtf(i, "'_' must separate successive digits") s.errorAtf(i, "'_' must separate successive digits")
ok = false
} }
} }
s.bad = !ok // correct s.bad
} }
func litname(prefix rune) string { func baseName(base int) string {
switch prefix { switch base {
case 'x': case 2:
return "hexadecimal literal" return "binary"
case 'o', '0': case 8:
return "octal literal" return "octal"
case 'b': case 10:
return "binary literal" return "decimal"
case 16:
return "hexadecimal"
} }
return "decimal literal" panic("invalid base")
} }
// invalidSep returns the index of the first invalid separator in x, or -1. // invalidSep returns the index of the first invalid separator in x, or -1.
@ -605,17 +621,19 @@ func invalidSep(x string) int {
} }
func (s *scanner) rune() { func (s *scanner) rune() {
s.bad = false ok := true
s.nextch() s.nextch()
n := 0 n := 0
for ; ; n++ { for ; ; n++ {
if s.ch == '\'' { if s.ch == '\'' {
if !s.bad { if ok {
if n == 0 { if n == 0 {
s.errorf("empty rune literal or unescaped '") s.errorf("empty rune literal or unescaped '")
ok = false
} else if n != 1 { } else if n != 1 {
s.errorAtf(0, "more than one character in rune literal") s.errorAtf(0, "more than one character in rune literal")
ok = false
} }
} }
s.nextch() s.nextch()
@ -623,32 +641,33 @@ func (s *scanner) rune() {
} }
if s.ch == '\\' { if s.ch == '\\' {
s.nextch() s.nextch()
s.escape('\'') if !s.escape('\'') {
ok = false
}
continue continue
} }
if s.ch == '\n' { if s.ch == '\n' {
if !s.bad { if ok {
s.errorf("newline in rune literal") s.errorf("newline in rune literal")
ok = false
} }
break break
} }
if s.ch < 0 { if s.ch < 0 {
if !s.bad { if ok {
s.errorAtf(0, "rune literal not terminated") s.errorAtf(0, "rune literal not terminated")
ok = false
} }
break break
} }
s.nextch() s.nextch()
} }
s.nlsemi = true s.setLit(RuneLit, ok)
s.lit = string(s.segment())
s.kind = RuneLit
s.tok = _Literal
} }
func (s *scanner) stdString() { func (s *scanner) stdString() {
s.bad = false ok := true
s.nextch() s.nextch()
for { for {
@ -658,28 +677,29 @@ func (s *scanner) stdString() {
} }
if s.ch == '\\' { if s.ch == '\\' {
s.nextch() s.nextch()
s.escape('"') if !s.escape('"') {
ok = false
}
continue continue
} }
if s.ch == '\n' { if s.ch == '\n' {
s.errorf("newline in string") s.errorf("newline in string")
ok = false
break break
} }
if s.ch < 0 { if s.ch < 0 {
s.errorAtf(0, "string not terminated") s.errorAtf(0, "string not terminated")
ok = false
break break
} }
s.nextch() s.nextch()
} }
s.nlsemi = true s.setLit(StringLit, ok)
s.lit = string(s.segment())
s.kind = StringLit
s.tok = _Literal
} }
func (s *scanner) rawString() { func (s *scanner) rawString() {
s.bad = false ok := true
s.nextch() s.nextch()
for { for {
@ -689,6 +709,7 @@ func (s *scanner) rawString() {
} }
if s.ch < 0 { if s.ch < 0 {
s.errorAtf(0, "string not terminated") s.errorAtf(0, "string not terminated")
ok = false
break break
} }
s.nextch() s.nextch()
@ -697,10 +718,7 @@ func (s *scanner) rawString() {
// literal (even though they are not part of the literal // literal (even though they are not part of the literal
// value). // value).
s.nlsemi = true s.setLit(StringLit, ok)
s.lit = string(s.segment())
s.kind = StringLit
s.tok = _Literal
} }
func (s *scanner) comment(text string) { func (s *scanner) comment(text string) {
@ -797,14 +815,14 @@ func (s *scanner) fullComment() {
} }
} }
func (s *scanner) escape(quote rune) { func (s *scanner) escape(quote rune) bool {
var n int var n int
var base, max uint32 var base, max uint32
switch s.ch { switch s.ch {
case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\': case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
s.nextch() s.nextch()
return return true
case '0', '1', '2', '3', '4', '5', '6', '7': case '0', '1', '2', '3', '4', '5', '6', '7':
n, base, max = 3, 8, 255 n, base, max = 3, 8, 255
case 'x': case 'x':
@ -818,16 +836,16 @@ func (s *scanner) escape(quote rune) {
n, base, max = 8, 16, unicode.MaxRune n, base, max = 8, 16, unicode.MaxRune
default: default:
if s.ch < 0 { if s.ch < 0 {
return // complain in caller about EOF return true // complain in caller about EOF
} }
s.errorf("unknown escape") s.errorf("unknown escape")
return return false
} }
var x uint32 var x uint32
for i := n; i > 0; i-- { for i := n; i > 0; i-- {
if s.ch < 0 { if s.ch < 0 {
return // complain in caller about EOF return true // complain in caller about EOF
} }
d := base d := base
if isDecimal(s.ch) { if isDecimal(s.ch) {
@ -836,12 +854,8 @@ func (s *scanner) escape(quote rune) {
d = uint32(lower(s.ch)) - 'a' + 10 d = uint32(lower(s.ch)) - 'a' + 10
} }
if d >= base { if d >= base {
kind := "hex" s.errorf("invalid character %q in %s escape", s.ch, baseName(int(base)))
if base == 8 { return false
kind = "octal"
}
s.errorf("invalid character %q in %s escape", s.ch, kind)
return
} }
// d < base // d < base
x = x*base + d x = x*base + d
@ -850,10 +864,13 @@ func (s *scanner) escape(quote rune) {
if x > max && base == 8 { if x > max && base == 8 {
s.errorf("octal escape value %d > 255", x) s.errorf("octal escape value %d > 255", x)
return return false
} }
if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ { if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
s.errorf("escape is invalid Unicode code point %#U", x) s.errorf("escape is invalid Unicode code point %#U", x)
return false
} }
return true
} }

View file

@ -613,9 +613,9 @@ func TestScanErrors(t *testing.T) {
{`'\`, "rune literal not terminated", 0, 0}, {`'\`, "rune literal not terminated", 0, 0},
{`'\'`, "rune literal not terminated", 0, 0}, {`'\'`, "rune literal not terminated", 0, 0},
{`'\x`, "rune literal not terminated", 0, 0}, {`'\x`, "rune literal not terminated", 0, 0},
{`'\x'`, "invalid character '\\'' in hex escape", 0, 3}, {`'\x'`, "invalid character '\\'' in hexadecimal escape", 0, 3},
{`'\y'`, "unknown escape", 0, 2}, {`'\y'`, "unknown escape", 0, 2},
{`'\x0'`, "invalid character '\\'' in hex escape", 0, 4}, {`'\x0'`, "invalid character '\\'' in hexadecimal escape", 0, 4},
{`'\00'`, "invalid character '\\'' in octal escape", 0, 4}, {`'\00'`, "invalid character '\\'' in octal escape", 0, 4},
{`'\377' /*`, "comment not terminated", 0, 7}, // valid octal escape {`'\377' /*`, "comment not terminated", 0, 7}, // valid octal escape
{`'\378`, "invalid character '8' in octal escape", 0, 4}, {`'\378`, "invalid character '8' in octal escape", 0, 4},
@ -633,9 +633,9 @@ func TestScanErrors(t *testing.T) {
{`"\`, "string not terminated", 0, 0}, {`"\`, "string not terminated", 0, 0},
{`"\"`, "string not terminated", 0, 0}, {`"\"`, "string not terminated", 0, 0},
{`"\x`, "string not terminated", 0, 0}, {`"\x`, "string not terminated", 0, 0},
{`"\x"`, "invalid character '\"' in hex escape", 0, 3}, {`"\x"`, "invalid character '\"' in hexadecimal escape", 0, 3},
{`"\y"`, "unknown escape", 0, 2}, {`"\y"`, "unknown escape", 0, 2},
{`"\x0"`, "invalid character '\"' in hex escape", 0, 4}, {`"\x0"`, "invalid character '\"' in hexadecimal escape", 0, 4},
{`"\00"`, "invalid character '\"' in octal escape", 0, 4}, {`"\00"`, "invalid character '\"' in octal escape", 0, 4},
{`"\377" /*`, "comment not terminated", 0, 7}, // valid octal escape {`"\377" /*`, "comment not terminated", 0, 7}, // valid octal escape
{`"\378"`, "invalid character '8' in octal escape", 0, 4}, {`"\378"`, "invalid character '8' in octal escape", 0, 4},
@ -644,8 +644,8 @@ func TestScanErrors(t *testing.T) {
{`s := "foo\z"`, "unknown escape", 0, 10}, {`s := "foo\z"`, "unknown escape", 0, 10},
{`s := "foo\z00\nbar"`, "unknown escape", 0, 10}, {`s := "foo\z00\nbar"`, "unknown escape", 0, 10},
{`"\x`, "string not terminated", 0, 0}, {`"\x`, "string not terminated", 0, 0},
{`"\x"`, "invalid character '\"' in hex escape", 0, 3}, {`"\x"`, "invalid character '\"' in hexadecimal escape", 0, 3},
{`var s string = "\x"`, "invalid character '\"' in hex escape", 0, 18}, {`var s string = "\x"`, "invalid character '\"' in hexadecimal escape", 0, 18},
{`return "\Uffffffff"`, "escape is invalid Unicode code point U+FFFFFFFF", 0, 18}, {`return "\Uffffffff"`, "escape is invalid Unicode code point U+FFFFFFFF", 0, 18},
{"0b.0", "invalid radix point in binary literal", 0, 2}, {"0b.0", "invalid radix point in binary literal", 0, 2},
@ -687,6 +687,48 @@ func TestScanErrors(t *testing.T) {
} }
} }
func TestDirectives(t *testing.T) {
for _, src := range []string{
"line",
"// line",
"//line",
"//line foo",
"//line foo%bar",
"go",
"// go:",
"//go:",
"//go :foo",
"//go:foo",
"//go:foo%bar",
} {
got := ""
var s scanner
s.init(strings.NewReader(src), func(_, col uint, msg string) {
if col != colbase {
t.Errorf("%s: got col = %d; want %d", src, col, colbase)
}
if msg == "" {
t.Errorf("%s: handler called with empty msg", src)
}
got = msg
}, directives)
s.next()
if strings.HasPrefix(src, "//line ") || strings.HasPrefix(src, "//go:") {
// handler should have been called
if got != src {
t.Errorf("got %s; want %s", got, src)
}
} else {
// handler should not have been called
if got != "" {
t.Errorf("got %s for %s", got, src)
}
}
}
}
func TestIssue21938(t *testing.T) { func TestIssue21938(t *testing.T) {
s := "/*" + strings.Repeat(" ", 4089) + "*/ .5" s := "/*" + strings.Repeat(" ", 4089) + "*/ .5"