mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
encoding/csv: simplify and optimize Reader
The Reader implementation is slow because it operates on a rune-by-rune basis via bufio.Reader.ReadRune. We speed this up by operating on entire lines that we read from bufio.Reader.ReadSlice. In order to ensure that we read the full line, we augment ReadSlice in our Reader.readLine method to automatically expand the slice if bufio.ErrBufferFull is every hit. This change happens to fix #19410 because it no longer relies on rune-by-rune parsing and only searches for the relevant delimiter rune. In order to keep column accounting simple and consistent, this change reverts parts of CL 52830. This CL is an alternative to CL 36270 and builds on some of the ideas from that change by Diogo Pinela. name old time/op new time/op delta Read-8 3.12µs ± 1% 2.54µs ± 2% -18.76% (p=0.000 n=10+9) ReadWithFieldsPerRecord-8 3.12µs ± 1% 2.53µs ± 1% -18.91% (p=0.000 n=9+9) ReadWithoutFieldsPerRecord-8 3.13µs ± 0% 2.57µs ± 3% -18.07% (p=0.000 n=10+10) ReadLargeFields-8 52.3µs ± 1% 5.3µs ± 2% -89.93% (p=0.000 n=10+9) ReadReuseRecord-8 2.05µs ± 1% 1.40µs ± 1% -31.48% (p=0.000 n=10+9) ReadReuseRecordWithFieldsPerRecord-8 2.05µs ± 1% 1.41µs ± 0% -31.03% (p=0.000 n=10+9) ReadReuseRecordWithoutFieldsPerRecord-8 2.06µs ± 1% 1.40µs ± 1% -31.70% (p=0.000 n=9+10) ReadReuseRecordLargeFields-8 50.9µs ± 0% 4.1µs ± 3% -92.01% (p=0.000 n=10+10) name old alloc/op new alloc/op Read-8 664B ± 0% 664B ± 0% ReadWithFieldsPerRecord-8 664B ± 0% 664B ± 0% ReadWithoutFieldsPerRecord-8 664B ± 0% 664B ± 0% ReadLargeFields-8 3.94kB ± 0% 3.94kB ± 0% ReadReuseRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordWithFieldsPerRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordWithoutFieldsPerRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordLargeFields-8 2.98kB ± 0% 2.98kB ± 0% name old allocs/op new allocs/op Read-8 18.0 ± 0% 18.0 ± 0% ReadWithFieldsPerRecord-8 18.0 ± 0% 18.0 ± 0% ReadWithoutFieldsPerRecord-8 18.0 ± 0% 18.0 ± 0% ReadLargeFields-8 24.0 ± 0% 24.0 ± 0% ReadReuseRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordWithFieldsPerRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordWithoutFieldsPerRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordLargeFields-8 12.0 ± 0% 12.0 ± 0% Updates #22352 Updates #19019 Fixes #16791 Fixes #19410 Change-Id: I31c27cfcc56880e6abac262f36c947179b550bbf Reviewed-on: https://go-review.googlesource.com/72150 Reviewed-by: Ian Lance Taylor <iant@golang.org> Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
23aad448b1
commit
89ccfe4962
2 changed files with 293 additions and 275 deletions
|
|
@ -26,9 +26,8 @@ var readTests = []struct {
|
|||
TrimLeadingSpace bool
|
||||
ReuseRecord bool
|
||||
|
||||
Error string
|
||||
Line int // Expected error line if != 0
|
||||
Column int // Expected error column if line != 0
|
||||
Error error
|
||||
Line int // Expected error line if != 0
|
||||
}{
|
||||
{
|
||||
Name: "Simple",
|
||||
|
|
@ -140,7 +139,7 @@ field"`,
|
|||
{
|
||||
Name: "BadDoubleQuotes",
|
||||
Input: `a""b,c`,
|
||||
Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
|
||||
Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
|
||||
},
|
||||
{
|
||||
Name: "TrimQuote",
|
||||
|
|
@ -151,30 +150,30 @@ field"`,
|
|||
{
|
||||
Name: "BadBareQuote",
|
||||
Input: `a "word","b"`,
|
||||
Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
|
||||
Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
|
||||
},
|
||||
{
|
||||
Name: "BadTrailingQuote",
|
||||
Input: `"a word",b"`,
|
||||
Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
|
||||
Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
|
||||
},
|
||||
{
|
||||
Name: "ExtraneousQuote",
|
||||
Input: `"a "word","b"`,
|
||||
Error: `extraneous " in field`, Line: 1, Column: 3,
|
||||
Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
|
||||
},
|
||||
{
|
||||
Name: "BadFieldCount",
|
||||
UseFieldsPerRecord: true,
|
||||
Input: "a,b,c\nd,e",
|
||||
Error: "wrong number of fields", Line: 2,
|
||||
Error: &ParseError{Line: 2, Err: ErrFieldCount},
|
||||
},
|
||||
{
|
||||
Name: "BadFieldCount1",
|
||||
UseFieldsPerRecord: true,
|
||||
FieldsPerRecord: 2,
|
||||
Input: `a,b,c`,
|
||||
Error: "wrong number of fields", Line: 1,
|
||||
Error: &ParseError{Line: 1, Err: ErrFieldCount},
|
||||
},
|
||||
{
|
||||
Name: "FieldCount",
|
||||
|
|
@ -271,18 +270,14 @@ x,,,
|
|||
},
|
||||
},
|
||||
{ // issue 19019
|
||||
Name: "RecordLine1",
|
||||
Input: "a,\"b\nc\"d,e",
|
||||
Error: `extraneous " in field`,
|
||||
Line: 1,
|
||||
Column: 1,
|
||||
Name: "RecordLine1",
|
||||
Input: "a,\"b\nc\"d,e",
|
||||
Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
|
||||
},
|
||||
{
|
||||
Name: "RecordLine2",
|
||||
Input: "a,b\n\"d\n\n,e",
|
||||
Error: `extraneous " in field`,
|
||||
Line: 2,
|
||||
Column: 2,
|
||||
Name: "RecordLine2",
|
||||
Input: "a,b\n\"d\n\n,e",
|
||||
Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
|
||||
},
|
||||
{ // issue 21201
|
||||
Name: "CRLFInQuotedField",
|
||||
|
|
@ -291,6 +286,95 @@ x,,,
|
|||
{"Hello\r\nHi"},
|
||||
},
|
||||
},
|
||||
{ // issue 19410
|
||||
Name: "BinaryBlobField",
|
||||
Input: "x09\x41\xb4\x1c,aktau",
|
||||
Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
|
||||
},
|
||||
{
|
||||
Name: "TrailingCR",
|
||||
Input: "field1,field2\r",
|
||||
Output: [][]string{{"field1", "field2\r"}},
|
||||
},
|
||||
{
|
||||
Name: "NonASCIICommaAndComment",
|
||||
TrimLeadingSpace: true,
|
||||
Comma: '£',
|
||||
Comment: '€',
|
||||
Input: "a£b,c£ \td,e\n€ comment\n",
|
||||
Output: [][]string{{"a", "b,c", "d,e"}},
|
||||
},
|
||||
{
|
||||
Name: "NonASCIICommaAndCommentWithQuotes",
|
||||
Comma: '€',
|
||||
Comment: 'λ',
|
||||
Input: "a€\" b,\"€ c\nλ comment\n",
|
||||
Output: [][]string{{"a", " b,", " c"}},
|
||||
},
|
||||
{
|
||||
Name: "NonASCIICommaConfusion",
|
||||
Comma: 'λ',
|
||||
Comment: '€',
|
||||
// λ and θ start with the same byte. This test is intended to ensure the parser doesn't
|
||||
// confuse such characters.
|
||||
Input: "\"abθcd\"λefθgh",
|
||||
Output: [][]string{{"abθcd", "efθgh"}},
|
||||
},
|
||||
{
|
||||
Name: "NonASCIICommentConfusion",
|
||||
Comment: 'θ',
|
||||
Input: "λ\nλ\nθ\nλ\n",
|
||||
Output: [][]string{{"λ"}, {"λ"}, {"λ"}},
|
||||
},
|
||||
{
|
||||
Name: "QuotedFieldMultipleLF",
|
||||
Input: "\"\n\n\n\n\"",
|
||||
Output: [][]string{{"\n\n\n\n"}},
|
||||
},
|
||||
{
|
||||
Name: "MultipleCRLF",
|
||||
Input: "\r\n\r\n\r\n\r\n",
|
||||
},
|
||||
{
|
||||
// The implementation may read each line in several chunks if it doesn't fit entirely
|
||||
// in the read buffer, so we should test the code to handle that condition.
|
||||
Name: "HugeLines",
|
||||
Comment: '#',
|
||||
Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
|
||||
Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
|
||||
},
|
||||
{
|
||||
Name: "QuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"bar\"\r\n",
|
||||
Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
|
||||
},
|
||||
{
|
||||
Name: "LazyQuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"bar\"\r\n",
|
||||
LazyQuotes: true,
|
||||
Output: [][]string{{`foo"bar`}},
|
||||
},
|
||||
{
|
||||
Name: "DoubleQuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"\"bar\"\r\n",
|
||||
Output: [][]string{{`foo"bar`}},
|
||||
},
|
||||
{
|
||||
Name: "EvenQuotes",
|
||||
Input: `""""""""`,
|
||||
Output: [][]string{{`"""`}},
|
||||
},
|
||||
{
|
||||
Name: "OddQuotes",
|
||||
Input: `"""""""`,
|
||||
Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
|
||||
},
|
||||
{
|
||||
Name: "LazyOddQuotes",
|
||||
Input: `"""""""`,
|
||||
LazyQuotes: true,
|
||||
Output: [][]string{{`"""`}},
|
||||
},
|
||||
}
|
||||
|
||||
func TestRead(t *testing.T) {
|
||||
|
|
@ -310,17 +394,10 @@ func TestRead(t *testing.T) {
|
|||
r.Comma = tt.Comma
|
||||
}
|
||||
out, err := r.ReadAll()
|
||||
perr, _ := err.(*ParseError)
|
||||
if tt.Error != "" {
|
||||
if err == nil || !strings.Contains(err.Error(), tt.Error) {
|
||||
t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
|
||||
} else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) {
|
||||
t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
|
||||
}
|
||||
} else if err != nil {
|
||||
t.Errorf("%s: unexpected error %v", tt.Name, err)
|
||||
if !reflect.DeepEqual(err, tt.Error) {
|
||||
t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error)
|
||||
} else if !reflect.DeepEqual(out, tt.Output) {
|
||||
t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
|
||||
t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue