go/src/encoding/csv/reader_test.go
Joe Tsai 89ccfe4962 encoding/csv: simplify and optimize Reader
The Reader implementation is slow because it operates on a rune-by-rune
basis via bufio.Reader.ReadRune. We speed this up by operating on entire
lines that we read from bufio.Reader.ReadSlice.

In order to ensure that we read the full line, we augment ReadSlice
in our Reader.readLine method to automatically expand the slice if
bufio.ErrBufferFull is every hit.

This change happens to fix #19410 because it no longer relies on
rune-by-rune parsing and only searches for the relevant delimiter rune.

In order to keep column accounting simple and consistent, this change
reverts parts of CL 52830.

This CL is an alternative to CL 36270 and builds on some of the ideas
from that change by Diogo Pinela.

name                                     old time/op    new time/op    delta
Read-8                                   3.12µs ± 1%    2.54µs ± 2%  -18.76%   (p=0.000 n=10+9)
ReadWithFieldsPerRecord-8                3.12µs ± 1%    2.53µs ± 1%  -18.91%    (p=0.000 n=9+9)
ReadWithoutFieldsPerRecord-8             3.13µs ± 0%    2.57µs ± 3%  -18.07%  (p=0.000 n=10+10)
ReadLargeFields-8                        52.3µs ± 1%     5.3µs ± 2%  -89.93%   (p=0.000 n=10+9)
ReadReuseRecord-8                        2.05µs ± 1%    1.40µs ± 1%  -31.48%   (p=0.000 n=10+9)
ReadReuseRecordWithFieldsPerRecord-8     2.05µs ± 1%    1.41µs ± 0%  -31.03%   (p=0.000 n=10+9)
ReadReuseRecordWithoutFieldsPerRecord-8  2.06µs ± 1%    1.40µs ± 1%  -31.70%   (p=0.000 n=9+10)
ReadReuseRecordLargeFields-8             50.9µs ± 0%     4.1µs ± 3%  -92.01%  (p=0.000 n=10+10)

name                                     old alloc/op   new alloc/op
Read-8                                       664B ± 0%      664B ± 0%
ReadWithFieldsPerRecord-8                    664B ± 0%      664B ± 0%
ReadWithoutFieldsPerRecord-8                 664B ± 0%      664B ± 0%
ReadLargeFields-8                          3.94kB ± 0%    3.94kB ± 0%
ReadReuseRecord-8                           24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithFieldsPerRecord-8        24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8     24.0B ± 0%     24.0B ± 0%
ReadReuseRecordLargeFields-8               2.98kB ± 0%    2.98kB ± 0%

name                                     old allocs/op  new allocs/op
Read-8                                       18.0 ± 0%      18.0 ± 0%
ReadWithFieldsPerRecord-8                    18.0 ± 0%      18.0 ± 0%
ReadWithoutFieldsPerRecord-8                 18.0 ± 0%      18.0 ± 0%
ReadLargeFields-8                            24.0 ± 0%      24.0 ± 0%
ReadReuseRecord-8                            8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithFieldsPerRecord-8         8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8      8.00 ± 0%      8.00 ± 0%
ReadReuseRecordLargeFields-8                 12.0 ± 0%      12.0 ± 0%

Updates #22352
Updates #19019
Fixes #16791
Fixes #19410

Change-Id: I31c27cfcc56880e6abac262f36c947179b550bbf
Reviewed-on: https://go-review.googlesource.com/72150
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-10-20 23:20:46 +00:00

500 lines
13 KiB
Go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package csv
import (
"io"
"reflect"
"strings"
"testing"
)
var readTests = []struct {
Name string
Input string
Output [][]string
UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1
// These fields are copied into the Reader
Comma rune
Comment rune
FieldsPerRecord int
LazyQuotes bool
TrailingComma bool
TrimLeadingSpace bool
ReuseRecord bool
Error error
Line int // Expected error line if != 0
}{
{
Name: "Simple",
Input: "a,b,c\n",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "CRLF",
Input: "a,b\r\nc,d\r\n",
Output: [][]string{{"a", "b"}, {"c", "d"}},
},
{
Name: "BareCR",
Input: "a,b\rc,d\r\n",
Output: [][]string{{"a", "b\rc", "d"}},
},
{
Name: "RFC4180test",
UseFieldsPerRecord: true,
Input: `#field1,field2,field3
"aaa","bb
b","ccc"
"a,a","b""bb","ccc"
zzz,yyy,xxx
`,
Output: [][]string{
{"#field1", "field2", "field3"},
{"aaa", "bb\nb", "ccc"},
{"a,a", `b"bb`, "ccc"},
{"zzz", "yyy", "xxx"},
},
},
{
Name: "NoEOLTest",
Input: "a,b,c",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "Semicolon",
Comma: ';',
Input: "a;b;c\n",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "MultiLine",
Input: `"two
line","one line","three
line
field"`,
Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}},
},
{
Name: "BlankLine",
Input: "a,b,c\n\nd,e,f\n\n",
Output: [][]string{
{"a", "b", "c"},
{"d", "e", "f"},
},
},
{
Name: "BlankLineFieldCount",
Input: "a,b,c\n\nd,e,f\n\n",
UseFieldsPerRecord: true,
Output: [][]string{
{"a", "b", "c"},
{"d", "e", "f"},
},
},
{
Name: "TrimSpace",
Input: " a, b, c\n",
TrimLeadingSpace: true,
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "LeadingSpace",
Input: " a, b, c\n",
Output: [][]string{{" a", " b", " c"}},
},
{
Name: "Comment",
Comment: '#',
Input: "#1,2,3\na,b,c\n#comment",
Output: [][]string{{"a", "b", "c"}},
},
{
Name: "NoComment",
Input: "#1,2,3\na,b,c",
Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}},
},
{
Name: "LazyQuotes",
LazyQuotes: true,
Input: `a "word","1"2",a","b`,
Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}},
},
{
Name: "BareQuotes",
LazyQuotes: true,
Input: `a "word","1"2",a"`,
Output: [][]string{{`a "word"`, `1"2`, `a"`}},
},
{
Name: "BareDoubleQuotes",
LazyQuotes: true,
Input: `a""b,c`,
Output: [][]string{{`a""b`, `c`}},
},
{
Name: "BadDoubleQuotes",
Input: `a""b,c`,
Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
},
{
Name: "TrimQuote",
Input: ` "a"," b",c`,
TrimLeadingSpace: true,
Output: [][]string{{"a", " b", "c"}},
},
{
Name: "BadBareQuote",
Input: `a "word","b"`,
Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
},
{
Name: "BadTrailingQuote",
Input: `"a word",b"`,
Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
},
{
Name: "ExtraneousQuote",
Input: `"a "word","b"`,
Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
},
{
Name: "BadFieldCount",
UseFieldsPerRecord: true,
Input: "a,b,c\nd,e",
Error: &ParseError{Line: 2, Err: ErrFieldCount},
},
{
Name: "BadFieldCount1",
UseFieldsPerRecord: true,
FieldsPerRecord: 2,
Input: `a,b,c`,
Error: &ParseError{Line: 1, Err: ErrFieldCount},
},
{
Name: "FieldCount",
Input: "a,b,c\nd,e",
Output: [][]string{{"a", "b", "c"}, {"d", "e"}},
},
{
Name: "TrailingCommaEOF",
Input: "a,b,c,",
Output: [][]string{{"a", "b", "c", ""}},
},
{
Name: "TrailingCommaEOL",
Input: "a,b,c,\n",
Output: [][]string{{"a", "b", "c", ""}},
},
{
Name: "TrailingCommaSpaceEOF",
TrimLeadingSpace: true,
Input: "a,b,c, ",
Output: [][]string{{"a", "b", "c", ""}},
},
{
Name: "TrailingCommaSpaceEOL",
TrimLeadingSpace: true,
Input: "a,b,c, \n",
Output: [][]string{{"a", "b", "c", ""}},
},
{
Name: "TrailingCommaLine3",
TrimLeadingSpace: true,
Input: "a,b,c\nd,e,f\ng,hi,",
Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}},
},
{
Name: "NotTrailingComma3",
Input: "a,b,c, \n",
Output: [][]string{{"a", "b", "c", " "}},
},
{
Name: "CommaFieldTest",
TrailingComma: true,
Input: `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`,
Output: [][]string{
{"x", "y", "z", "w"},
{"x", "y", "z", ""},
{"x", "y", "", ""},
{"x", "", "", ""},
{"", "", "", ""},
{"x", "y", "z", "w"},
{"x", "y", "z", ""},
{"x", "y", "", ""},
{"x", "", "", ""},
{"", "", "", ""},
},
},
{
Name: "TrailingCommaIneffective1",
TrailingComma: true,
TrimLeadingSpace: true,
Input: "a,b,\nc,d,e",
Output: [][]string{
{"a", "b", ""},
{"c", "d", "e"},
},
},
{
Name: "TrailingCommaIneffective2",
TrailingComma: false,
TrimLeadingSpace: true,
Input: "a,b,\nc,d,e",
Output: [][]string{
{"a", "b", ""},
{"c", "d", "e"},
},
},
{
Name: "ReadAllReuseRecord",
ReuseRecord: true,
Input: "a,b\nc,d",
Output: [][]string{
{"a", "b"},
{"c", "d"},
},
},
{ // issue 19019
Name: "RecordLine1",
Input: "a,\"b\nc\"d,e",
Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
},
{
Name: "RecordLine2",
Input: "a,b\n\"d\n\n,e",
Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
},
{ // issue 21201
Name: "CRLFInQuotedField",
Input: "\"Hello\r\nHi\"",
Output: [][]string{
{"Hello\r\nHi"},
},
},
{ // issue 19410
Name: "BinaryBlobField",
Input: "x09\x41\xb4\x1c,aktau",
Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
},
{
Name: "TrailingCR",
Input: "field1,field2\r",
Output: [][]string{{"field1", "field2\r"}},
},
{
Name: "NonASCIICommaAndComment",
TrimLeadingSpace: true,
Comma: '£',
Comment: '€',
Input: "a£b,c£ \td,e\n€ comment\n",
Output: [][]string{{"a", "b,c", "d,e"}},
},
{
Name: "NonASCIICommaAndCommentWithQuotes",
Comma: '€',
Comment: 'λ',
Input: "a€\" b,\"€ c\nλ comment\n",
Output: [][]string{{"a", " b,", " c"}},
},
{
Name: "NonASCIICommaConfusion",
Comma: 'λ',
Comment: '€',
// λ and θ start with the same byte. This test is intended to ensure the parser doesn't
// confuse such characters.
Input: "\"abθcd\"λefθgh",
Output: [][]string{{"abθcd", "efθgh"}},
},
{
Name: "NonASCIICommentConfusion",
Comment: 'θ',
Input: "λ\nλ\nθ\nλ\n",
Output: [][]string{{"λ"}, {"λ"}, {"λ"}},
},
{
Name: "QuotedFieldMultipleLF",
Input: "\"\n\n\n\n\"",
Output: [][]string{{"\n\n\n\n"}},
},
{
Name: "MultipleCRLF",
Input: "\r\n\r\n\r\n\r\n",
},
{
// The implementation may read each line in several chunks if it doesn't fit entirely
// in the read buffer, so we should test the code to handle that condition.
Name: "HugeLines",
Comment: '#',
Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
},
{
Name: "QuoteWithTrailingCRLF",
Input: "\"foo\"bar\"\r\n",
Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
},
{
Name: "LazyQuoteWithTrailingCRLF",
Input: "\"foo\"bar\"\r\n",
LazyQuotes: true,
Output: [][]string{{`foo"bar`}},
},
{
Name: "DoubleQuoteWithTrailingCRLF",
Input: "\"foo\"\"bar\"\r\n",
Output: [][]string{{`foo"bar`}},
},
{
Name: "EvenQuotes",
Input: `""""""""`,
Output: [][]string{{`"""`}},
},
{
Name: "OddQuotes",
Input: `"""""""`,
Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
},
{
Name: "LazyOddQuotes",
Input: `"""""""`,
LazyQuotes: true,
Output: [][]string{{`"""`}},
},
}
func TestRead(t *testing.T) {
for _, tt := range readTests {
r := NewReader(strings.NewReader(tt.Input))
r.Comment = tt.Comment
if tt.UseFieldsPerRecord {
r.FieldsPerRecord = tt.FieldsPerRecord
} else {
r.FieldsPerRecord = -1
}
r.LazyQuotes = tt.LazyQuotes
r.TrailingComma = tt.TrailingComma
r.TrimLeadingSpace = tt.TrimLeadingSpace
r.ReuseRecord = tt.ReuseRecord
if tt.Comma != 0 {
r.Comma = tt.Comma
}
out, err := r.ReadAll()
if !reflect.DeepEqual(err, tt.Error) {
t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error)
} else if !reflect.DeepEqual(out, tt.Output) {
t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output)
}
}
}
// nTimes is an io.Reader which yields the string s n times.
type nTimes struct {
s string
n int
off int
}
func (r *nTimes) Read(p []byte) (n int, err error) {
for {
if r.n <= 0 || r.s == "" {
return n, io.EOF
}
n0 := copy(p, r.s[r.off:])
p = p[n0:]
n += n0
r.off += n0
if r.off == len(r.s) {
r.off = 0
r.n--
}
if len(p) == 0 {
return
}
}
}
// benchmarkRead measures reading the provided CSV rows data.
// initReader, if non-nil, modifies the Reader before it's used.
func benchmarkRead(b *testing.B, initReader func(*Reader), rows string) {
b.ReportAllocs()
r := NewReader(&nTimes{s: rows, n: b.N})
if initReader != nil {
initReader(r)
}
for {
_, err := r.Read()
if err == io.EOF {
break
}
if err != nil {
b.Fatal(err)
}
}
}
const benchmarkCSVData = `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`
func BenchmarkRead(b *testing.B) {
benchmarkRead(b, nil, benchmarkCSVData)
}
func BenchmarkReadWithFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = 4 }, benchmarkCSVData)
}
func BenchmarkReadWithoutFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = -1 }, benchmarkCSVData)
}
func BenchmarkReadLargeFields(b *testing.B) {
benchmarkRead(b, nil, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
`, 3))
}
func BenchmarkReadReuseRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData)
}
func BenchmarkReadReuseRecordLargeFields(b *testing.B) {
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
`, 3))
}