encoding/csv: add option to reuse slices returned by Read

In many cases the records returned by Reader.Read will only be used between calls to Read and become garbage once a new record is read. In this case, instead of allocating a new slice on each call to Read, we can reuse the last allocated slice for successive calls to avoid unnecessary allocations. This change adds a new field ReuseRecord to the Reader struct to enable this reuse. ReuseRecord is false by default to avoid breaking existing code which dependss on the current behaviour. I also added 4 new benchmarks, corresponding to the existing Read benchmarks, which set ReuseRecord to true. Benchstat on my local machine (old is ReuseRecord = false, new is ReuseRecord = true) name old time/op new time/op delta Read-8 2.75µs ± 2% 1.88µs ± 1% -31.52% (p=0.000 n=14+15) ReadWithFieldsPerRecord-8 2.75µs ± 0% 1.89µs ± 1% -31.43% (p=0.000 n=13+13) ReadWithoutFieldsPerRecord-8 2.77µs ± 1% 1.88µs ± 1% -32.06% (p=0.000 n=15+15) ReadLargeFields-8 55.4µs ± 1% 54.2µs ± 0% -2.07% (p=0.000 n=15+14) name old alloc/op new alloc/op delta Read-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadLargeFields-8 3.94kB ± 0% 2.98kB ± 0% -24.39% (p=0.000 n=15+15) name old allocs/op new allocs/op delta Read-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadLargeFields-8 24.0 ± 0% 12.0 ± 0% -50.00% (p=0.000 n=15+15) Fixes #19721 Change-Id: I79b14128bb9bb3465f53f40f93b1b528a9da6f58 Reviewed-on: https://go-review.googlesource.com/41730 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
author: Justin Nuß <nuss.justin@gmail.com> 2017-04-25 21:23:34 +0200
committer: Brad Fitzpatrick <bradfitz@golang.org> 2017-04-26 15:55:56 +0000
commit: 2181653be637cdcc7a6efee8ec0a719df1d83c00 (patch)
tree: 230724478e771bb993a82411d0b9e8278acd790f /src/encoding/csv
parent: ba8ff87dbeb87813a4604e36adb609b1e8fcb7be (diff)
download: go-2181653be637cdcc7a6efee8ec0a719df1d83c00.tar.gz
go-2181653be637cdcc7a6efee8ec0a719df1d83c00.zip
2 files changed, 80 insertions, 20 deletions
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go
index c8c4ca7758..a3497c84f9 100644
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -110,6 +110,10 @@ type Reader struct {
 	// If TrimLeadingSpace is true, leading white space in a field is ignored.
 	// This is done even if the field delimiter, Comma, is white space.
 	TrimLeadingSpace bool
+	// ReuseRecord controls whether calls to Read may return a slice sharing
+	// the backing array of the previous call's returned slice for performance.
+	// By default, each call to Read returns newly allocated memory owned by the caller.
+	ReuseRecord bool
 
 	line   int
 	column int
@@ -122,6 +126,9 @@ type Reader struct {
 	// Indexes of fields inside lineBuffer
 	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
 	fieldIndexes []int
+
+	// only used when ReuseRecord == true
+	lastRecord []string
 }
 
 // NewReader returns a new Reader that reads from r.
@@ -147,26 +154,17 @@ func (r *Reader) error(err error) error {
 // Except for that case, Read always returns either a non-nil
 // record or a non-nil error, but not both.
 // If there is no data left to be read, Read returns nil, io.EOF.
+// If ReuseRecord is true, the returned slice may be shared
+// between multiple calls to Read.
 func (r *Reader) Read() (record []string, err error) {
-	for {
-		record, err = r.parseRecord()
-		if record != nil {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
+	if r.ReuseRecord {
+		record, err = r.readRecord(r.lastRecord)
+		r.lastRecord = record
+	} else {
+		record, err = r.readRecord(nil)
 	}
 
-	if r.FieldsPerRecord > 0 {
-		if len(record) != r.FieldsPerRecord {
-			r.column = 0 // report at start of record
-			return record, r.error(ErrFieldCount)
-		}
-	} else if r.FieldsPerRecord == 0 {
-		r.FieldsPerRecord = len(record)
-	}
-	return record, nil
+	return record, err
 }
 
 // ReadAll reads all the remaining records from r.
@@ -176,7 +174,7 @@ func (r *Reader) Read() (record []string, err error) {
 // reported.
 func (r *Reader) ReadAll() (records [][]string, err error) {
 	for {
-		record, err := r.Read()
+		record, err := r.readRecord(nil)
 		if err == io.EOF {
 			return records, nil
 		}
@@ -187,6 +185,31 @@ func (r *Reader) ReadAll() (records [][]string, err error) {
 	}
 }
 
+// readRecord reads and parses a single csv record from r.
+// Unlike parseRecord, readRecord handles FieldsPerRecord.
+// If dst has enough capacity it will be used for the returned record.
+func (r *Reader) readRecord(dst []string) (record []string, err error) {
+	for {
+		record, err = r.parseRecord(dst)
+		if record != nil {
+			break
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if r.FieldsPerRecord > 0 {
+		if len(record) != r.FieldsPerRecord {
+			r.column = 0 // report at start of record
+			return record, r.error(ErrFieldCount)
+		}
+	} else if r.FieldsPerRecord == 0 {
+		r.FieldsPerRecord = len(record)
+	}
+	return record, nil
+}
+
 // readRune reads one rune from r, folding \r\n to \n and keeping track
 // of how far into the line we have read.  r.column will point to the start
 // of this rune, not the end of this rune.
@@ -223,7 +246,8 @@ func (r *Reader) skip(delim rune) error {
 }
 
 // parseRecord reads and parses a single csv record from r.
-func (r *Reader) parseRecord() (fields []string, err error) {
+// If dst has enough capacity it will be used for the returned fields.
+func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
 	// Each record starts on a new line. We increment our line
 	// number (lines start at 1, not 0) and set column to -1
 	// so as we increment in readRune it points to the character we read.
@@ -275,7 +299,12 @@ func (r *Reader) parseRecord() (fields []string, err error) {
 	// minimal and a tradeoff for better performance through the combined
 	// allocations.
 	line := r.lineBuffer.String()
-	fields = make([]string, fieldCount)
+
+	if cap(dst) >= fieldCount {
+		fields = dst[:fieldCount]
+	} else {
+		fields = make([]string, fieldCount)
+	}
 
 	for i, idx := range r.fieldIndexes {
 		if i == fieldCount-1 {
diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go
index 7b3aca4c5f..5ab1b61256 100644
--- a/src/encoding/csv/reader_test.go
+++ b/src/encoding/csv/reader_test.go
@@ -24,6 +24,7 @@ var readTests = []struct {
 	LazyQuotes       bool
 	TrailingComma    bool
 	TrimLeadingSpace bool
+	ReuseRecord      bool
 
 	Error  string
 	Line   int // Expected error line if != 0
@@ -260,6 +261,15 @@ x,,,
 			{"c", "d", "e"},
 		},
 	},
+	{
+		Name:        "ReadAllReuseRecord",
+		ReuseRecord: true,
+		Input:       "a,b\nc,d",
+		Output: [][]string{
+			{"a", "b"},
+			{"c", "d"},
+		},
+	},
 }
 
 func TestRead(t *testing.T) {
@@ -274,6 +284,7 @@ func TestRead(t *testing.T) {
 		r.LazyQuotes = tt.LazyQuotes
 		r.TrailingComma = tt.TrailingComma
 		r.TrimLeadingSpace = tt.TrimLeadingSpace
+		r.ReuseRecord = tt.ReuseRecord
 		if tt.Comma != 0 {
 			r.Comma = tt.Comma
 		}
@@ -369,3 +380,23 @@ xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzz
 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
 `, 3))
 }
+
+func BenchmarkReadReuseRecord(b *testing.B) {
+	benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData)
+}
+
+func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) {
+	benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData)
+}
+
+func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) {
+	benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData)
+}
+
+func BenchmarkReadReuseRecordLargeFields(b *testing.B) {
+	benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
+,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+`, 3))
+}
author	Justin Nuß <nuss.justin@gmail.com>	2017-04-25 21:23:34 +0200
committer	Brad Fitzpatrick <bradfitz@golang.org>	2017-04-26 15:55:56 +0000
commit	2181653be637cdcc7a6efee8ec0a719df1d83c00 (patch)
tree	230724478e771bb993a82411d0b9e8278acd790f /src/encoding/csv
parent	ba8ff87dbeb87813a4604e36adb609b1e8fcb7be (diff)
download	go-2181653be637cdcc7a6efee8ec0a719df1d83c00.tar.gz go-2181653be637cdcc7a6efee8ec0a719df1d83c00.zip