aboutsummaryrefslogtreecommitdiff
path: root/src/encoding/csv
diff options
context:
space:
mode:
authorJustin Nuß <nuss.justin@gmail.com>2016-07-03 17:49:29 +0200
committerBrad Fitzpatrick <bradfitz@golang.org>2016-10-05 16:57:44 +0000
commitbd06d4827ae637cd08f85962f996760e76e28efc (patch)
tree1872d95a6ae41319d7b00a9c6e2b0faf6cec61cd /src/encoding/csv
parentdce0df29dd9052c0f00ce8217b9a51a84206e892 (diff)
downloadgo-bd06d4827ae637cd08f85962f996760e76e28efc.tar.gz
go-bd06d4827ae637cd08f85962f996760e76e28efc.zip
encoding/csv: avoid allocations when reading records
This commit changes parseRecord to allocate a single string per record, instead of per field, by using indexes into the raw record. Benchstat (done with f69991c17) name old time/op new time/op delta Read-8 3.17µs ± 0% 2.78µs ± 1% -12.35% (p=0.016 n=4+5) ReadWithFieldsPerRecord-8 3.18µs ± 1% 2.79µs ± 1% -12.23% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 4.59µs ± 0% 2.77µs ± 0% -39.58% (p=0.016 n=4+5) ReadLargeFields-8 57.0µs ± 0% 55.7µs ± 0% -2.18% (p=0.008 n=5+5) name old alloc/op new alloc/op delta Read-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5) ReadWithFieldsPerRecord-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 1.14kB ± 0% 0.66kB ± 0% -41.75% (p=0.008 n=5+5) ReadLargeFields-8 3.86kB ± 0% 3.94kB ± 0% +1.86% (p=0.008 n=5+5) name old allocs/op new allocs/op delta Read-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5) ReadWithFieldsPerRecord-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 50.0 ± 0% 18.0 ± 0% -64.00% (p=0.008 n=5+5) ReadLargeFields-8 66.0 ± 0% 24.0 ± 0% -63.64% (p=0.008 n=5+5) For a simple application that I wrote, which reads in a CSV file (via ReadAll) and outputs the number of rows read (15857625 rows), this change reduces the total time on my notebook from ~58 seconds to ~48 seconds. This reduces time and allocations (bytes) each by ~6% for a real world CSV file at work (~230000 rows, 13 colums). Updates #16791 Change-Id: Ia07177c94624e55cdd3064a7d2751fb69322d3e4 Reviewed-on: https://go-review.googlesource.com/24723 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/encoding/csv')
-rw-r--r--src/encoding/csv/reader.go60
1 files changed, 45 insertions, 15 deletions
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go
index a5e03a9f8e..28caa6aa27 100644
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -114,7 +114,14 @@ type Reader struct {
line int
column int
r *bufio.Reader
- field bytes.Buffer
+ // lineBuffer holds the unescaped fields read by readField, one after another.
+ // The fields can be accessed by using the indexes in fieldIndexes.
+ // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
+ // fieldIndexes will contain the indexes 0, 1, 2, 5.
+ lineBuffer bytes.Buffer
+ // Indexes of fields inside lineBuffer
+ // The i'th field starts at offset fieldIndexes[i] in lineBuffer.
+ fieldIndexes []int
}
// NewReader returns a new Reader that reads from r.
@@ -233,31 +240,54 @@ func (r *Reader) parseRecord() (fields []string, err error) {
}
r.r.UnreadRune()
+ r.lineBuffer.Reset()
+ r.fieldIndexes = r.fieldIndexes[:0]
+
// At this point we have at least one field.
for {
+ idx := r.lineBuffer.Len()
+
haveField, delim, err := r.parseField()
if haveField {
- // If FieldsPerRecord is greater than 0 we can assume the final
- // length of fields to be equal to FieldsPerRecord.
- if r.FieldsPerRecord > 0 && fields == nil {
- fields = make([]string, 0, r.FieldsPerRecord)
- }
- fields = append(fields, r.field.String())
+ r.fieldIndexes = append(r.fieldIndexes, idx)
}
+
if delim == '\n' || err == io.EOF {
- return fields, err
- } else if err != nil {
+ if len(r.fieldIndexes) == 0 {
+ return nil, err
+ }
+ break
+ }
+
+ if err != nil {
return nil, err
}
}
+
+ fieldCount := len(r.fieldIndexes)
+ // Using this approach (creating a single string and taking slices of it)
+ // means that a single reference to any of the fields will retain the whole
+ // string. The risk of a nontrivial space leak caused by this is considered
+ // minimal and a tradeoff for better performance through the combined
+ // allocations.
+ line := r.lineBuffer.String()
+ fields = make([]string, fieldCount)
+
+ for i, idx := range r.fieldIndexes {
+ if i == fieldCount-1 {
+ fields[i] = line[idx:]
+ } else {
+ fields[i] = line[idx:r.fieldIndexes[i+1]]
+ }
+ }
+
+ return fields, nil
}
// parseField parses the next field in the record. The read field is
-// located in r.field. Delim is the first character not part of the field
+// appended to r.lineBuffer. Delim is the first character not part of the field
// (r.Comma or '\n').
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
- r.field.Reset()
-
r1, err := r.readRune()
for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
r1, err = r.readRune()
@@ -310,19 +340,19 @@ func (r *Reader) parseField() (haveField bool, delim rune, err error) {
return false, 0, r.error(ErrQuote)
}
// accept the bare quote
- r.field.WriteRune('"')
+ r.lineBuffer.WriteRune('"')
}
case '\n':
r.line++
r.column = -1
}
- r.field.WriteRune(r1)
+ r.lineBuffer.WriteRune(r1)
}
default:
// unquoted field
for {
- r.field.WriteRune(r1)
+ r.lineBuffer.WriteRune(r1)
r1, err = r.readRune()
if err != nil || r1 == r.Comma {
break