diff options
author | Justin Nuß <nuss.justin@gmail.com> | 2016-07-03 17:49:29 +0200 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2016-10-05 16:57:44 +0000 |
commit | bd06d4827ae637cd08f85962f996760e76e28efc (patch) | |
tree | 1872d95a6ae41319d7b00a9c6e2b0faf6cec61cd /src/encoding/csv | |
parent | dce0df29dd9052c0f00ce8217b9a51a84206e892 (diff) | |
download | go-bd06d4827ae637cd08f85962f996760e76e28efc.tar.gz go-bd06d4827ae637cd08f85962f996760e76e28efc.zip |
encoding/csv: avoid allocations when reading records
This commit changes parseRecord to allocate a single string per record,
instead of per field, by using indexes into the raw record.
Benchstat (done with f69991c17)
name old time/op new time/op delta
Read-8 3.17µs ± 0% 2.78µs ± 1% -12.35% (p=0.016 n=4+5)
ReadWithFieldsPerRecord-8 3.18µs ± 1% 2.79µs ± 1% -12.23% (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8 4.59µs ± 0% 2.77µs ± 0% -39.58% (p=0.016 n=4+5)
ReadLargeFields-8 57.0µs ± 0% 55.7µs ± 0% -2.18% (p=0.008 n=5+5)
name old alloc/op new alloc/op delta
Read-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8 1.14kB ± 0% 0.66kB ± 0% -41.75% (p=0.008 n=5+5)
ReadLargeFields-8 3.86kB ± 0% 3.94kB ± 0% +1.86% (p=0.008 n=5+5)
name old allocs/op new allocs/op delta
Read-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8 50.0 ± 0% 18.0 ± 0% -64.00% (p=0.008 n=5+5)
ReadLargeFields-8 66.0 ± 0% 24.0 ± 0% -63.64% (p=0.008 n=5+5)
For a simple application that I wrote, which reads in a CSV file (via
ReadAll) and outputs the number of rows read (15857625 rows), this change
reduces the total time on my notebook from ~58 seconds to ~48 seconds.
This reduces time and allocations (bytes) each by ~6% for a real world
CSV file at work (~230000 rows, 13 colums).
Updates #16791
Change-Id: Ia07177c94624e55cdd3064a7d2751fb69322d3e4
Reviewed-on: https://go-review.googlesource.com/24723
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Diffstat (limited to 'src/encoding/csv')
-rw-r--r-- | src/encoding/csv/reader.go | 60 |
1 files changed, 45 insertions, 15 deletions
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index a5e03a9f8e..28caa6aa27 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -114,7 +114,14 @@ type Reader struct { line int column int r *bufio.Reader - field bytes.Buffer + // lineBuffer holds the unescaped fields read by readField, one after another. + // The fields can be accessed by using the indexes in fieldIndexes. + // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and + // fieldIndexes will contain the indexes 0, 1, 2, 5. + lineBuffer bytes.Buffer + // Indexes of fields inside lineBuffer + // The i'th field starts at offset fieldIndexes[i] in lineBuffer. + fieldIndexes []int } // NewReader returns a new Reader that reads from r. @@ -233,31 +240,54 @@ func (r *Reader) parseRecord() (fields []string, err error) { } r.r.UnreadRune() + r.lineBuffer.Reset() + r.fieldIndexes = r.fieldIndexes[:0] + // At this point we have at least one field. for { + idx := r.lineBuffer.Len() + haveField, delim, err := r.parseField() if haveField { - // If FieldsPerRecord is greater than 0 we can assume the final - // length of fields to be equal to FieldsPerRecord. - if r.FieldsPerRecord > 0 && fields == nil { - fields = make([]string, 0, r.FieldsPerRecord) - } - fields = append(fields, r.field.String()) + r.fieldIndexes = append(r.fieldIndexes, idx) } + if delim == '\n' || err == io.EOF { - return fields, err - } else if err != nil { + if len(r.fieldIndexes) == 0 { + return nil, err + } + break + } + + if err != nil { return nil, err } } + + fieldCount := len(r.fieldIndexes) + // Using this approach (creating a single string and taking slices of it) + // means that a single reference to any of the fields will retain the whole + // string. The risk of a nontrivial space leak caused by this is considered + // minimal and a tradeoff for better performance through the combined + // allocations. + line := r.lineBuffer.String() + fields = make([]string, fieldCount) + + for i, idx := range r.fieldIndexes { + if i == fieldCount-1 { + fields[i] = line[idx:] + } else { + fields[i] = line[idx:r.fieldIndexes[i+1]] + } + } + + return fields, nil } // parseField parses the next field in the record. The read field is -// located in r.field. Delim is the first character not part of the field +// appended to r.lineBuffer. Delim is the first character not part of the field // (r.Comma or '\n'). func (r *Reader) parseField() (haveField bool, delim rune, err error) { - r.field.Reset() - r1, err := r.readRune() for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) { r1, err = r.readRune() @@ -310,19 +340,19 @@ func (r *Reader) parseField() (haveField bool, delim rune, err error) { return false, 0, r.error(ErrQuote) } // accept the bare quote - r.field.WriteRune('"') + r.lineBuffer.WriteRune('"') } case '\n': r.line++ r.column = -1 } - r.field.WriteRune(r1) + r.lineBuffer.WriteRune(r1) } default: // unquoted field for { - r.field.WriteRune(r1) + r.lineBuffer.WriteRune(r1) r1, err = r.readRune() if err != nil || r1 == r.Comma { break |