encoding/csv: restore Go 1.9 quoted \r\n handling in Reader

CL 52810 changed Reader to interpret a quoted \r\n as a raw \r\n when reading fields. This seems likely to break existing users, and discussion on both #21201 (the original issue that triggered the change) and #22746 (discussing whether to revert the change) failed to identify a single motivating example for this change. To avoid breaking existing users for no clear reason, revert the change. The Reader has been rewritten in the interim so this is not a git revert but instead and adjustment (and slight simplification) of the new Reader. Fixes #22746. Change-Id: Ie857b2f4b1359a207d085b6d3c3a6d440a997d12 Reviewed-on: https://go-review.googlesource.com/78295 Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
author: Russ Cox <rsc@golang.org> 2017-11-16 10:29:50 -0500
committer: Russ Cox <rsc@golang.org> 2017-11-16 16:29:37 +0000
commit: 1d47a145913f76f64414bbbc14bff3c95450535e (patch)
tree: 0444d95ec60f84cb4410590b1e06712ca99ac78a /src/encoding/csv
parent: 918b98ca707e36ec84c0494b884ff0a02c9121c2 (diff)
download: go-1d47a145913f76f64414bbbc14bff3c95450535e.tar.gz
go-1d47a145913f76f64414bbbc14bff3c95450535e.zip
3 files changed, 23 insertions, 12 deletions
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go
index 09f0dac5d0..1350f3ebdd 100644
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -99,15 +99,24 @@ func validDelim(r rune) bool {
 // As returned by NewReader, a Reader expects input conforming to RFC 4180.
 // The exported fields can be changed to customize the details before the
 // first call to Read or ReadAll.
+//
+// The Reader converts all \r\n sequences in its input to plain \n,
+// including in multiline field values, so that the returned data does
+// not depend on which line-ending convention an input file uses.
 type Reader struct {
 	// Comma is the field delimiter.
 	// It is set to comma (',') by NewReader.
+	// Comma must be a valid rune and must not be \r, \n,
+	// or the Unicode replacement character (0xFFFD).
 	Comma rune
 
 	// Comment, if not 0, is the comment character. Lines beginning with the
 	// Comment character without preceding whitespace are ignored.
 	// With leading whitespace the Comment character becomes part of the
 	// field, even if TrimLeadingSpace is true.
+	// Comment must be a valid rune and must not be \r, \n,
+	// or the Unicode replacement character (0xFFFD).
+	// It must also not be equal to Comma.
 	Comment rune
 
 	// FieldsPerRecord is the number of expected fields per record.
@@ -217,15 +226,17 @@ func (r *Reader) readLine() ([]byte, error) {
 		err = nil
 	}
 	r.numLine++
+	// Normalize \r\n to \n on all input lines.
+	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
+		line[n-2] = '\n'
+		line = line[:n-1]
+	}
 	return line, err
 }
 
-// lengthCRLF reports the number of bytes for a trailing "\r\n".
-func lengthCRLF(b []byte) int {
-	if j := len(b) - 1; j >= 0 && b[j] == '\n' {
-		if j := len(b) - 2; j >= 0 && b[j] == '\r' {
-			return 2
-		}
+// lengthNL reports the number of bytes for the trailing \n.
+func lengthNL(b []byte) int {
+	if len(b) > 0 && b[len(b)-1] == '\n' {
 		return 1
 	}
 	return 0
@@ -251,7 +262,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 			line = nil
 			continue // Skip comment lines
 		}
-		if errRead == nil && len(line) == lengthCRLF(line) {
+		if errRead == nil && len(line) == lengthNL(line) {
 			line = nil
 			continue // Skip empty lines
 		}
@@ -281,7 +292,7 @@ parseField:
 			if i >= 0 {
 				field = field[:i]
 			} else {
-				field = field[:len(field)-lengthCRLF(field)]
+				field = field[:len(field)-lengthNL(field)]
 			}
 			// Check to make sure a quote does not appear in field.
 			if !r.LazyQuotes {
@@ -317,7 +328,7 @@ parseField:
 						line = line[commaLen:]
 						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
 						continue parseField
-					case lengthCRLF(line) == len(line):
+					case lengthNL(line) == len(line):
 						// `"\n` sequence (end of line).
 						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
 						break parseField
diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go
index 69e2e2becd..d62aa77382 100644
--- a/src/encoding/csv/reader_test.go
+++ b/src/encoding/csv/reader_test.go
@@ -235,9 +235,9 @@ x,,,
 		Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
 	}, {
 		Name:  "CRLFInQuotedField", // Issue 21201
-		Input: "\"Hello\r\nHi\"",
+		Input: "A,\"Hello\r\nHi\",B\r\n",
 		Output: [][]string{
-			{"Hello\r\nHi"},
+			{"A", "Hello\nHi", "B"},
 		},
 	}, {
 		Name:   "BinaryBlobField", // Issue 19410
diff --git a/src/encoding/csv/writer.go b/src/encoding/csv/writer.go
index b23cae4517..ef3594e523 100644
--- a/src/encoding/csv/writer.go
+++ b/src/encoding/csv/writer.go
@@ -20,7 +20,7 @@ import (
 //
 // Comma is the field delimiter.
 //
-// If UseCRLF is true, the Writer ends each record with \r\n instead of \n.
+// If UseCRLF is true, the Writer ends each output line with \r\n instead of \n.
 type Writer struct {
 	Comma   rune // Field delimiter (set to ',' by NewWriter)
 	UseCRLF bool // True to use \r\n as the line terminator
author	Russ Cox <rsc@golang.org>	2017-11-16 10:29:50 -0500
committer	Russ Cox <rsc@golang.org>	2017-11-16 16:29:37 +0000
commit	1d47a145913f76f64414bbbc14bff3c95450535e (patch)
tree	0444d95ec60f84cb4410590b1e06712ca99ac78a /src/encoding/csv
parent	918b98ca707e36ec84c0494b884ff0a02c9121c2 (diff)
download	go-1d47a145913f76f64414bbbc14bff3c95450535e.tar.gz go-1d47a145913f76f64414bbbc14bff3c95450535e.zip