[release-branch.go1.14] encoding/json: revert "avoid work when unquoting strings, take 2"

This reverts golang.org/cl/190659 and golang.org/cl/226218, minus the regression tests in the latter. The original work happened in golang.org/cl/151157, which was reverted in golang.org/cl/190909 due to a crash found by fuzzing. We tried a second time in golang.org/cl/190659, which shipped with Go 1.14. A bug was found, where strings would be mangled in certain edge cases. The fix for that was golang.org/cl/226218, which was backported into Go 1.14.4. Unfortunately, a second regression was just reported in #39555, which is a similar case of strings getting mangled when decoding under certain conditions. It would be possible to come up with another small patch to fix that edge case, but instead, let's just revert the entire optimization, as it has proved to do more harm than good. Moreover, it's hard to argue or prove that there will be no more such regressions. However, all the work wasn't for nothing. First, we learned that the way the decoder unquotes tokenized strings isn't simple; initially, we had wrongly assumed that each string was unquoted exactly once and in order. Second, we have gained a number of regression tests which will be useful to prevent the same mistakes in the future, including the test cases we add in this CL. For #39555. Fixes #39585. Change-Id: I66a6919c2dd6d9789232482ba6cf3814eaa70f61 Reviewed-on: https://go-review.googlesource.com/c/go/+/237838 Run-TryBot: Daniel Martí <mvdan@mvdan.cc> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Andrew Bonventre <andybons@golang.org> (cherry picked from commit 11389baf2ea0b5e920959b0aa8d406d8090a0a93) Reviewed-on: https://go-review.googlesource.com/c/go/+/241081 Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
author: Daniel Martí <mvdan@mvdan.cc> 2020-06-14 22:09:18 +0100
committer: Dmitri Shuralyov <dmitshur@golang.org> 2020-07-09 17:46:51 +0000
commit: be0254a141ebc01f3dbe56507c7b65ced00e5a46 (patch)
tree: a2d3588f2c64fe16c3516196cc0087618982bc3e
parent: efed90aedc039caffb6e412e31ee2f1fa4094bce (diff)
download: go-be0254a141ebc01f3dbe56507c7b65ced00e5a46.tar.gz
go-be0254a141ebc01f3dbe56507c7b65ced00e5a46.zip
2 files changed, 48 insertions, 42 deletions
diff --git a/src/encoding/json/decode.go b/src/encoding/json/decode.go
index b60e2bb0b2..86d8a69db7 100644
--- a/src/encoding/json/decode.go
+++ b/src/encoding/json/decode.go
@@ -213,9 +213,6 @@ type decodeState struct {
 	savedError            error
 	useNumber             bool
 	disallowUnknownFields bool
-	// safeUnquote is the number of current string literal bytes that don't
-	// need to be unquoted. When negative, no bytes need unquoting.
-	safeUnquote int
 }
 
 // readIndex returns the position of the last byte read.
@@ -317,27 +314,13 @@ func (d *decodeState) rescanLiteral() {
 Switch:
 	switch data[i-1] {
 	case '"': // string
-		// safeUnquote is initialized at -1, which means that all bytes
-		// checked so far can be unquoted at a later time with no work
-		// at all. When reaching the closing '"', if safeUnquote is
-		// still -1, all bytes can be unquoted with no work. Otherwise,
-		// only those bytes up until the first '\\' or non-ascii rune
-		// can be safely unquoted.
-		safeUnquote := -1
 		for ; i < len(data); i++ {
-			if c := data[i]; c == '\\' {
-				if safeUnquote < 0 { // first unsafe byte
-					safeUnquote = int(i - d.off)
-				}
+			switch data[i] {
+			case '\\':
 				i++ // escaped char
-			} else if c == '"' {
-				d.safeUnquote = safeUnquote
+			case '"':
 				i++ // tokenize the closing quote too
 				break Switch
-			} else if c >= utf8.RuneSelf {
-				if safeUnquote < 0 { // first unsafe byte
-					safeUnquote = int(i - d.off)
-				}
 			}
 		}
 	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
@@ -691,7 +674,7 @@ func (d *decodeState) object(v reflect.Value) error {
 		start := d.readIndex()
 		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
-		key, ok := d.unquoteBytes(item)
+		key, ok := unquoteBytes(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -892,7 +875,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
 			d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
 			return nil
 		}
-		s, ok := d.unquoteBytes(item)
+		s, ok := unquoteBytes(item)
 		if !ok {
 			if fromQuoted {
 				return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -943,7 +926,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
 		}
 
 	case '"': // string
-		s, ok := d.unquoteBytes(item)
+		s, ok := unquoteBytes(item)
 		if !ok {
 			if fromQuoted {
 				return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -1103,7 +1086,7 @@ func (d *decodeState) objectInterface() map[string]interface{} {
 		start := d.readIndex()
 		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
-		key, ok := d.unquote(item)
+		key, ok := unquote(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -1152,7 +1135,7 @@ func (d *decodeState) literalInterface() interface{} {
 		return c == 't'
 
 	case '"': // string
-		s, ok := d.unquote(item)
+		s, ok := unquote(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -1195,33 +1178,40 @@ func getu4(s []byte) rune {
 
 // unquote converts a quoted JSON string literal s into an actual string t.
 // The rules are different than for Go, so cannot use strconv.Unquote.
-// The first byte in s must be '"'.
-func (d *decodeState) unquote(s []byte) (t string, ok bool) {
-	s, ok = d.unquoteBytes(s)
+func unquote(s []byte) (t string, ok bool) {
+	s, ok = unquoteBytes(s)
 	t = string(s)
 	return
 }
 
-func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
-	// We already know that s[0] == '"'. However, we don't know that the
-	// closing quote exists in all cases, such as when the string is nested
-	// via the ",string" option.
-	if len(s) < 2 || s[len(s)-1] != '"' {
+func unquoteBytes(s []byte) (t []byte, ok bool) {
+	if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
 		return
 	}
 	s = s[1 : len(s)-1]
 
-	// If there are no unusual characters, no unquoting is needed, so return
-	// a slice of the original bytes.
-	r := d.safeUnquote
-	if r == -1 {
+	// Check for unusual characters. If there are none,
+	// then no unquoting is needed, so return a slice of the
+	// original bytes.
+	r := 0
+	for r < len(s) {
+		c := s[r]
+		if c == '\\' || c == '"' || c < ' ' {
+			break
+		}
+		if c < utf8.RuneSelf {
+			r++
+			continue
+		}
+		rr, size := utf8.DecodeRune(s[r:])
+		if rr == utf8.RuneError && size == 1 {
+			break
+		}
+		r += size
+	}
+	if r == len(s) {
 		return s, true
 	}
-	// Only perform up to one safe unquote for each re-scanned string
-	// literal. In some edge cases, the decoder unquotes a literal a second
-	// time, even after another literal has been re-scanned. Thus, only the
-	// first unquote can safely use safeUnquote.
-	d.safeUnquote = 0
 
 	b := make([]byte, len(s)+2*utf8.UTFMax)
 	w := copy(b, s[0:r])
diff --git a/src/encoding/json/decode_test.go b/src/encoding/json/decode_test.go
index a49181e982..689cc34c24 100644
--- a/src/encoding/json/decode_test.go
+++ b/src/encoding/json/decode_test.go
@@ -2459,4 +2459,20 @@ func TestUnmarshalRescanLiteralMangledUnquote(t *testing.T) {
 	if t1 != t2 {
 		t.Errorf("Marshal and Unmarshal roundtrip mismatch: want %q got %q", t1, t2)
 	}
+
+	// See golang.org/issues/39555.
+	input := map[textUnmarshalerString]string{"FOO": "", `"`: ""}
+
+	encoded, err := Marshal(input)
+	if err != nil {
+		t.Fatalf("Marshal unexpected error: %v", err)
+	}
+	var got map[textUnmarshalerString]string
+	if err := Unmarshal(encoded, &got); err != nil {
+		t.Fatalf("Unmarshal unexpected error: %v", err)
+	}
+	want := map[textUnmarshalerString]string{"foo": "", `"`: ""}
+	if !reflect.DeepEqual(want, got) {
+		t.Fatalf("Unexpected roundtrip result:\nwant: %q\ngot:  %q", want, got)
+	}
 }
author	Daniel Martí <mvdan@mvdan.cc>	2020-06-14 22:09:18 +0100
committer	Dmitri Shuralyov <dmitshur@golang.org>	2020-07-09 17:46:51 +0000
commit	be0254a141ebc01f3dbe56507c7b65ced00e5a46 (patch)
tree	a2d3588f2c64fe16c3516196cc0087618982bc3e
parent	efed90aedc039caffb6e412e31ee2f1fa4094bce (diff)
download	go-be0254a141ebc01f3dbe56507c7b65ced00e5a46.tar.gz go-be0254a141ebc01f3dbe56507c7b65ced00e5a46.zip