bytes, strings: optimize Trim for single byte cutsets

Using the latest version of all modules known by the module proxy, we determine that for all Trim usages (and related functionality): * 76.6% have cutsets of len=1, and * 13.4% have cutsets of len=2. Given that a vast majority of usages only have a cutset of len=1, we should more heavily optimize for that situation. Previously, there was some optimization for cutsets of len=1, but it's within the internal makeCutsetFunc function. This is sub-optimal as it incurs an allocation in makeCutsetFunc for the closure over that single byte. This CL removes special-casing of one-byte cutsets from makeCutsetFunc and instead distributes it directly in Trim, TrimRight, and TrimLeft. Whether we should distribute the entire ASCII cutset logic into Trim is a future CL that should be discussed and handled separately. The evidence for multibyte cutsets is not as obviously compelling. name old time/op new time/op delta bytes/TrimByte-4 84.1ns ± 2% 7.5ns ± 1% -91.10% (p=0.000 n=9+7) strings/TrimByte-4 86.2ns ± 3% 8.3ns ± 1% -90.33% (p=0.000 n=9+10) Fixes #46446 Change-Id: Ia0e31a8384c3ce111ae35465605bcec45df2ebec Reviewed-on: https://go-review.googlesource.com/c/go/+/323318 Trust: Joe Tsai <joetsai@digital-static.net> Run-TryBot: Joe Tsai <joetsai@digital-static.net> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
author: Joe Tsai <joetsai@digital-static.net> 2021-05-29 19:11:37 -0700
committer: Joe Tsai <joetsai@digital-static.net> 2021-08-25 19:29:15 +0000
commit: 5baf60d47245c792c50a349cd6b8586d23204895 (patch)
tree: 523c53fe8a2121f153cf98ca3e2599687f1f3de5 /src
parent: 3d667671ad767d66bf792c5a8d623cb829f6366a (diff)
download: go-5baf60d47245c792c50a349cd6b8586d23204895.tar.gz
go-5baf60d47245c792c50a349cd6b8586d23204895.zip
4 files changed, 64 insertions, 10 deletions
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index ce52649f13..cd859d086d 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -888,11 +888,6 @@ func (as *asciiSet) contains(c byte) bool {
 }
 
 func makeCutsetFunc(cutset string) func(r rune) bool {
-	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
-		return func(r rune) bool {
-			return r == rune(cutset[0])
-		}
-	}
 	if as, isASCII := makeASCIISet(cutset); isASCII {
 		return func(r rune) bool {
 			return r < utf8.RuneSelf && as.contains(byte(r))
@@ -911,21 +906,44 @@ func makeCutsetFunc(cutset string) func(r rune) bool {
 // Trim returns a subslice of s by slicing off all leading and
 // trailing UTF-8-encoded code points contained in cutset.
 func Trim(s []byte, cutset string) []byte {
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
+	}
 	return TrimFunc(s, makeCutsetFunc(cutset))
 }
 
 // TrimLeft returns a subslice of s by slicing off all leading
 // UTF-8-encoded code points contained in cutset.
 func TrimLeft(s []byte, cutset string) []byte {
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimLeftByte(s, cutset[0])
+	}
 	return TrimLeftFunc(s, makeCutsetFunc(cutset))
 }
 
+func trimLeftByte(s []byte, c byte) []byte {
+	for len(s) > 0 && s[0] == c {
+		s = s[1:]
+	}
+	return s
+}
+
 // TrimRight returns a subslice of s by slicing off all trailing
 // UTF-8-encoded code points that are contained in cutset.
 func TrimRight(s []byte, cutset string) []byte {
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimRightByte(s, cutset[0])
+	}
 	return TrimRightFunc(s, makeCutsetFunc(cutset))
 }
 
+func trimRightByte(s []byte, c byte) []byte {
+	for len(s) > 0 && s[len(s)-1] == c {
+		s = s[:len(s)-1]
+	}
+	return s
+}
+
 // TrimSpace returns a subslice of s by slicing off all leading and
 // trailing white space, as defined by Unicode.
 func TrimSpace(s []byte) []byte {
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index 544ee46f90..850b2ed061 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1251,7 +1251,9 @@ var trimTests = []TrimTest{
 	{"TrimLeft", "abba", "ab", ""},
 	{"TrimRight", "abba", "ab", ""},
 	{"TrimLeft", "abba", "a", "bba"},
+	{"TrimLeft", "abba", "b", "abba"},
 	{"TrimRight", "abba", "a", "abb"},
+	{"TrimRight", "abba", "b", "abba"},
 	{"Trim", "<tag>", "<>", "tag"},
 	{"Trim", "* listitem", " *", "listitem"},
 	{"Trim", `"quote"`, `"`, "quote"},
@@ -1963,6 +1965,13 @@ func BenchmarkTrimASCII(b *testing.B) {
 	}
 }
 
+func BenchmarkTrimByte(b *testing.B) {
+	x := []byte("  the quick brown fox   ")
+	for i := 0; i < b.N; i++ {
+		Trim(x, " ")
+	}
+}
+
 func BenchmarkIndexPeriodic(b *testing.B) {
 	key := []byte{1, 1}
 	for _, skip := range [...]int{2, 4, 8, 16, 32, 64} {
diff --git a/src/strings/strings.go b/src/strings/strings.go
index b429735fea..0df8d2eb28 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -818,11 +818,6 @@ func (as *asciiSet) contains(c byte) bool {
 }
 
 func makeCutsetFunc(cutset string) func(rune) bool {
-	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
-		return func(r rune) bool {
-			return r == rune(cutset[0])
-		}
-	}
 	if as, isASCII := makeASCIISet(cutset); isASCII {
 		return func(r rune) bool {
 			return r < utf8.RuneSelf && as.contains(byte(r))
@@ -837,6 +832,9 @@ func Trim(s, cutset string) string {
 	if s == "" || cutset == "" {
 		return s
 	}
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
+	}
 	return TrimFunc(s, makeCutsetFunc(cutset))
 }
 
@@ -848,9 +846,19 @@ func TrimLeft(s, cutset string) string {
 	if s == "" || cutset == "" {
 		return s
 	}
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimLeftByte(s, cutset[0])
+	}
 	return TrimLeftFunc(s, makeCutsetFunc(cutset))
 }
 
+func trimLeftByte(s string, c byte) string {
+	for len(s) > 0 && s[0] == c {
+		s = s[1:]
+	}
+	return s
+}
+
 // TrimRight returns a slice of the string s, with all trailing
 // Unicode code points contained in cutset removed.
 //
@@ -859,9 +867,19 @@ func TrimRight(s, cutset string) string {
 	if s == "" || cutset == "" {
 		return s
 	}
+	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
+		return trimRightByte(s, cutset[0])
+	}
 	return TrimRightFunc(s, makeCutsetFunc(cutset))
 }
 
+func trimRightByte(s string, c byte) string {
+	for len(s) > 0 && s[len(s)-1] == c {
+		s = s[:len(s)-1]
+	}
+	return s
+}
+
 // TrimSpace returns a slice of the string s, with all leading
 // and trailing white space removed, as defined by Unicode.
 func TrimSpace(s string) string {
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go
index 09e5b27cc3..edc6c20590 100644
--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -808,7 +808,9 @@ var trimTests = []struct {
 	{"TrimLeft", "abba", "ab", ""},
 	{"TrimRight", "abba", "ab", ""},
 	{"TrimLeft", "abba", "a", "bba"},
+	{"TrimLeft", "abba", "b", "abba"},
 	{"TrimRight", "abba", "a", "abb"},
+	{"TrimRight", "abba", "b", "abba"},
 	{"Trim", "<tag>", "<>", "tag"},
 	{"Trim", "* listitem", " *", "listitem"},
 	{"Trim", `"quote"`, `"`, "quote"},
@@ -1860,6 +1862,13 @@ func BenchmarkTrimASCII(b *testing.B) {
 	}
 }
 
+func BenchmarkTrimByte(b *testing.B) {
+	x := "  the quick brown fox   "
+	for i := 0; i < b.N; i++ {
+		Trim(x, " ")
+	}
+}
+
 func BenchmarkIndexPeriodic(b *testing.B) {
 	key := "aa"
 	for _, skip := range [...]int{2, 4, 8, 16, 32, 64} {
author	Joe Tsai <joetsai@digital-static.net>	2021-05-29 19:11:37 -0700
committer	Joe Tsai <joetsai@digital-static.net>	2021-08-25 19:29:15 +0000
commit	5baf60d47245c792c50a349cd6b8586d23204895 (patch)
tree	523c53fe8a2121f153cf98ca3e2599687f1f3de5 /src
parent	3d667671ad767d66bf792c5a8d623cb829f6366a (diff)
download	go-5baf60d47245c792c50a349cd6b8586d23204895.tar.gz go-5baf60d47245c792c50a349cd6b8586d23204895.zip