Add IndexFunc and LastIndexFunc.

Change TrimRight and TrimLeft to use these functions.
Incidentally fix minor bug in TrimRight.
Add some test cases for this.
YMMV whether it's worth saving the closure allocation.

R=r, r2
CC=golang-dev, hoisie, rsc
https://golang.org/cl/1198044
author: Roger Peppe <rogpeppe@gmail.com> 2010-06-14 14:54:48 -0700
committer: Rob Pike <r@golang.org> 2010-06-14 14:54:48 -0700
commit: 18274e007598436e62182c72c3103bf94a8feade (patch)
tree: 4b6d2e8dd877088e3501784ef9706ed37de14322
parent: b9055629c191deab9c4dffc0d0b5f8fb31687e1d (diff)
download: go-18274e007598436e62182c72c3103bf94a8feade.tar.gz
go-18274e007598436e62182c72c3103bf94a8feade.zip
2 files changed, 187 insertions, 36 deletions
diff --git a/src/pkg/strings/strings.go b/src/pkg/strings/strings.go
index b6d84d07aa..c192b1826e 100644
--- a/src/pkg/strings/strings.go
+++ b/src/pkg/strings/strings.go
@@ -328,49 +328,99 @@ func ToTitleSpecial(_case unicode.SpecialCase, s string) string {
 // TrimLeftFunc returns a slice of the string s with all leading
 // Unicode code points c satisfying f(c) removed.
 func TrimLeftFunc(s string, f func(r int) bool) string {
-	start, end := 0, len(s)
-	for start < end {
+	i := indexFunc(s, f, false)
+	if i == -1 {
+		return ""
+	}
+	return s[i:]
+}
+
+// TrimRightFunc returns a slice of the string s with all trailing
+// Unicode code points c satisfying f(c) removed.
+func TrimRightFunc(s string, f func(r int) bool) string {
+	i := lastIndexFunc(s, f, false)
+	if i >= 0 && s[i] >= utf8.RuneSelf {
+		_, wid := utf8.DecodeRuneInString(s[i:])
+		i += wid
+	} else {
+		i++
+	}
+	return s[0:i]
+}
+
+// TrimFunc returns a slice of the string s with all leading
+// and trailing Unicode code points c satisfying f(c) removed.
+func TrimFunc(s string, f func(r int) bool) string {
+	return TrimRightFunc(TrimLeftFunc(s, f), f)
+}
+
+// IndexFunc returns the index into s of the first Unicode
+// code point satisfying f(c), or -1 if none do.
+func IndexFunc(s string, f func(r int) bool) int {
+	return indexFunc(s, f, true)
+}
+
+// LastIndexFunc returns the index into s of the last
+// Unicode code point satisfying f(c), or -1 if none do.
+func LastIndexFunc(s string, f func(r int) bool) int {
+	return lastIndexFunc(s, f, true)
+}
+
+// indexFunc is the same as IndexFunc except that if
+// truth==false, the sense of the predicate function is
+// inverted. We could use IndexFunc directly, but this
+// way saves a closure allocation.
+func indexFunc(s string, f func(r int) bool, truth bool) int {
+	start := 0
+	for start < len(s) {
 		wid := 1
 		rune := int(s[start])
 		if rune >= utf8.RuneSelf {
-			rune, wid = utf8.DecodeRuneInString(s[start:end])
+			rune, wid = utf8.DecodeRuneInString(s[start:])
 		}
-		if !f(rune) {
-			return s[start:]
+		if f(rune) == truth {
+			return start
 		}
 		start += wid
 	}
-	return s[start:]
+	return -1
 }
 
-// TrimRightFunc returns a slice of the string s with all trailing
-// Unicode code points c satisfying f(c) removed.
-func TrimRightFunc(s string, f func(r int) bool) string {
-	start, end := 0, len(s)
-	for start < end {
-		wid := 1
-		rune := int(s[end-wid])
+// lastIndexFunc is the same as LastIndexFunc except that if
+// truth==false, the sense of the predicate function is
+// inverted. We could use LastIndexFunc directly, but this
+// way saves a closure allocation.
+func lastIndexFunc(s string, f func(r int) bool, truth bool) int {
+	end := len(s)
+	for end > 0 {
+		start := end - 1
+		rune := int(s[start])
 		if rune >= utf8.RuneSelf {
 			// Back up & look for beginning of rune. Mustn't pass start.
-			for wid = 2; start <= end-wid && !utf8.RuneStart(s[end-wid]); wid++ {
+			for start--; start >= 0; start-- {
+				if utf8.RuneStart(s[start]) {
+					break
+				}
 			}
-			if start > end-wid { // invalid UTF-8 sequence; stop processing
-				return s[start:end]
+			if start < 0 {
+				return -1
+			}
+			var wid int
+			rune, wid = utf8.DecodeRuneInString(s[start:end])
+
+			// If we've decoded fewer bytes than we expected,
+			// we've got some invalid UTF-8, so make sure we return
+			// the last possible index in s.
+			if start+wid < end && f(utf8.RuneError) == truth {
+				return end - 1
 			}
-			rune, wid = utf8.DecodeRuneInString(s[end-wid : end])
 		}
-		if !f(rune) {
-			return s[0:end]
+		if f(rune) == truth {
+			return start
 		}
-		end -= wid
+		end = start
 	}
-	return s[0:end]
-}
-
-// TrimFunc returns a slice of the string s with all leading
-// and trailing Unicode code points c satisfying f(c) removed.
-func TrimFunc(s string, f func(r int) bool) string {
-	return TrimRightFunc(TrimLeftFunc(s, f), f)
+	return -1
 }
 
 func makeCutsetFunc(cutset string) func(rune int) bool {
diff --git a/src/pkg/strings/strings_test.go b/src/pkg/strings/strings_test.go
index 2c99a6ec36..e4134d8d67 100644
--- a/src/pkg/strings/strings_test.go
+++ b/src/pkg/strings/strings_test.go
@@ -283,8 +283,14 @@ var trimSpaceTests = []StringTest{
 	StringTest{" \t\r\n x\t\t\r\r\n\n ", "x"},
 	StringTest{" \u2000\t\r\n x\t\t\r\r\ny\n \u3000", "x\t\t\r\r\ny"},
 	StringTest{"1 \t\r\n2", "1 \t\r\n2"},
-	StringTest{" x\x80", "x\x80"}, // invalid UTF-8 on end
-	StringTest{" x\xc0", "x\xc0"}, // invalid UTF-8 on end
+	StringTest{" x\x80", "x\x80"},
+	StringTest{" x\xc0", "x\xc0"},
+	StringTest{"x \xc0\xc0 ", "x \xc0\xc0"},
+	StringTest{"x \xc0", "x \xc0"},
+	StringTest{"x \xc0 ", "x \xc0"},
+	StringTest{"x \xc0\xc0 ", "x \xc0\xc0"},
+	StringTest{"x ☺\xc0\xc0 ", "x ☺\xc0\xc0"},
+	StringTest{"x ☺ ", "x ☺"},
 }
 
 func tenRunes(rune int) string {
@@ -407,8 +413,28 @@ var trimTests = []TrimTest{
 	TrimTest{TrimRight, "abba", "", "abba"},
 	TrimTest{TrimRight, "", "123", ""},
 	TrimTest{TrimRight, "", "", ""},
+	TrimTest{TrimRight, "☺\xc0", "☺", "☺\xc0"},
 }
 
+// naiveTrimRight implements a version of TrimRight
+// by scanning forwards from the start of s.
+func naiveTrimRight(s string, cutset string) string {
+	i := -1
+	for j, r := range s {
+		if IndexRune(cutset, r) == -1 {
+			i = j
+		}
+	}
+	if i >= 0 && s[i] >= utf8.RuneSelf {
+		_, wid := utf8.DecodeRuneInString(s[i:])
+		i += wid
+	} else {
+		i++
+	}
+	return s[0:i]
+}
+
+
 func TestTrim(t *testing.T) {
 	for _, tc := range trimTests {
 		actual := tc.f(tc.in, tc.cutset)
@@ -426,25 +452,100 @@ func TestTrim(t *testing.T) {
 		if actual != tc.out {
 			t.Errorf("%s(%q, %q) = %q; want %q", name, tc.in, tc.cutset, actual, tc.out)
 		}
+		// test equivalence of TrimRight to naive version
+		if tc.f == TrimRight {
+			naive := naiveTrimRight(tc.in, tc.cutset)
+			if naive != actual {
+				t.Errorf("TrimRight(%q, %q) = %q, want %q", tc.in, tc.cutset, actual, naive)
+			}
+		}
 	}
 }
 
+var isSpace = predicate{unicode.IsSpace, "IsSpace"}
+var isDigit = predicate{unicode.IsDigit, "IsDigit"}
+var isUpper = predicate{unicode.IsUpper, "IsUpper"}
+var isValidRune = predicate{
+	func(r int) bool {
+		return r != utf8.RuneError
+	},
+	"IsValidRune",
+}
+
+type predicate struct {
+	f    func(r int) bool
+	name string
+}
+
 type TrimFuncTest struct {
-	f             func(r int) bool
-	name, in, out string
+	f       predicate
+	in, out string
+}
+
+func not(p predicate) predicate {
+	return predicate{
+		func(r int) bool {
+			return !p.f(r)
+		},
+		"not " + p.name,
+	}
 }
 
 var trimFuncTests = []TrimFuncTest{
-	TrimFuncTest{unicode.IsSpace, "IsSpace", space + " hello " + space, "hello"},
-	TrimFuncTest{unicode.IsDigit, "IsDigit", "\u0e50\u0e5212hello34\u0e50\u0e51", "hello"},
-	TrimFuncTest{unicode.IsUpper, "IsUpper", "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", "hello"},
+	TrimFuncTest{isSpace, space + " hello " + space, "hello"},
+	TrimFuncTest{isDigit, "\u0e50\u0e5212hello34\u0e50\u0e51", "hello"},
+	TrimFuncTest{isUpper, "\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", "hello"},
+	TrimFuncTest{not(isSpace), "hello" + space + "hello", space},
+	TrimFuncTest{not(isDigit), "hello\u0e50\u0e521234\u0e50\u0e51helo", "\u0e50\u0e521234\u0e50\u0e51"},
+	TrimFuncTest{isValidRune, "ab\xc0a\xc0cd", "\xc0a\xc0"},
+	TrimFuncTest{not(isValidRune), "\xc0a\xc0", "a"},
 }
 
 func TestTrimFunc(t *testing.T) {
 	for _, tc := range trimFuncTests {
-		actual := TrimFunc(tc.in, tc.f)
+		actual := TrimFunc(tc.in, tc.f.f)
 		if actual != tc.out {
-			t.Errorf("TrimFunc(%q, %q) = %q; want %q", tc.in, tc.name, actual, tc.out)
+			t.Errorf("TrimFunc(%q, %q) = %q; want %q", tc.in, tc.f.name, actual, tc.out)
+		}
+	}
+}
+
+type IndexFuncTest struct {
+	in          string
+	f           predicate
+	first, last int
+}
+
+var indexFuncTests = []IndexFuncTest{
+	IndexFuncTest{"", isValidRune, -1, -1},
+	IndexFuncTest{"abc", isDigit, -1, -1},
+	IndexFuncTest{"0123", isDigit, 0, 3},
+	IndexFuncTest{"a1b", isDigit, 1, 1},
+	IndexFuncTest{space, isSpace, 0, len(space) - 3}, // last rune in space is 3 bytes
+	IndexFuncTest{"\u0e50\u0e5212hello34\u0e50\u0e51", isDigit, 0, 18},
+	IndexFuncTest{"\u2C6F\u2C6F\u2C6F\u2C6FABCDhelloEF\u2C6F\u2C6FGH\u2C6F\u2C6F", isUpper, 0, 34},
+	IndexFuncTest{"12\u0e50\u0e52hello34\u0e50\u0e51", not(isDigit), 8, 12},
+
+	// broken unicode tests
+	IndexFuncTest{"\x801", isDigit, 1, 1},
+	IndexFuncTest{"\x80abc", isDigit, -1, -1},
+	IndexFuncTest{"\xc0a\xc0", isValidRune, 1, 1},
+	IndexFuncTest{"\xc0a\xc0", not(isValidRune), 0, 2},
+	IndexFuncTest{"\xc0☺\xc0", not(isValidRune), 0, 4},
+	IndexFuncTest{"\xc0☺\xc0\xc0", not(isValidRune), 0, 5},
+	IndexFuncTest{"ab\xc0a\xc0cd", not(isValidRune), 2, 4},
+	IndexFuncTest{"a\xe0\x80cd", not(isValidRune), 1, 2},
+}
+
+func TestIndexFunc(t *testing.T) {
+	for _, tc := range indexFuncTests {
+		first := IndexFunc(tc.in, tc.f.f)
+		if first != tc.first {
+			t.Errorf("IndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, first, tc.first)
+		}
+		last := LastIndexFunc(tc.in, tc.f.f)
+		if last != tc.last {
+			t.Errorf("LastIndexFunc(%q, %s) = %d; want %d", tc.in, tc.f.name, last, tc.last)
 		}
 	}
 }
author	Roger Peppe <rogpeppe@gmail.com>	2010-06-14 14:54:48 -0700
committer	Rob Pike <r@golang.org>	2010-06-14 14:54:48 -0700
commit	18274e007598436e62182c72c3103bf94a8feade (patch)
tree	4b6d2e8dd877088e3501784ef9706ed37de14322
parent	b9055629c191deab9c4dffc0d0b5f8fb31687e1d (diff)
download	go-18274e007598436e62182c72c3103bf94a8feade.tar.gz go-18274e007598436e62182c72c3103bf94a8feade.zip