aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2011-06-16 17:56:25 -0400
committerRuss Cox <rsc@golang.org>2011-06-16 17:56:25 -0400
commitfc77e826175c56914a6ecc9a1abb9cfa1a00aa8d (patch)
tree7ec777ffc67906cf88d2287eed527d7c9f3bf738
parent6e9b1a78ff1a394a60a815b23564ad58a0cf0ce1 (diff)
downloadgo-fc77e826175c56914a6ecc9a1abb9cfa1a00aa8d.tar.gz
go-fc77e826175c56914a6ecc9a1abb9cfa1a00aa8d.zip
unicode: add case folding tables
R=r, r CC=golang-dev https://golang.org/cl/4571074
-rw-r--r--src/pkg/unicode/letter.go49
-rw-r--r--src/pkg/unicode/letter_test.go46
-rw-r--r--src/pkg/unicode/maketables.go301
-rw-r--r--src/pkg/unicode/tables.go634
4 files changed, 921 insertions, 109 deletions
diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go
index a0c55bbf78..dbd8638eaa 100644
--- a/src/pkg/unicode/letter.go
+++ b/src/pkg/unicode/letter.go
@@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int {
}
return r
}
+
+// caseOrbit is defined in tables.go as []foldPair. Right now all the
+// entries fit in uint16, so use uint16. If that changes, compilation
+// will fail (the constants in the composite literal will not fit in uint16)
+// and the types here can change to uint32.
+type foldPair struct {
+ From uint16
+ To uint16
+}
+
+// SimpleFold iterates over Unicode code points equivalent under
+// the Unicode-defined simple case folding. Among the code points
+// equivalent to rune (including rune itself), SimpleFold returns the
+// smallest r >= rune if one exists, or else the smallest r >= 0.
+//
+// For example:
+// SimpleFold('A') = 'a'
+// SimpleFold('a') = 'A'
+//
+// SimpleFold('K') = 'k'
+// SimpleFold('k') = '\u212A' (Kelvin symbol, K)
+// SimpleFold('\u212A') = 'K'
+//
+// SimpleFold('1') = '1'
+//
+func SimpleFold(rune int) int {
+ // Consult caseOrbit table for special cases.
+ lo := 0
+ hi := len(caseOrbit)
+ for lo < hi {
+ m := lo + (hi-lo)/2
+ if int(caseOrbit[m].From) < rune {
+ lo = m + 1
+ } else {
+ hi = m
+ }
+ }
+ if lo < len(caseOrbit) && int(caseOrbit[lo].From) == rune {
+ return int(caseOrbit[lo].To)
+ }
+
+ // No folding specified. This is a one- or two-element
+ // equivalence class containing rune and ToLower(rune)
+ // and ToUpper(rune) if they are different from rune.
+ if l := ToLower(rune); l != rune {
+ return l
+ }
+ return ToUpper(rune)
+}
diff --git a/src/pkg/unicode/letter_test.go b/src/pkg/unicode/letter_test.go
index 4c24ffc516..c4e26df580 100644
--- a/src/pkg/unicode/letter_test.go
+++ b/src/pkg/unicode/letter_test.go
@@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) {
}
}
}
+
+var simpleFoldTests = []string{
+ // SimpleFold could order its returned slices in any order it wants,
+ // but we know it orders them in increasing order starting at in
+ // and looping around from MaxRune to 0.
+
+ // Easy cases.
+ "Aa",
+ "aA",
+ "δΔ",
+ "Δδ",
+
+ // ASCII special cases.
+ "KkK",
+ "kKK",
+ "KKk",
+ "Ssſ",
+ "sſS",
+ "ſSs",
+
+ // Non-ASCII special cases.
+ "ρϱΡ",
+ "ϱΡρ",
+ "Ρρϱ",
+ "ͅΙιι",
+ "Ιιιͅ",
+ "ιιͅΙ",
+ "ιͅΙι",
+
+ // Extra special cases: has lower/upper but no case fold.
+ "İ",
+ "ı",
+}
+
+func TestSimpleFold(t *testing.T) {
+ for _, tt := range simpleFoldTests {
+ cycle := []int(tt)
+ rune := cycle[len(cycle)-1]
+ for _, out := range cycle {
+ if r := SimpleFold(rune); r != out {
+ t.Errorf("SimpleFold(%#U) = %#U, want %#U", rune, r, out)
+ }
+ rune = out
+ }
+ }
+}
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index 39c7121a67..421d294552 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -24,15 +24,18 @@ import (
func main() {
flag.Parse()
loadChars() // always needed
+ loadCasefold()
printCategories()
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
printLatinProperties()
+ printCasefold()
printSizes()
}
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
+var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url",
"http://www.unicode.org/Public/6.0.0/ucd/",
"URL of Unicode database directory")
@@ -119,6 +122,8 @@ type Char struct {
upperCase int
lowerCase int
titleCase int
+ foldCase int // simple case folding
+ caseOrbit int // next in simple case folding orbit
}
// Scripts.txt has form:
@@ -308,8 +313,53 @@ func loadChars() {
resp.Body.Close()
}
+func loadCasefold() {
+ if *casefoldingURL == "" {
+ flag.Set("casefolding", *url+"CaseFolding.txt")
+ }
+ resp, err := http.Get(*casefoldingURL)
+ if err != nil {
+ logger.Fatal(err)
+ }
+ if resp.StatusCode != 200 {
+ logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
+ }
+ input := bufio.NewReader(resp.Body)
+ for {
+ line, err := input.ReadString('\n')
+ if err != nil {
+ if err == os.EOF {
+ break
+ }
+ logger.Fatal(err)
+ }
+ if line[0] == '#' {
+ continue
+ }
+ field := strings.Split(line, "; ", -1)
+ if len(field) != 4 {
+ logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
+ }
+ kind := field[1]
+ if kind != "C" && kind != "S" {
+ // Only care about 'common' and 'simple' foldings.
+ continue
+ }
+ p1, err := strconv.Btoui64(field[0], 16)
+ if err != nil {
+ logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+ }
+ p2, err := strconv.Btoui64(field[2], 16)
+ if err != nil {
+ logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+ }
+ chars[p1].foldCase = int(p2)
+ }
+ resp.Body.Close()
+}
+
const progHeader = `// Generated by running
-// maketables --tables=%s --data=%s
+// maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT
package unicode
@@ -330,7 +380,7 @@ func printCategories() {
fullCategoryTest(list)
return
}
- fmt.Printf(progHeader, *tablelist, *dataURL)
+ fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
@@ -837,13 +887,13 @@ func printCases() {
}
fmt.Printf(
"// Generated by running\n"+
- "// maketables --data=%s\n"+
+ "// maketables --data=%s --casefolding=%s\n"+
"// DO NOT EDIT\n\n"+
"// CaseRanges is the table describing case mappings for all letters with\n"+
"// non-self mappings.\n"+
"var CaseRanges = _CaseRanges\n"+
"var _CaseRanges = []CaseRange {\n",
- *dataURL)
+ *dataURL, *casefoldingURL)
var startState *caseState // the start of a run; nil for not active
var prevState = &caseState{} // the state of the previous character
@@ -946,13 +996,246 @@ func printLatinProperties() {
if code == ' ' {
property = "pZ | pp"
}
- fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
+ fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
+ }
+ fmt.Printf("}\n\n")
+}
+
+func printCasefold() {
+ // Build list of case-folding groups attached to each canonical folded char (typically lower case).
+ var caseOrbit = make([][]int, MaxChar+1)
+ for i := range chars {
+ c := &chars[i]
+ if c.foldCase == 0 {
+ continue
+ }
+ orb := caseOrbit[c.foldCase]
+ if orb == nil {
+ orb = append(orb, c.foldCase)
+ }
+ caseOrbit[c.foldCase] = append(orb, i)
+ }
+
+ // Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
+ for i := range chars {
+ c := &chars[i]
+ f := c.foldCase
+ if f == 0 {
+ f = i
+ }
+ orb := caseOrbit[f]
+ if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
+ // Default assumption of [upper, lower] is wrong.
+ caseOrbit[i] = []int{i}
+ }
+ }
+
+ // Delete the groups for which assuming [lower, upper] is right.
+ for i, orb := range caseOrbit {
+ if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
+ caseOrbit[i] = nil
+ }
+ }
+
+ // Record orbit information in chars.
+ for _, orb := range caseOrbit {
+ if orb == nil {
+ continue
+ }
+ sort.SortInts(orb)
+ c := orb[len(orb)-1]
+ for _, d := range orb {
+ chars[c].caseOrbit = d
+ c = d
+ }
+ }
+
+ printCaseOrbit()
+
+ // Tables of category and script folding exceptions: code points
+ // that must be added when interpreting a particular category/script
+ // in a case-folding context.
+ cat := make(map[string]map[int]bool)
+ for name := range category {
+ if x := foldExceptions(inCategory(name)); len(x) > 0 {
+ cat[name] = x
+ }
+ }
+
+ scr := make(map[string]map[int]bool)
+ for name := range scripts {
+ if x := foldExceptions(inScript(name)); len(x) > 0 {
+ cat[name] = x
+ }
+ }
+
+ printCatFold("FoldCategory", cat)
+ printCatFold("FoldScript", scr)
+}
+
+// inCategory returns a list of all the runes in the category.
+func inCategory(name string) []int {
+ var x []int
+ for i := range chars {
+ c := &chars[i]
+ if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
+ x = append(x, i)
+ }
}
- fmt.Println("}")
+ return x
}
-var range16Count = 0 // Number of entries in the 16-bit range tables.
-var range32Count = 0 // Number of entries in the 32-bit range tables.
+// inScript returns a list of all the runes in the script.
+func inScript(name string) []int {
+ var x []int
+ for _, s := range scripts[name] {
+ for c := s.lo; c <= s.hi; c++ {
+ x = append(x, int(c))
+ }
+ }
+ return x
+}
+
+// foldExceptions returns a list of all the runes fold-equivalent
+// to runes in class but not in class themselves.
+func foldExceptions(class []int) map[int]bool {
+ // Create map containing class and all fold-equivalent chars.
+ m := make(map[int]bool)
+ for _, r := range class {
+ c := &chars[r]
+ if c.caseOrbit == 0 {
+ // Just upper and lower.
+ if u := c.upperCase; u != 0 {
+ m[u] = true
+ }
+ if l := c.lowerCase; l != 0 {
+ m[l] = true
+ }
+ m[r] = true
+ continue
+ }
+ // Otherwise walk orbit.
+ r0 := r
+ for {
+ m[r] = true
+ r = chars[r].caseOrbit
+ if r == r0 {
+ break
+ }
+ }
+ }
+
+ // Remove class itself.
+ for _, r := range class {
+ m[r] = false, false
+ }
+
+ // What's left is the exceptions.
+ return m
+}
+
+var comment = map[string]string{
+ "FoldCategory": "// FoldCategory maps a category name to a table of\n" +
+ "// code points outside the category that are equivalent under\n" +
+ "// simple case folding to code points inside the category.\n" +
+ "// If there is no entry for a category name, there are no such points.\n",
+
+ "FoldScript": "// FoldScript maps a script name to a table of\n" +
+ "// code points outside the script that are equivalent under\n" +
+ "// simple case folding to code points inside the script.\n" +
+ "// If there is no entry for a script name, there are no such points.\n",
+}
+
+func printCaseOrbit() {
+ if *test {
+ for i := range chars {
+ c := &chars[i]
+ f := c.caseOrbit
+ if f == 0 {
+ if c.lowerCase != i && c.lowerCase != 0 {
+ f = c.lowerCase
+ } else if c.upperCase != i && c.upperCase != 0 {
+ f = c.upperCase
+ } else {
+ f = i
+ }
+ }
+ if g := unicode.SimpleFold(i); g != f {
+ fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
+ }
+ }
+ return
+ }
+
+ fmt.Printf("var caseOrbit = []foldPair{\n")
+ for i := range chars {
+ c := &chars[i]
+ if c.caseOrbit != 0 {
+ fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
+ foldPairCount++
+ }
+ }
+ fmt.Printf("}\n\n")
+}
+
+func printCatFold(name string, m map[string]map[int]bool) {
+ if *test {
+ var pkgMap map[string]*unicode.RangeTable
+ if name == "FoldCategory" {
+ pkgMap = unicode.FoldCategory
+ } else {
+ pkgMap = unicode.FoldScript
+ }
+ if len(pkgMap) != len(m) {
+ fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
+ return
+ }
+ for k, v := range m {
+ t, ok := pkgMap[k]
+ if !ok {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
+ continue
+ }
+ n := 0
+ for _, r := range t.R16 {
+ for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+ if !v[c] {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+ }
+ n++
+ }
+ }
+ for _, r := range t.R32 {
+ for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+ if !v[c] {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+ }
+ n++
+ }
+ }
+ if n != len(v) {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
+ }
+ }
+ return
+ }
+
+ fmt.Print(comment[name])
+ fmt.Printf("var %s = map[string]*RangeTable{\n", name)
+ for name := range m {
+ fmt.Printf("\t%q: fold%s,\n", name, name)
+ }
+ fmt.Printf("}\n\n")
+ for name, class := range m {
+ dumpRange(
+ fmt.Sprintf("var fold%s = &RangeTable{\n", name),
+ func(code int) bool { return class[code] })
+ }
+}
+
+var range16Count = 0 // Number of entries in the 16-bit range tables.
+var range32Count = 0 // Number of entries in the 32-bit range tables.
+var foldPairCount = 0 // Number of fold pairs in the exception tables.
func printSizes() {
if *test {
@@ -963,4 +1246,6 @@ func printSizes() {
range16Bytes := range16Count * 3 * 2
range32Bytes := range32Count * 3 * 4
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+ fmt.Println()
+ fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
}
diff --git a/src/pkg/unicode/tables.go b/src/pkg/unicode/tables.go
index 32681a8c01..a75011adb9 100644
--- a/src/pkg/unicode/tables.go
+++ b/src/pkg/unicode/tables.go
@@ -1,5 +1,5 @@
// Generated by running
-// maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt
+// maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt
// DO NOT EDIT
package unicode
@@ -5150,7 +5150,7 @@ var (
)
// Generated by running
-// maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt
+// maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt
// DO NOT EDIT
// CaseRanges is the table describing case mappings for all letters with
@@ -5539,7 +5539,7 @@ var properties = [MaxLatin1 + 1]uint8{
0x7C: pS | pp, // '|'
0x7D: pP | pp, // '}'
0x7E: pS | pp, // '~'
- 0x7F: pC, // '\x7f'
+ 0x7F: pC, // '\u007f'
0x80: pC, // '\u0080'
0x81: pC, // '\u0081'
0x82: pC, // '\u0082'
@@ -5573,102 +5573,534 @@ var properties = [MaxLatin1 + 1]uint8{
0x9E: pC, // '\u009e'
0x9F: pC, // '\u009f'
0xA0: pZ, // '\u00a0'
- 0xA1: pP | pp, // '\u00a1'
- 0xA2: pS | pp, // '\u00a2'
- 0xA3: pS | pp, // '\u00a3'
- 0xA4: pS | pp, // '\u00a4'
- 0xA5: pS | pp, // '\u00a5'
- 0xA6: pS | pp, // '\u00a6'
- 0xA7: pS | pp, // '\u00a7'
- 0xA8: pS | pp, // '\u00a8'
- 0xA9: pS | pp, // '\u00a9'
- 0xAA: pLl | pp, // '\u00aa'
- 0xAB: pP | pp, // '\u00ab'
- 0xAC: pS | pp, // '\u00ac'
+ 0xA1: pP | pp, // '¡'
+ 0xA2: pS | pp, // '¢'
+ 0xA3: pS | pp, // '£'
+ 0xA4: pS | pp, // '¤'
+ 0xA5: pS | pp, // '¥'
+ 0xA6: pS | pp, // '¦'
+ 0xA7: pS | pp, // '§'
+ 0xA8: pS | pp, // '¨'
+ 0xA9: pS | pp, // '©'
+ 0xAA: pLl | pp, // 'ª'
+ 0xAB: pP | pp, // '«'
+ 0xAC: pS | pp, // '¬'
0xAD: 0, // '\u00ad'
- 0xAE: pS | pp, // '\u00ae'
- 0xAF: pS | pp, // '\u00af'
- 0xB0: pS | pp, // '\u00b0'
- 0xB1: pS | pp, // '\u00b1'
- 0xB2: pN | pp, // '\u00b2'
- 0xB3: pN | pp, // '\u00b3'
- 0xB4: pS | pp, // '\u00b4'
- 0xB5: pLl | pp, // '\u00b5'
- 0xB6: pS | pp, // '\u00b6'
- 0xB7: pP | pp, // '\u00b7'
- 0xB8: pS | pp, // '\u00b8'
- 0xB9: pN | pp, // '\u00b9'
- 0xBA: pLl | pp, // '\u00ba'
- 0xBB: pP | pp, // '\u00bb'
- 0xBC: pN | pp, // '\u00bc'
- 0xBD: pN | pp, // '\u00bd'
- 0xBE: pN | pp, // '\u00be'
- 0xBF: pP | pp, // '\u00bf'
- 0xC0: pLu | pp, // '\u00c0'
- 0xC1: pLu | pp, // '\u00c1'
- 0xC2: pLu | pp, // '\u00c2'
- 0xC3: pLu | pp, // '\u00c3'
- 0xC4: pLu | pp, // '\u00c4'
- 0xC5: pLu | pp, // '\u00c5'
- 0xC6: pLu | pp, // '\u00c6'
- 0xC7: pLu | pp, // '\u00c7'
- 0xC8: pLu | pp, // '\u00c8'
- 0xC9: pLu | pp, // '\u00c9'
- 0xCA: pLu | pp, // '\u00ca'
- 0xCB: pLu | pp, // '\u00cb'
- 0xCC: pLu | pp, // '\u00cc'
- 0xCD: pLu | pp, // '\u00cd'
- 0xCE: pLu | pp, // '\u00ce'
- 0xCF: pLu | pp, // '\u00cf'
- 0xD0: pLu | pp, // '\u00d0'
- 0xD1: pLu | pp, // '\u00d1'
- 0xD2: pLu | pp, // '\u00d2'
- 0xD3: pLu | pp, // '\u00d3'
- 0xD4: pLu | pp, // '\u00d4'
- 0xD5: pLu | pp, // '\u00d5'
- 0xD6: pLu | pp, // '\u00d6'
- 0xD7: pS | pp, // '\u00d7'
- 0xD8: pLu | pp, // '\u00d8'
- 0xD9: pLu | pp, // '\u00d9'
- 0xDA: pLu | pp, // '\u00da'
- 0xDB: pLu | pp, // '\u00db'
- 0xDC: pLu | pp, // '\u00dc'
- 0xDD: pLu | pp, // '\u00dd'
- 0xDE: pLu | pp, // '\u00de'
- 0xDF: pLl | pp, // '\u00df'
- 0xE0: pLl | pp, // '\u00e0'
- 0xE1: pLl | pp, // '\u00e1'
- 0xE2: pLl | pp, // '\u00e2'
- 0xE3: pLl | pp, // '\u00e3'
- 0xE4: pLl | pp, // '\u00e4'
- 0xE5: pLl | pp, // '\u00e5'
- 0xE6: pLl | pp, // '\u00e6'
- 0xE7: pLl | pp, // '\u00e7'
- 0xE8: pLl | pp, // '\u00e8'
- 0xE9: pLl | pp, // '\u00e9'
- 0xEA: pLl | pp, // '\u00ea'
- 0xEB: pLl | pp, // '\u00eb'
- 0xEC: pLl | pp, // '\u00ec'
- 0xED: pLl | pp, // '\u00ed'
- 0xEE: pLl | pp, // '\u00ee'
- 0xEF: pLl | pp, // '\u00ef'
- 0xF0: pLl | pp, // '\u00f0'
- 0xF1: pLl | pp, // '\u00f1'
- 0xF2: pLl | pp, // '\u00f2'
- 0xF3: pLl | pp, // '\u00f3'
- 0xF4: pLl | pp, // '\u00f4'
- 0xF5: pLl | pp, // '\u00f5'
- 0xF6: pLl | pp, // '\u00f6'
- 0xF7: pS | pp, // '\u00f7'
- 0xF8: pLl | pp, // '\u00f8'
- 0xF9: pLl | pp, // '\u00f9'
- 0xFA: pLl | pp, // '\u00fa'
- 0xFB: pLl | pp, // '\u00fb'
- 0xFC: pLl | pp, // '\u00fc'
- 0xFD: pLl | pp, // '\u00fd'
- 0xFE: pLl | pp, // '\u00fe'
- 0xFF: pLl | pp, // '\u00ff'
-}
-
-// Range entries: 3190 16-bit, 657 32-bit, 3847 total.
-// Range bytes: 19140 16-bit, 7884 32-bit, 27024 total.
+ 0xAE: pS | pp, // '®'
+ 0xAF: pS | pp, // '¯'
+ 0xB0: pS | pp, // '°'
+ 0xB1: pS | pp, // '±'
+ 0xB2: pN | pp, // '²'
+ 0xB3: pN | pp, // '³'
+ 0xB4: pS | pp, // '´'
+ 0xB5: pLl | pp, // 'µ'
+ 0xB6: pS | pp, // '¶'
+ 0xB7: pP | pp, // '·'
+ 0xB8: pS | pp, // '¸'
+ 0xB9: pN | pp, // '¹'
+ 0xBA: pLl | pp, // 'º'
+ 0xBB: pP | pp, // '»'
+ 0xBC: pN | pp, // '¼'
+ 0xBD: pN | pp, // '½'
+ 0xBE: pN | pp, // '¾'
+ 0xBF: pP | pp, // '¿'
+ 0xC0: pLu | pp, // 'À'
+ 0xC1: pLu | pp, // 'Á'
+ 0xC2: pLu | pp, // 'Â'
+ 0xC3: pLu | pp, // 'Ã'
+ 0xC4: pLu | pp, // 'Ä'
+ 0xC5: pLu | pp, // 'Å'
+ 0xC6: pLu | pp, // 'Æ'
+ 0xC7: pLu | pp, // 'Ç'
+ 0xC8: pLu | pp, // 'È'
+ 0xC9: pLu | pp, // 'É'
+ 0xCA: pLu | pp, // 'Ê'
+ 0xCB: pLu | pp, // 'Ë'
+ 0xCC: pLu | pp, // 'Ì'
+ 0xCD: pLu | pp, // 'Í'
+ 0xCE: pLu | pp, // 'Î'
+ 0xCF: pLu | pp, // 'Ï'
+ 0xD0: pLu | pp, // 'Ð'
+ 0xD1: pLu | pp, // 'Ñ'
+ 0xD2: pLu | pp, // 'Ò'
+ 0xD3: pLu | pp, // 'Ó'
+ 0xD4: pLu | pp, // 'Ô'
+ 0xD5: pLu | pp, // 'Õ'
+ 0xD6: pLu | pp, // 'Ö'
+ 0xD7: pS | pp, // '×'
+ 0xD8: pLu | pp, // 'Ø'
+ 0xD9: pLu | pp, // 'Ù'
+ 0xDA: pLu | pp, // 'Ú'
+ 0xDB: pLu | pp, // 'Û'
+ 0xDC: pLu | pp, // 'Ü'
+ 0xDD: pLu | pp, // 'Ý'
+ 0xDE: pLu | pp, // 'Þ'
+ 0xDF: pLl | pp, // 'ß'
+ 0xE0: pLl | pp, // 'à'
+ 0xE1: pLl | pp, // 'á'
+ 0xE2: pLl | pp, // 'â'
+ 0xE3: pLl | pp, // 'ã'
+ 0xE4: pLl | pp, // 'ä'
+ 0xE5: pLl | pp, // 'å'
+ 0xE6: pLl | pp, // 'æ'
+ 0xE7: pLl | pp, // 'ç'
+ 0xE8: pLl | pp, // 'è'
+ 0xE9: pLl | pp, // 'é'
+ 0xEA: pLl | pp, // 'ê'
+ 0xEB: pLl | pp, // 'ë'
+ 0xEC: pLl | pp, // 'ì'
+ 0xED: pLl | pp, // 'í'
+ 0xEE: pLl | pp, // 'î'
+ 0xEF: pLl | pp, // 'ï'
+ 0xF0: pLl | pp, // 'ð'
+ 0xF1: pLl | pp, // 'ñ'
+ 0xF2: pLl | pp, // 'ò'
+ 0xF3: pLl | pp, // 'ó'
+ 0xF4: pLl | pp, // 'ô'
+ 0xF5: pLl | pp, // 'õ'
+ 0xF6: pLl | pp, // 'ö'
+ 0xF7: pS | pp, // '÷'
+ 0xF8: pLl | pp, // 'ø'
+ 0xF9: pLl | pp, // 'ù'
+ 0xFA: pLl | pp, // 'ú'
+ 0xFB: pLl | pp, // 'û'
+ 0xFC: pLl | pp, // 'ü'
+ 0xFD: pLl | pp, // 'ý'
+ 0xFE: pLl | pp, // 'þ'
+ 0xFF: pLl | pp, // 'ÿ'
+}
+
+var caseOrbit = []foldPair{
+ {0x004B, 0x006B},
+ {0x0053, 0x0073},
+ {0x006B, 0x212A},
+ {0x0073, 0x017F},
+ {0x00B5, 0x039C},
+ {0x00C5, 0x00E5},
+ {0x00DF, 0x1E9E},
+ {0x00E5, 0x212B},
+ {0x0130, 0x0130},
+ {0x0131, 0x0131},
+ {0x017F, 0x0053},
+ {0x01C4, 0x01C5},
+ {0x01C5, 0x01C6},
+ {0x01C6, 0x01C4},
+ {0x01C7, 0x01C8},
+ {0x01C8, 0x01C9},
+ {0x01C9, 0x01C7},
+ {0x01CA, 0x01CB},
+ {0x01CB, 0x01CC},
+ {0x01CC, 0x01CA},
+ {0x01F1, 0x01F2},
+ {0x01F2, 0x01F3},
+ {0x01F3, 0x01F1},
+ {0x0345, 0x0399},
+ {0x0392, 0x03B2},
+ {0x0395, 0x03B5},
+ {0x0398, 0x03B8},
+ {0x0399, 0x03B9},
+ {0x039A, 0x03BA},
+ {0x039C, 0x03BC},
+ {0x03A0, 0x03C0},
+ {0x03A1, 0x03C1},
+ {0x03A3, 0x03C2},
+ {0x03A6, 0x03C6},
+ {0x03A9, 0x03C9},
+ {0x03B2, 0x03D0},
+ {0x03B5, 0x03F5},
+ {0x03B8, 0x03D1},
+ {0x03B9, 0x1FBE},
+ {0x03BA, 0x03F0},
+ {0x03BC, 0x00B5},
+ {0x03C0, 0x03D6},
+ {0x03C1, 0x03F1},
+ {0x03C2, 0x03C3},
+ {0x03C3, 0x03A3},
+ {0x03C6, 0x03D5},
+ {0x03C9, 0x2126},
+ {0x03D0, 0x0392},
+ {0x03D1, 0x03F4},
+ {0x03D5, 0x03A6},
+ {0x03D6, 0x03A0},
+ {0x03F0, 0x039A},
+ {0x03F1, 0x03A1},
+ {0x03F4, 0x0398},
+ {0x03F5, 0x0395},
+ {0x1E60, 0x1E61},
+ {0x1E61, 0x1E9B},
+ {0x1E9B, 0x1E60},
+ {0x1E9E, 0x00DF},
+ {0x1FBE, 0x0345},
+ {0x2126, 0x03A9},
+ {0x212A, 0x004B},
+ {0x212B, 0x00C5},
+ {0x2160, 0x2170},
+ {0x2161, 0x2171},
+ {0x2162, 0x2172},
+ {0x2163, 0x2173},
+ {0x2164, 0x2174},
+ {0x2165, 0x2175},
+ {0x2166, 0x2176},
+ {0x2167, 0x2177},
+ {0x2168, 0x2178},
+ {0x2169, 0x2179},
+ {0x216A, 0x217A},
+ {0x216B, 0x217B},
+ {0x216C, 0x217C},
+ {0x216D, 0x217D},
+ {0x216E, 0x217E},
+ {0x216F, 0x217F},
+ {0x2170, 0x2160},
+ {0x2171, 0x2161},
+ {0x2172, 0x2162},
+ {0x2173, 0x2163},
+ {0x2174, 0x2164},
+ {0x2175, 0x2165},
+ {0x2176, 0x2166},
+ {0x2177, 0x2167},
+ {0x2178, 0x2168},
+ {0x2179, 0x2169},
+ {0x217A, 0x216A},
+ {0x217B, 0x216B},
+ {0x217C, 0x216C},
+ {0x217D, 0x216D},
+ {0x217E, 0x216E},
+ {0x217F, 0x216F},
+ {0x24B6, 0x24D0},
+ {0x24B7, 0x24D1},
+ {0x24B8, 0x24D2},
+ {0x24B9, 0x24D3},
+ {0x24BA, 0x24D4},
+ {0x24BB, 0x24D5},
+ {0x24BC, 0x24D6},
+ {0x24BD, 0x24D7},
+ {0x24BE, 0x24D8},
+ {0x24BF, 0x24D9},
+ {0x24C0, 0x24DA},
+ {0x24C1, 0x24DB},
+ {0x24C2, 0x24DC},
+ {0x24C3, 0x24DD},
+ {0x24C4, 0x24DE},
+ {0x24C5, 0x24DF},
+ {0x24C6, 0x24E0},
+ {0x24C7, 0x24E1},
+ {0x24C8, 0x24E2},
+ {0x24C9, 0x24E3},
+ {0x24CA, 0x24E4},
+ {0x24CB, 0x24E5},
+ {0x24CC, 0x24E6},
+ {0x24CD, 0x24E7},
+ {0x24CE, 0x24E8},
+ {0x24CF, 0x24E9},
+ {0x24D0, 0x24B6},
+ {0x24D1, 0x24B7},
+ {0x24D2, 0x24B8},
+ {0x24D3, 0x24B9},
+ {0x24D4, 0x24BA},
+ {0x24D5, 0x24BB},
+ {0x24D6, 0x24BC},
+ {0x24D7, 0x24BD},
+ {0x24D8, 0x24BE},
+ {0x24D9, 0x24BF},
+ {0x24DA, 0x24C0},
+ {0x24DB, 0x24C1},
+ {0x24DC, 0x24C2},
+ {0x24DD, 0x24C3},
+ {0x24DE, 0x24C4},
+ {0x24DF, 0x24C5},
+ {0x24E0, 0x24C6},
+ {0x24E1, 0x24C7},
+ {0x24E2, 0x24C8},
+ {0x24E3, 0x24C9},
+ {0x24E4, 0x24CA},
+ {0x24E5, 0x24CB},
+ {0x24E6, 0x24CC},
+ {0x24E7, 0x24CD},
+ {0x24E8, 0x24CE},
+ {0x24E9, 0x24CF},
+}
+
+// FoldCategory maps a category name to a table of
+// code points outside the category that are equivalent under
+// simple case folding to code points inside the category.
+// If there is no entry for a category name, there are no such points.
+var FoldCategory = map[string]*RangeTable{
+ "Ll": foldLl,
+ "Inherited": foldInherited,
+ "M": foldM,
+ "L": foldL,
+ "Mn": foldMn,
+ "Common": foldCommon,
+ "Greek": foldGreek,
+ "Lu": foldLu,
+ "Lt": foldLt,
+}
+
+var foldLl = &RangeTable{
+ R16: []Range16{
+ {0x0041, 0x005a, 1},
+ {0x00c0, 0x00d6, 1},
+ {0x00d8, 0x00de, 1},
+ {0x0100, 0x012e, 2},
+ {0x0132, 0x0136, 2},
+ {0x0139, 0x0147, 2},
+ {0x014a, 0x0178, 2},
+ {0x0179, 0x017d, 2},
+ {0x0181, 0x0182, 1},
+ {0x0184, 0x0186, 2},
+ {0x0187, 0x0189, 2},
+ {0x018a, 0x018b, 1},
+ {0x018e, 0x0191, 1},
+ {0x0193, 0x0194, 1},
+ {0x0196, 0x0198, 1},
+ {0x019c, 0x019d, 1},
+ {0x019f, 0x01a0, 1},
+ {0x01a2, 0x01a6, 2},
+ {0x01a7, 0x01a9, 2},
+ {0x01ac, 0x01ae, 2},
+ {0x01af, 0x01b1, 2},
+ {0x01b2, 0x01b3, 1},
+ {0x01b5, 0x01b7, 2},
+ {0x01b8, 0x01bc, 4},
+ {0x01c4, 0x01c5, 1},
+ {0x01c7, 0x01c8, 1},
+ {0x01ca, 0x01cb, 1},
+ {0x01cd, 0x01db, 2},
+ {0x01de, 0x01ee, 2},
+ {0x01f1, 0x01f2, 1},
+ {0x01f4, 0x01f6, 2},
+ {0x01f7, 0x01f8, 1},
+ {0x01fa, 0x0232, 2},
+ {0x023a, 0x023b, 1},
+ {0x023d, 0x023e, 1},
+ {0x0241, 0x0243, 2},
+ {0x0244, 0x0246, 1},
+ {0x0248, 0x024e, 2},
+ {0x0345, 0x0370, 43},
+ {0x0372, 0x0376, 4},
+ {0x0386, 0x0388, 2},
+ {0x0389, 0x038a, 1},
+ {0x038c, 0x038e, 2},
+ {0x038f, 0x0391, 2},
+ {0x0392, 0x03a1, 1},
+ {0x03a3, 0x03ab, 1},
+ {0x03cf, 0x03d8, 9},
+ {0x03da, 0x03ee, 2},
+ {0x03f4, 0x03f7, 3},
+ {0x03f9, 0x03fa, 1},
+ {0x03fd, 0x042f, 1},
+ {0x0460, 0x0480, 2},
+ {0x048a, 0x04c0, 2},
+ {0x04c1, 0x04cd, 2},
+ {0x04d0, 0x0526, 2},
+ {0x0531, 0x0556, 1},
+ {0x10a0, 0x10c5, 1},
+ {0x1e00, 0x1e94, 2},
+ {0x1e9e, 0x1efe, 2},
+ {0x1f08, 0x1f0f, 1},
+ {0x1f18, 0x1f1d, 1},
+ {0x1f28, 0x1f2f, 1},
+ {0x1f38, 0x1f3f, 1},
+ {0x1f48, 0x1f4d, 1},
+ {0x1f59, 0x1f5f, 2},
+ {0x1f68, 0x1f6f, 1},
+ {0x1f88, 0x1f8f, 1},
+ {0x1f98, 0x1f9f, 1},
+ {0x1fa8, 0x1faf, 1},
+ {0x1fb8, 0x1fbc, 1},
+ {0x1fc8, 0x1fcc, 1},
+ {0x1fd8, 0x1fdb, 1},
+ {0x1fe8, 0x1fec, 1},
+ {0x1ff8, 0x1ffc, 1},
+ {0x2126, 0x212a, 4},
+ {0x212b, 0x2132, 7},
+ {0x2183, 0x2c00, 2685},
+ {0x2c01, 0x2c2e, 1},
+ {0x2c60, 0x2c62, 2},
+ {0x2c63, 0x2c64, 1},
+ {0x2c67, 0x2c6d, 2},
+ {0x2c6e, 0x2c70, 1},
+ {0x2c72, 0x2c75, 3},
+ {0x2c7e, 0x2c80, 1},
+ {0x2c82, 0x2ce2, 2},
+ {0x2ceb, 0x2ced, 2},
+ {0xa640, 0xa66c, 2},
+ {0xa680, 0xa696, 2},
+ {0xa722, 0xa72e, 2},
+ {0xa732, 0xa76e, 2},
+ {0xa779, 0xa77d, 2},
+ {0xa77e, 0xa786, 2},
+ {0xa78b, 0xa78d, 2},
+ {0xa790, 0xa7a0, 16},
+ {0xa7a2, 0xa7a8, 2},
+ {0xff21, 0xff3a, 1},
+ },
+ R32: []Range32{
+ {0x10400, 0x10427, 1},
+ },
+}
+
+var foldInherited = &RangeTable{
+ R16: []Range16{
+ {0x0399, 0x03b9, 32},
+ {0x1fbe, 0x1fbe, 1},
+ },
+}
+
+var foldM = &RangeTable{
+ R16: []Range16{
+ {0x0399, 0x03b9, 32},
+ {0x1fbe, 0x1fbe, 1},
+ },
+}
+
+var foldL = &RangeTable{
+ R16: []Range16{
+ {0x0345, 0x0345, 1},
+ },
+}
+
+var foldMn = &RangeTable{
+ R16: []Range16{
+ {0x0399, 0x03b9, 32},
+ {0x1fbe, 0x1fbe, 1},
+ },
+}
+
+var foldCommon = &RangeTable{
+ R16: []Range16{
+ {0x039c, 0x03bc, 32},
+ },
+}
+
+var foldGreek = &RangeTable{
+ R16: []Range16{
+ {0x00b5, 0x0345, 656},
+ },
+}
+
+var foldLu = &RangeTable{
+ R16: []Range16{
+ {0x0061, 0x007a, 1},
+ {0x00b5, 0x00df, 42},
+ {0x00e0, 0x00f6, 1},
+ {0x00f8, 0x00ff, 1},
+ {0x0101, 0x012f, 2},
+ {0x0133, 0x0137, 2},
+ {0x013a, 0x0148, 2},
+ {0x014b, 0x0177, 2},
+ {0x017a, 0x017e, 2},
+ {0x017f, 0x0180, 1},
+ {0x0183, 0x0185, 2},
+ {0x0188, 0x018c, 4},
+ {0x0192, 0x0195, 3},
+ {0x0199, 0x019a, 1},
+ {0x019e, 0x01a1, 3},
+ {0x01a3, 0x01a5, 2},
+ {0x01a8, 0x01ad, 5},
+ {0x01b0, 0x01b4, 4},
+ {0x01b6, 0x01b9, 3},
+ {0x01bd, 0x01bf, 2},
+ {0x01c5, 0x01c6, 1},
+ {0x01c8, 0x01c9, 1},
+ {0x01cb, 0x01cc, 1},
+ {0x01ce, 0x01dc, 2},
+ {0x01dd, 0x01ef, 2},
+ {0x01f2, 0x01f3, 1},
+ {0x01f5, 0x01f9, 4},
+ {0x01fb, 0x021f, 2},
+ {0x0223, 0x0233, 2},
+ {0x023c, 0x023f, 3},
+ {0x0240, 0x0242, 2},
+ {0x0247, 0x024f, 2},
+ {0x0250, 0x0254, 1},
+ {0x0256, 0x0257, 1},
+ {0x0259, 0x025b, 2},
+ {0x0260, 0x0263, 3},
+ {0x0265, 0x0268, 3},
+ {0x0269, 0x026b, 2},
+ {0x026f, 0x0271, 2},
+ {0x0272, 0x0275, 3},
+ {0x027d, 0x0283, 3},
+ {0x0288, 0x028c, 1},
+ {0x0292, 0x0345, 179},
+ {0x0371, 0x0373, 2},
+ {0x0377, 0x037b, 4},
+ {0x037c, 0x037d, 1},
+ {0x03ac, 0x03af, 1},
+ {0x03b1, 0x03ce, 1},
+ {0x03d0, 0x03d1, 1},
+ {0x03d5, 0x03d7, 1},
+ {0x03d9, 0x03ef, 2},
+ {0x03f0, 0x03f2, 1},
+ {0x03f5, 0x03fb, 3},
+ {0x0430, 0x045f, 1},
+ {0x0461, 0x0481, 2},
+ {0x048b, 0x04bf, 2},
+ {0x04c2, 0x04ce, 2},
+ {0x04cf, 0x0527, 2},
+ {0x0561, 0x0586, 1},
+ {0x1d79, 0x1d7d, 4},
+ {0x1e01, 0x1e95, 2},
+ {0x1e9b, 0x1ea1, 6},
+ {0x1ea3, 0x1eff, 2},
+ {0x1f00, 0x1f07, 1},
+ {0x1f10, 0x1f15, 1},
+ {0x1f20, 0x1f27, 1},
+ {0x1f30, 0x1f37, 1},
+ {0x1f40, 0x1f45, 1},
+ {0x1f51, 0x1f57, 2},
+ {0x1f60, 0x1f67, 1},
+ {0x1f70, 0x1f7d, 1},
+ {0x1fb0, 0x1fb1, 1},
+ {0x1fbe, 0x1fd0, 18},
+ {0x1fd1, 0x1fe0, 15},
+ {0x1fe1, 0x1fe5, 4},
+ {0x214e, 0x2184, 54},
+ {0x2c30, 0x2c5e, 1},
+ {0x2c61, 0x2c65, 4},
+ {0x2c66, 0x2c6c, 2},
+ {0x2c73, 0x2c76, 3},
+ {0x2c81, 0x2ce3, 2},
+ {0x2cec, 0x2cee, 2},
+ {0x2d00, 0x2d25, 1},
+ {0xa641, 0xa66d, 2},
+ {0xa681, 0xa697, 2},
+ {0xa723, 0xa72f, 2},
+ {0xa733, 0xa76f, 2},
+ {0xa77a, 0xa77c, 2},
+ {0xa77f, 0xa787, 2},
+ {0xa78c, 0xa791, 5},
+ {0xa7a1, 0xa7a9, 2},
+ {0xff41, 0xff5a, 1},
+ },
+ R32: []Range32{
+ {0x10428, 0x1044f, 1},
+ },
+}
+
+var foldLt = &RangeTable{
+ R16: []Range16{
+ {0x01c4, 0x01c6, 2},
+ {0x01c7, 0x01c9, 2},
+ {0x01ca, 0x01cc, 2},
+ {0x01f1, 0x01f3, 2},
+ {0x1f80, 0x1f87, 1},
+ {0x1f90, 0x1f97, 1},
+ {0x1fa0, 0x1fa7, 1},
+ {0x1fb3, 0x1fc3, 16},
+ {0x1ff3, 0x1ff3, 1},
+ },
+}
+
+// FoldScript maps a script name to a table of
+// code points outside the script that are equivalent under
+// simple case folding to code points inside the script.
+// If there is no entry for a script name, there are no such points.
+var FoldScript = map[string]*RangeTable{}
+
+
+// Range entries: 3391 16-bit, 659 32-bit, 4050 total.
+// Range bytes: 20346 16-bit, 7908 32-bit, 28254 total.
+
+// Fold orbit bytes: 147 pairs, 588 bytes