unicode: improve generated comments for categories

The comments on the category range tables in the unicode package are fairly redundent and require an external source to translate into human readable category names. This adds a look up table with the category descriptions and uses it if available when generating the comments for the range tables. Fixes #28954 Change-Id: I853e2d270def6492c2c1dd2ad0ec761a74c04e5d Reviewed-on: https://go-review.googlesource.com/c/151297 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
author: Wil Selwood <wselwood@gmail.com> 2018-11-26 15:11:45 +0000
committer: Brad Fitzpatrick <bradfitz@golang.org> 2018-11-28 03:12:48 +0000
commit: d704b5c956f431e618904d15643e9fee97840253 (patch)
tree: 4f5e8e8d4e5c2db4f762c34b91af734e0e08566b /src/unicode
parent: 6bf531384dd233d7620181d5086a52571d64c5c0 (diff)
download: go-d704b5c956f431e618904d15643e9fee97840253.tar.gz
go-d704b5c956f431e618904d15643e9fee97840253.zip
2 files changed, 72 insertions, 32 deletions
diff --git a/src/unicode/maketables.go b/src/unicode/maketables.go
index b11b77c634..a1f15869ea 100644
--- a/src/unicode/maketables.go
+++ b/src/unicode/maketables.go
@@ -458,6 +458,39 @@ package unicode
 
 `
 
+var categoryMapping = map[string]string{
+	"Lu": "Letter, uppercase",
+	"Ll": "Letter, lowercase",
+	"Lt": "Letter, titlecase",
+	"Lm": "Letter, modifier",
+	"Lo": "Letter, other",
+	"Mn": "Mark, nonspacing",
+	"Mc": "Mark, spacing combining",
+	"Me": "Mark, enclosing",
+	"Nd": "Number, decimal digit",
+	"Nl": "Number, letter",
+	"No": "Number, other",
+	"Pc": "Punctuation, connector",
+	"Pd": "Punctuation, dash",
+	"Ps": "Punctuation, open",
+	"Pe": "Punctuation, close",
+	"Pi": "Punctuation, initial quote",
+	"Pf": "Punctuation, final quote",
+	"Po": "Punctuation, other",
+	"Sm": "Symbol, math",
+	"Sc": "Symbol, currency",
+	"Sk": "Symbol, modifier",
+	"So": "Symbol, other",
+	"Zs": "Separator, space",
+	"Zl": "Separator, line",
+	"Zp": "Separator, paragraph",
+	"Cc": "Other, control",
+	"Cf": "Other, format",
+	"Cs": "Other, surrogate",
+	"Co": "Other, private use",
+	"Cn": "Other, not assigned",
+}
+
 func printCategories() {
 	if *tablelist == "" {
 		return
@@ -528,9 +561,16 @@ func printCategories() {
 			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
 		}
 		if len(name) > 1 {
-			varDecl += fmt.Sprintf(
-				"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
-				name, name, name, name)
+			desc, ok := categoryMapping[name]
+			if ok {
+				varDecl += fmt.Sprintf(
+					"\t%s = _%s;	// %s is the set of Unicode characters in category %s (%s).\n",
+					name, name, name, name, desc)
+			} else {
+				varDecl += fmt.Sprintf(
+					"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
+					name, name, name, name)
+			}
 		}
 		decl[ndecl] = varDecl
 		ndecl++
diff --git a/src/unicode/tables.go b/src/unicode/tables.go
index dd2f70b651..ce85b128ca 100644
--- a/src/unicode/tables.go
+++ b/src/unicode/tables.go
@@ -3380,53 +3380,53 @@ var _Zs = &RangeTable{
 
 // These variables have type *RangeTable.
 var (
-	Cc     = _Cc // Cc is the set of Unicode characters in category Cc.
-	Cf     = _Cf // Cf is the set of Unicode characters in category Cf.
-	Co     = _Co // Co is the set of Unicode characters in category Co.
-	Cs     = _Cs // Cs is the set of Unicode characters in category Cs.
+	Cc     = _Cc // Cc is the set of Unicode characters in category Cc (Other, control).
+	Cf     = _Cf // Cf is the set of Unicode characters in category Cf (Other, format).
+	Co     = _Co // Co is the set of Unicode characters in category Co (Other, private use).
+	Cs     = _Cs // Cs is the set of Unicode characters in category Cs (Other, surrogate).
 	Digit  = _Nd // Digit is the set of Unicode characters with the "decimal digit" property.
-	Nd     = _Nd // Nd is the set of Unicode characters in category Nd.
+	Nd     = _Nd // Nd is the set of Unicode characters in category Nd (Number, decimal digit).
 	Letter = _L  // Letter/L is the set of Unicode letters, category L.
 	L      = _L
-	Lm     = _Lm // Lm is the set of Unicode characters in category Lm.
-	Lo     = _Lo // Lo is the set of Unicode characters in category Lo.
+	Lm     = _Lm // Lm is the set of Unicode characters in category Lm (Letter, modifier).
+	Lo     = _Lo // Lo is the set of Unicode characters in category Lo (Letter, other).
 	Lower  = _Ll // Lower is the set of Unicode lower case letters.
-	Ll     = _Ll // Ll is the set of Unicode characters in category Ll.
+	Ll     = _Ll // Ll is the set of Unicode characters in category Ll (Letter, lowercase).
 	Mark   = _M  // Mark/M is the set of Unicode mark characters, category M.
 	M      = _M
-	Mc     = _Mc // Mc is the set of Unicode characters in category Mc.
-	Me     = _Me // Me is the set of Unicode characters in category Me.
-	Mn     = _Mn // Mn is the set of Unicode characters in category Mn.
-	Nl     = _Nl // Nl is the set of Unicode characters in category Nl.
-	No     = _No // No is the set of Unicode characters in category No.
+	Mc     = _Mc // Mc is the set of Unicode characters in category Mc (Mark, spacing combining).
+	Me     = _Me // Me is the set of Unicode characters in category Me (Mark, enclosing).
+	Mn     = _Mn // Mn is the set of Unicode characters in category Mn (Mark, nonspacing).
+	Nl     = _Nl // Nl is the set of Unicode characters in category Nl (Number, letter).
+	No     = _No // No is the set of Unicode characters in category No (Number, other).
 	Number = _N  // Number/N is the set of Unicode number characters, category N.
 	N      = _N
 	Other  = _C // Other/C is the set of Unicode control and special characters, category C.
 	C      = _C
-	Pc     = _Pc // Pc is the set of Unicode characters in category Pc.
-	Pd     = _Pd // Pd is the set of Unicode characters in category Pd.
-	Pe     = _Pe // Pe is the set of Unicode characters in category Pe.
-	Pf     = _Pf // Pf is the set of Unicode characters in category Pf.
-	Pi     = _Pi // Pi is the set of Unicode characters in category Pi.
-	Po     = _Po // Po is the set of Unicode characters in category Po.
-	Ps     = _Ps // Ps is the set of Unicode characters in category Ps.
+	Pc     = _Pc // Pc is the set of Unicode characters in category Pc (Punctuation, connector).
+	Pd     = _Pd // Pd is the set of Unicode characters in category Pd (Punctuation, dash).
+	Pe     = _Pe // Pe is the set of Unicode characters in category Pe (Punctuation, close).
+	Pf     = _Pf // Pf is the set of Unicode characters in category Pf (Punctuation, final quote).
+	Pi     = _Pi // Pi is the set of Unicode characters in category Pi (Punctuation, initial quote).
+	Po     = _Po // Po is the set of Unicode characters in category Po (Punctuation, other).
+	Ps     = _Ps // Ps is the set of Unicode characters in category Ps (Punctuation, open).
 	Punct  = _P  // Punct/P is the set of Unicode punctuation characters, category P.
 	P      = _P
-	Sc     = _Sc // Sc is the set of Unicode characters in category Sc.
-	Sk     = _Sk // Sk is the set of Unicode characters in category Sk.
-	Sm     = _Sm // Sm is the set of Unicode characters in category Sm.
-	So     = _So // So is the set of Unicode characters in category So.
+	Sc     = _Sc // Sc is the set of Unicode characters in category Sc (Symbol, currency).
+	Sk     = _Sk // Sk is the set of Unicode characters in category Sk (Symbol, modifier).
+	Sm     = _Sm // Sm is the set of Unicode characters in category Sm (Symbol, math).
+	So     = _So // So is the set of Unicode characters in category So (Symbol, other).
 	Space  = _Z  // Space/Z is the set of Unicode space characters, category Z.
 	Z      = _Z
 	Symbol = _S // Symbol/S is the set of Unicode symbol characters, category S.
 	S      = _S
 	Title  = _Lt // Title is the set of Unicode title case letters.
-	Lt     = _Lt // Lt is the set of Unicode characters in category Lt.
+	Lt     = _Lt // Lt is the set of Unicode characters in category Lt (Letter, titlecase).
 	Upper  = _Lu // Upper is the set of Unicode upper case letters.
-	Lu     = _Lu // Lu is the set of Unicode characters in category Lu.
-	Zl     = _Zl // Zl is the set of Unicode characters in category Zl.
-	Zp     = _Zp // Zp is the set of Unicode characters in category Zp.
-	Zs     = _Zs // Zs is the set of Unicode characters in category Zs.
+	Lu     = _Lu // Lu is the set of Unicode characters in category Lu (Letter, uppercase).
+	Zl     = _Zl // Zl is the set of Unicode characters in category Zl (Separator, line).
+	Zp     = _Zp // Zp is the set of Unicode characters in category Zp (Separator, paragraph).
+	Zs     = _Zs // Zs is the set of Unicode characters in category Zs (Separator, space).
 )
 
 // Generated by running
author	Wil Selwood <wselwood@gmail.com>	2018-11-26 15:11:45 +0000
committer	Brad Fitzpatrick <bradfitz@golang.org>	2018-11-28 03:12:48 +0000
commit	d704b5c956f431e618904d15643e9fee97840253 (patch)
tree	4f5e8e8d4e5c2db4f762c34b91af734e0e08566b /src/unicode
parent	6bf531384dd233d7620181d5086a52571d64c5c0 (diff)
download	go-d704b5c956f431e618904d15643e9fee97840253.tar.gz go-d704b5c956f431e618904d15643e9fee97840253.zip