go/doc/comment: add heuristics for common badly formatted comments

In a set of 55M Go doc comments drawn from the latest version of all public Go modules known to the module proxy in spring 2020, the current Go 1.19 gofmt reformats about 1.57M of them. Out of those 1.57M comments, inspection of random samples shows that around 5% of the changed comments contain unindented code snippets, multiline shell commands, or lists. For example: // Here is a greeting: // // func main() { // fmt.Println("hello, world") // } // Run this command: // // path/to/your/program -flag1=longargument1 \ // -flag2=longargument2 \ // -flag3 // There are three possibilities: // // - Unindented code snippets (or JSON objects) // in which the first and last line are unindented // but end in { and start with }, respectively. // - Unindented multiline shell commands // in which the lines end in \ // - Unindented lists, in which wrapped lines are indented. All three of these cases involve unindented lines next to indented lines that would according to the usual rules begin a pre block. Before this CL, they'd be reformatted to: // Here is a greeting: // // func main() { // // fmt.Println("hello, world") // // } // Run this command: // // path/to/your/program -flag1=longargument1 \ // // -flag2=longargument2 \ // -flag3 // There are three possibilities: // // - Unindented code snippets (or JSON objects) // // in which the first and last line are unindented // but end in { and start with }, respectively. // // - Unindented multiline shell commands // // in which the lines end in \ // // - Unindented lists, in which wrapped lines are indented. The fact that they are not already in canonical format gives us a signal that they might not mean what the usual rules would say. This CL takes advantage of that opening to apply a few heuristics to better handle these cases: 1. If an indented code block immediately follows (without a blank line) an unindented line ending in { or \, include the unindented line in the code block. 2. If an indented code block immediately precedes (without a blank line) an unindented line beginning with }, include the unindented line in the code block. 3. If an indented line immediately follows (without a blank line) an unindented line that starts with a list marker, assume this is an unindented list with a wrapped indented line, and treat all adjacent unindented lines starting with list markers as part of the list, stopping at any surrounding blank lines. This raises the fraction of “correctly” reformatted doc comments in the corpus from approximately 87% to approximately 93%. Change-Id: I7ac542eb085032d607a7caf3ba9020787b2978b5 Reviewed-on: https://go-review.googlesource.com/c/go/+/410360 Auto-Submit: Russ Cox <rsc@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Roland Shoemaker <roland@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>
author: Russ Cox <rsc@golang.org> 2022-06-03 12:44:58 -0400
committer: Gopher Robot <gobot@golang.org> 2022-06-06 20:47:52 +0000
commit: 3651a6117e9a88576615c29c4faf7eeec55d7691 (patch)
tree: 3814530b85f5ac27871b607c8a1a0932538f6d47
parent: 4c08260c51c6ebe405a78a30f970014af763ca38 (diff)
download: go-3651a6117e9a88576615c29c4faf7eeec55d7691.tar.gz
go-3651a6117e9a88576615c29c4faf7eeec55d7691.zip
7 files changed, 322 insertions, 99 deletions
diff --git a/src/go/doc/comment/parse.go b/src/go/doc/comment/parse.go
index 8a311ff817..4de8ce710d 100644
--- a/src/go/doc/comment/parse.go
+++ b/src/go/doc/comment/parse.go
@@ -260,7 +260,8 @@ func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) {
 }
 
 func isStdPkg(path string) bool {
-	// TODO(rsc): Use sort.Find.
+	// TODO(rsc): Use sort.Find once we don't have to worry about
+	// copying this code into older Go environments.
 	i := sort.Search(len(stdPkgs), func(i int) bool { return stdPkgs[i] >= path })
 	return i < len(stdPkgs) && stdPkgs[i] == path
 }
@@ -297,44 +298,27 @@ func (p *Parser) Parse(text string) *Doc {
 
 	// First pass: break into block structure and collect known links.
 	// The text is all recorded as Plain for now.
-	// TODO: Break into actual block structure.
-	didHeading := false
-	all := lines
-	for len(lines) > 0 {
-		line := lines[0]
-		n := len(lines)
+	var prev span
+	for _, s := range parseSpans(lines) {
 		var b Block
-
-		switch {
-		case line == "":
-			// emit nothing
-
-		case isList(line):
-			prevWasBlank := len(lines) < len(all) && all[len(all)-len(lines)-1] == ""
-			b, lines = d.list(lines, prevWasBlank)
-
-		case isIndented(line):
-			b, lines = d.code(lines)
-
-		case (len(lines) == 1 || lines[1] == "") && !didHeading && isOldHeading(line, all, len(all)-n):
-			b = d.oldHeading(line)
-			didHeading = true
-
-		case (len(lines) == 1 || lines[1] == "") && isHeading(line):
-			b = d.heading(line)
-			didHeading = true
-
+		switch s.kind {
 		default:
-			b, lines = d.paragraph(lines)
-			didHeading = false
+			panic("go/doc/comment: internal error: unknown span kind")
+		case spanList:
+			b = d.list(lines[s.start:s.end], prev.end < s.start)
+		case spanCode:
+			b = d.code(lines[s.start:s.end])
+		case spanOldHeading:
+			b = d.oldHeading(lines[s.start])
+		case spanHeading:
+			b = d.heading(lines[s.start])
+		case spanPara:
+			b = d.paragraph(lines[s.start:s.end])
 		}
-
 		if b != nil {
 			d.Content = append(d.Content, b)
 		}
-		if len(lines) == n {
-			lines = lines[1:]
-		}
+		prev = s
 	}
 
 	// Second pass: interpret all the Plain text now that we know the links.
@@ -348,9 +332,172 @@ func (p *Parser) Parse(text string) *Doc {
 	return d.Doc
 }
 
+// A span represents a single span of comment lines (lines[start:end])
+// of an identified kind (code, heading, paragraph, and so on).
+type span struct {
+	start int
+	end   int
+	kind  spanKind
+}
+
+// A spanKind describes the kind of span.
+type spanKind int
+
+const (
+	_ spanKind = iota
+	spanCode
+	spanHeading
+	spanList
+	spanOldHeading
+	spanPara
+)
+
+func parseSpans(lines []string) []span {
+	var spans []span
+
+	// The loop may process a line twice: once as unindented
+	// and again forced indented. So the maximum expected
+	// number of iterations is 2*len(lines). The repeating logic
+	// can be subtle, though, and to protect against introduction
+	// of infinite loops in future changes, we watch to see that
+	// we are not looping too much. A panic is better than a
+	// quiet infinite loop.
+	watchdog := 2 * len(lines)
+
+	i := 0
+	forceIndent := 0
+Spans:
+	for {
+		// Skip blank lines.
+		for i < len(lines) && lines[i] == "" {
+			i++
+		}
+		if i >= len(lines) {
+			break
+		}
+		if watchdog--; watchdog < 0 {
+			panic("go/doc/comment: internal error: not making progress")
+		}
+
+		var kind spanKind
+		start := i
+		end := i
+		if i < forceIndent || indented(lines[i]) {
+			// Indented (or force indented).
+			// Ends before next unindented. (Blank lines are OK.)
+			// If this is an unindented list that we are heuristically treating as indented,
+			// then accept unindented list item lines up to the first blank lines.
+			// The heuristic is disabled at blank lines to contain its effect
+			// to non-gofmt'ed sections of the comment.
+			unindentedListOK := isList(lines[i]) && i < forceIndent
+			i++
+			for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) {
+				if lines[i] == "" {
+					unindentedListOK = false
+				}
+				i++
+			}
+
+			// Drop trailing blank lines.
+			end = i
+			for end > start && lines[end-1] == "" {
+				end--
+			}
+
+			// If indented lines are followed (without a blank line)
+			// by an unindented line ending in a brace,
+			// take that one line too. This fixes the common mistake
+			// of pasting in something like
+			//
+			// func main() {
+			//	fmt.Println("hello, world")
+			// }
+			//
+			// and forgetting to indent it.
+			// The heuristic will never trigger on a gofmt'ed comment,
+			// because any gofmt'ed code block or list would be
+			// followed by a blank line or end of comment.
+			if end < len(lines) && strings.HasPrefix(lines[end], "}") {
+				end++
+			}
+
+			if isList(lines[start]) {
+				kind = spanList
+			} else {
+				kind = spanCode
+			}
+		} else {
+			// Unindented. Ends at next blank or indented line.
+			i++
+			for i < len(lines) && lines[i] != "" && !indented(lines[i]) {
+				i++
+			}
+			end = i
+
+			// If unindented lines are followed (without a blank line)
+			// by an indented line that would start a code block,
+			// check whether the final unindented lines
+			// should be left for the indented section.
+			// This can happen for the common mistakes of
+			// unindented code or unindented lists.
+			// The heuristic will never trigger on a gofmt'ed comment,
+			// because any gofmt'ed code block would have a blank line
+			// preceding it after the unindented lines.
+			if i < len(lines) && lines[i] != "" && !isList(lines[i]) {
+				switch {
+				case isList(lines[i-1]):
+					// If the final unindented line looks like a list item,
+					// this may be the first indented line wrap of
+					// a mistakenly unindented list.
+					// Leave all the unindented list items.
+					forceIndent = end
+					end--
+					for end > start && isList(lines[end-1]) {
+						end--
+					}
+
+				case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`):
+					// If the final unindented line ended in { or \
+					// it is probably the start of a misindented code block.
+					// Give the user a single line fix.
+					// Often that's enough; if not, the user can fix the others themselves.
+					forceIndent = end
+					end--
+				}
+
+				if start == end && forceIndent > start {
+					i = start
+					continue Spans
+				}
+			}
+
+			// Span is either paragraph or heading.
+			if end-start == 1 && isHeading(lines[start]) {
+				kind = spanHeading
+			} else if end-start == 1 && isOldHeading(lines[start], lines, start) {
+				kind = spanOldHeading
+			} else {
+				kind = spanPara
+			}
+		}
+
+		spans = append(spans, span{start, end, kind})
+		i = end
+	}
+
+	return spans
+}
+
+// indented reports whether line is indented
+// (starts with a leading space or tab).
+func indented(line string) bool {
+	return line != "" && (line[0] == ' ' || line[0] == '\t')
+}
+
 // unindent removes any common space/tab prefix
 // from each line in lines, returning a copy of lines in which
 // those prefixes have been trimmed from each line.
+// It also replaces any lines containing only spaces with blank lines (empty strings).
 func unindent(lines []string) []string {
 	// Trim leading and trailing blank lines.
 	for len(lines) > 0 && isBlank(lines[0]) {
@@ -480,58 +627,16 @@ func (d *parseDoc) heading(line string) Block {
 	return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}}
 }
 
-// code returns a code block built from the indented text
-// at the start of lines, along with the remainder of the lines.
-// If there is no indented text at the start, or if the indented
-// text consists only of empty lines, code returns a nil Block.
-func (d *parseDoc) code(lines []string) (b Block, rest []string) {
-	lines, rest = indented(lines)
+// code returns a code block built from the lines.
+func (d *parseDoc) code(lines []string) *Code {
 	body := unindent(lines)
-	if len(body) == 0 {
-		return nil, rest
-	}
 	body = append(body, "") // to get final \n from Join
-	return &Code{Text: strings.Join(body, "\n")}, rest
-}
-
-// isIndented reports whether the line is indented,
-// meaning it starts with a space or tab.
-func isIndented(line string) bool {
-	return line != "" && (line[0] == ' ' || line[0] == '\t')
+	return &Code{Text: strings.Join(body, "\n")}
 }
 
-// indented splits lines into an initial indented section
-// and the remaining lines, returning the two halves.
-func indented(lines []string) (indented, rest []string) {
-	// Blank lines mid-run are OK, but not at the end.
-	i := 0
-	for i < len(lines) && (isIndented(lines[i]) || lines[i] == "") {
-		i++
-	}
-	for i > 0 && lines[i-1] == "" {
-		i--
-	}
-	return lines[:i], lines[i:]
-}
-
-// paragraph returns a paragraph block built from the
-// unindented text at the start of lines, along with the remainder of the lines.
-// If there is no unindented text at the start of lines,
-// then paragraph returns a nil Block.
-func (d *parseDoc) paragraph(lines []string) (b Block, rest []string) {
-	// Paragraph is interrupted by any indented line,
-	// which is either a list or a code block,
-	// and of course by a blank line.
-	// It is not interrupted by a # line - headings must stand alone.
-	i := 0
-	for i < len(lines) && lines[i] != "" && !isIndented(lines[i]) {
-		i++
-	}
-	lines, rest = lines[:i], lines[i:]
-	if len(lines) == 0 {
-		return nil, rest
-	}
-
+// paragraph returns a paragraph block built from the lines.
+// If the lines are link definitions, paragraph adds them to d and returns nil.
+func (d *parseDoc) paragraph(lines []string) Block {
 	// Is this a block of known links? Handle.
 	var defs []*LinkDef
 	for _, line := range lines {
@@ -547,10 +652,10 @@ func (d *parseDoc) paragraph(lines []string) (b Block, rest []string) {
 			d.links[def.Text] = def
 		}
 	}
-	return nil, rest
+	return nil
 NoDefs:
 
-	return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}, rest
+	return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}
 }
 
 // parseLink parses a single link definition line:
@@ -581,14 +686,9 @@ func parseLink(line string) (*LinkDef, bool) {
 	return &LinkDef{Text: text, URL: url}, true
 }
 
-// list returns a list built from the indented text at the start of lines,
+// list returns a list built from the indented lines,
 // using forceBlankBefore as the value of the List's ForceBlankBefore field.
-// The caller is responsible for ensuring that the first line of lines
-// satisfies isList.
-// list returns the *List as a Block along with the remaining lines.
-func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []string) {
-	lines, rest = indented(lines)
-
+func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List {
 	num, _, _ := listMarker(lines[0])
 	var (
 		list *List = &List{ForceBlankBefore: forceBlankBefore}
@@ -597,7 +697,7 @@ func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []
 	)
 	flush := func() {
 		if item != nil {
-			if para, _ := d.paragraph(text); para != nil {
+			if para := d.paragraph(text); para != nil {
 				item.Content = append(item.Content, para)
 			}
 		}
@@ -622,17 +722,14 @@ func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []
 		text = append(text, strings.TrimSpace(line))
 	}
 	flush()
-	return list, rest
+	return list
 }
 
-// listMarker parses the line as an indented line beginning with a list marker.
+// listMarker parses the line as beginning with a list marker.
 // If it can do that, it returns the numeric marker ("" for a bullet list),
 // the rest of the line, and ok == true.
 // Otherwise, it returns "", "", false.
 func listMarker(line string) (num, rest string, ok bool) {
-	if !isIndented(line) {
-		return "", "", false
-	}
 	line = strings.TrimSpace(line)
 	if line == "" {
 		return "", "", false
@@ -654,7 +751,7 @@ func listMarker(line string) (num, rest string, ok bool) {
 		return "", "", false
 	}
 
-	if !isIndented(rest) || strings.TrimSpace(rest) == "" {
+	if !indented(rest) || strings.TrimSpace(rest) == "" {
 		return "", "", false
 	}
 
@@ -662,7 +759,8 @@ func listMarker(line string) (num, rest string, ok bool) {
 }
 
 // isList reports whether the line is the first line of a list,
-// meaning is indented and starts with a list marker.
+// meaning starts with a list marker after any indentation.
+// (The caller is responsible for checking the line is indented, as appropriate.)
 func isList(line string) bool {
 	_, _, ok := listMarker(line)
 	return ok
diff --git a/src/go/doc/comment/testdata/code4.txt b/src/go/doc/comment/testdata/code4.txt
new file mode 100644
index 0000000000..f128c9aeff
--- /dev/null
+++ b/src/go/doc/comment/testdata/code4.txt
@@ -0,0 +1,38 @@
+-- input --
+To test, run this command:
+  go test -more
+
+Or, to test specific things, run this command:
+
+go test -more \
+  -pkg first/package \
+  -pkg second/package \
+  -pkg third/package
+
+Happy testing!
+-- gofmt --
+To test, run this command:
+
+	go test -more
+
+Or, to test specific things, run this command:
+
+	go test -more \
+	  -pkg first/package \
+	  -pkg second/package \
+	  -pkg third/package
+
+Happy testing!
+-- markdown --
+To test, run this command:
+
+	go test -more
+
+Or, to test specific things, run this command:
+
+	go test -more \
+	  -pkg first/package \
+	  -pkg second/package \
+	  -pkg third/package
+
+Happy testing!
diff --git a/src/go/doc/comment/testdata/code5.txt b/src/go/doc/comment/testdata/code5.txt
new file mode 100644
index 0000000000..0e340dd129
--- /dev/null
+++ b/src/go/doc/comment/testdata/code5.txt
@@ -0,0 +1,21 @@
+-- input --
+L1
+L2
+L3
+L4
+L5
+- L6 {
+	L7
+}
+L8
+-- gofmt --
+L1
+L2
+L3
+L4
+L5
+  - L6 {
+    L7
+    }
+
+L8
diff --git a/src/go/doc/comment/testdata/code6.txt b/src/go/doc/comment/testdata/code6.txt
new file mode 100644
index 0000000000..d2915d1068
--- /dev/null
+++ b/src/go/doc/comment/testdata/code6.txt
@@ -0,0 +1,24 @@
+-- input --
+Run this program:
+
+func main() {
+	fmt.Println("hello, world")
+}
+
+Or this:
+
+go func() {
+	fmt.Println("hello, world")
+}()
+-- gofmt --
+Run this program:
+
+	func main() {
+		fmt.Println("hello, world")
+	}
+
+Or this:
+
+	go func() {
+		fmt.Println("hello, world")
+	}()
diff --git a/src/go/doc/comment/testdata/list10.txt b/src/go/doc/comment/testdata/list10.txt
new file mode 100644
index 0000000000..9c49083456
--- /dev/null
+++ b/src/go/doc/comment/testdata/list10.txt
@@ -0,0 +1,13 @@
+-- input --
+
+	1. This list
+	2. Starts the comment
+	3. And also has a blank line before it.
+
+All of which is a little weird.
+-- gofmt --
+ 1. This list
+ 2. Starts the comment
+ 3. And also has a blank line before it.
+
+All of which is a little weird.
diff --git a/src/go/doc/comment/testdata/list9.txt b/src/go/doc/comment/testdata/list9.txt
new file mode 100644
index 0000000000..48e4673d54
--- /dev/null
+++ b/src/go/doc/comment/testdata/list9.txt
@@ -0,0 +1,30 @@
+-- input --
+Text.
+
+1. Not a list
+2. because it is
+3. unindented.
+
+4. This one
+  is a list
+  because of the indented text.
+5. More wrapped
+  items.
+6. And unwrapped.
+
+7. The blank line stops the heuristic.
+-- gofmt --
+Text.
+
+1. Not a list
+2. because it is
+3. unindented.
+
+ 4. This one
+    is a list
+    because of the indented text.
+ 5. More wrapped
+    items.
+ 6. And unwrapped.
+
+7. The blank line stops the heuristic.
diff --git a/src/go/doc/comment/text.go b/src/go/doc/comment/text.go
index 86e5eebe9a..6f9c2e201d 100644
--- a/src/go/doc/comment/text.go
+++ b/src/go/doc/comment/text.go
@@ -134,7 +134,6 @@ func (p *textPrinter) block(out *bytes.Buffer, x Block) {
 }
 
 // text prints the text sequence x to out.
-// TODO: Wrap lines.
 func (p *textPrinter) text(out *bytes.Buffer, indent string, x []Text) {
 	p.oneLongLine(&p.long, x)
 	words := strings.Fields(p.long.String())
author	Russ Cox <rsc@golang.org>	2022-06-03 12:44:58 -0400
committer	Gopher Robot <gobot@golang.org>	2022-06-06 20:47:52 +0000
commit	3651a6117e9a88576615c29c4faf7eeec55d7691 (patch)
tree	3814530b85f5ac27871b607c8a1a0932538f6d47
parent	4c08260c51c6ebe405a78a30f970014af763ca38 (diff)
download	go-3651a6117e9a88576615c29c4faf7eeec55d7691.tar.gz go-3651a6117e9a88576615c29c4faf7eeec55d7691.zip