html: remove the Tokenizer.ReturnComments option.

The original intention was to simplify the parser, in making it skip all comment tokens. However, checking that the Go html package is 100% compatible with the WebKit HTML test suite requires parsing the comments. There is no longer any real benefit for the option. R=gri, andybalholm CC=golang-dev https://golang.org/cl/5321043
author: Nigel Tao <nigeltao@golang.org> 2011-10-25 11:28:07 +1100
committer: Nigel Tao <nigeltao@golang.org> 2011-10-25 11:28:07 +1100
commit: 18b025d530b2410c74c094c0e78671570c60b7bd (patch)
tree: 81a1e0f3e95da2a932f71be4f62e297ea096a0f5
parent: 5791233461d9eaef94f8a29cee7a1933a5c015d2 (diff)
download: go-18b025d530b2410c74c094c0e78671570c60b7bd.tar.gz
go-18b025d530b2410c74c094c0e78671570c60b7bd.zip
4 files changed, 21 insertions, 34 deletions
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index 5bc0630861..ba9d188486 100644
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 
-A Tokenizer typically skips over HTML comments. To return comment tokens, set
-Tokenizer.ReturnComments to true before looping over calls to Next.
-
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
index 2c7294b4f3..d1d4e483c5 100644
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) {
 		scripting:  true,
 		framesetOK: true,
 	}
-	p.tokenizer.ReturnComments = true
 	// Iterate until EOF. Any other error will cause an early return.
 	im, consumed := initialIM, true
 	for {
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 2826f95f17..952d17468b 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -116,10 +116,6 @@ type span struct {
 
 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
-	// If ReturnComments is set, Next returns comment tokens;
-	// otherwise it skips over comments (default).
-	ReturnComments bool
-
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the current token.
@@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }
 
-// next scans the next token and returns its type.
-func (z *Tokenizer) next() TokenType {
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
 	if z.err != nil {
-		return ErrorToken
+		z.tt = ErrorToken
+		return z.tt
 	}
 	z.raw.start = z.raw.end
 	z.data.start = z.raw.end
 	z.data.end = z.raw.end
 	if z.rawTag != "" {
 		z.readRawOrRCDATA()
-		return TextToken
+		z.tt = TextToken
+		return z.tt
 	}
 	z.textIsRaw = false
 
@@ -596,11 +594,13 @@ loop:
 		if x := z.raw.end - len("<a"); z.raw.start < x {
 			z.raw.end = x
 			z.data.end = x
-			return TextToken
+			z.tt = TextToken
+			return z.tt
 		}
 		switch tokenType {
 		case StartTagToken:
-			return z.readStartTag()
+			z.tt = z.readStartTag()
+			return z.tt
 		case EndTagToken:
 			c = z.readByte()
 			if z.err != nil {
@@ -616,39 +616,31 @@ loop:
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 				z.readEndTag()
-				return EndTagToken
+				z.tt = EndTagToken
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		case CommentToken:
 			if c == '!' {
-				return z.readMarkupDeclaration()
+				z.tt = z.readMarkupDeclaration()
+				return z.tt
 			}
 			z.raw.end--
 			z.readUntilCloseAngle()
-			return CommentToken
+			z.tt = CommentToken
+			return z.tt
 		}
 	}
 	if z.raw.start < z.raw.end {
 		z.data.end = z.raw.end
-		return TextToken
-	}
-	return ErrorToken
-}
-
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	for {
-		z.tt = z.next()
-		// TODO: remove the ReturnComments option. A tokenizer should
-		// always return comment tags.
-		if z.tt == CommentToken && !z.ReturnComments {
-			continue
-		}
+		z.tt = TextToken
 		return z.tt
 	}
-	panic("unreachable")
+	z.tt = ErrorToken
+	return z.tt
 }
 
 // Raw returns the unmodified text of the current token. Calling Next, Token,
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 310cd97d67..45ce85e911 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -424,7 +424,6 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(strings.NewReader(tt.html))
-		z.ReturnComments = true
 		if tt.golden != "" {
 			for i, s := range strings.Split(tt.golden, "$") {
 				if z.Next() == ErrorToken {
author	Nigel Tao <nigeltao@golang.org>	2011-10-25 11:28:07 +1100
committer	Nigel Tao <nigeltao@golang.org>	2011-10-25 11:28:07 +1100
commit	18b025d530b2410c74c094c0e78671570c60b7bd (patch)
tree	81a1e0f3e95da2a932f71be4f62e297ea096a0f5
parent	5791233461d9eaef94f8a29cee7a1933a5c015d2 (diff)
download	go-18b025d530b2410c74c094c0e78671570c60b7bd.tar.gz go-18b025d530b2410c74c094c0e78671570c60b7bd.zip