html: handle unexpected EOF during parsing.

This lets us parse HTML like "<html>foo". R=gri CC=golang-dev https://golang.org/cl/3460043
author: Nigel Tao <nigeltao@golang.org> 2010-12-08 08:59:20 +1100
committer: Nigel Tao <nigeltao@golang.org> 2010-12-08 08:59:20 +1100
commit: 49014c5b12bd00817203c5ae5dbd4ec1c8f6d157 (patch)
tree: 773b51c86dd6b0248cd1982d61e6272e5da32b41
parent: 8d50557979b30c277c7c846bb61f6d1a0466db4a (diff)
download: go-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.tar.gz
go-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.zip
2 files changed, 48 insertions, 24 deletions
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
index d3c1f12135..acc3eccbcc 100644
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -32,11 +32,6 @@ type Node struct {
 	Attr   []Attribute
 }
 
-// An insertion mode (section 10.2.3.1) is the state transition function from
-// a particular state in the HTML5 parser's state machine. In addition to
-// returning the next state, it also returns whether the token was consumed.
-type insertionMode func(*parser) (insertionMode, bool)
-
 // A parser implements the HTML5 parsing algorithm:
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
 type parser struct {
@@ -121,11 +116,12 @@ func (p *parser) read() os.Error {
 		p.tok.Attr = nil
 		return nil
 	}
-	if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
-		return p.tokenizer.Error()
-	}
+	p.tokenizer.Next()
 	p.tok = p.tokenizer.Token()
-	if p.tok.Type == SelfClosingTagToken {
+	switch p.tok.Type {
+	case ErrorToken:
+		return p.tokenizer.Error()
+	case SelfClosingTagToken:
 		p.hasSelfClosingToken = true
 		p.tok.Type = StartTagToken
 	}
@@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() {
 	p.hasSelfClosingToken = false
 }
 
+// An insertion mode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. It updates the
+// parser's fields depending on parser.token (where ErrorToken means EOF). In
+// addition to returning the next insertionMode state, it also returns whether
+// the token was consumed.
+type insertionMode func(*parser) (insertionMode, bool)
+
 // Section 10.2.5.4.
 func initialInsertionMode(p *parser) (insertionMode, bool) {
 	// TODO(nigeltao): check p.tok for DOCTYPE.
@@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
 		implied bool
 	)
 	switch p.tok.Type {
+	case ErrorToken:
+		implied = true
 	case TextToken:
 		// TODO(nigeltao): distinguish whitespace text from others.
 		implied = true
@@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
 			implied = true
 		}
 	case EndTagToken:
-		// TODO.
+		switch p.tok.Data {
+		case "head", "body", "html", "br":
+			implied = true
+		default:
+			// Ignore the token.
+		}
 	}
 	if add || implied {
 		p.addChild(&Node{
@@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
 		implied bool
 	)
 	switch p.tok.Type {
+	case ErrorToken:
+		implied = true
 	case TextToken:
 		// TODO(nigeltao): distinguish whitespace text from others.
 		implied = true
@@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
 			add = true
 			attr = p.tok.Attr
 		case "html":
-			// TODO.
+			return inBodyInsertionMode, false
 		default:
 			implied = true
 		}
 	case EndTagToken:
-		// TODO.
+		switch p.tok.Data {
+		case "head", "body", "html", "br":
+			implied = true
+		default:
+			// Ignore the token.
+		}
 	}
 	if add || implied {
 		p.addChild(&Node{
@@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) {
 		implied bool
 	)
 	switch p.tok.Type {
-	case TextToken:
+	case ErrorToken, TextToken:
 		implied = true
 	case StartTagToken:
 		switch p.tok.Data {
@@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
 		implied    bool
 	)
 	switch p.tok.Type {
-	case TextToken:
+	case ErrorToken, TextToken:
 		implied = true
 		framesetOK = true
 	case StartTagToken:
@@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
 func inBodyInsertionMode(p *parser) (insertionMode, bool) {
 	var endP bool
 	switch p.tok.Type {
+	case ErrorToken:
+		// No-op.
 	case TextToken:
 		p.addText(p.tok.Data)
 		p.framesetOK = false
@@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) {
 // Section 10.2.5.22.
 func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
 	switch p.tok.Type {
+	case ErrorToken:
+		// TODO.
 	case TextToken:
 		// TODO.
 	case StartTagToken:
@@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) {
 		scripting:  true,
 		framesetOK: true,
 	}
+	// Iterate until EOF. Any other error will cause an early return.
 	im, consumed := initialInsertionMode, true
 	for {
 		if consumed {
@@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) {
 		}
 		im, consumed = im(p)
 	}
-	// TODO(nigeltao): clean up, depending on the value of im.
-	// The specification's algorithm does clean up on reading an EOF 'token',
-	// but in go we represent EOF by an os.Error instead.
+	// Loop until the final token (the ErrorToken signifying EOF) is consumed.
+	for {
+		if im, consumed = im(p); consumed {
+			break
+		}
+	}
 	return p.doc, nil
 }
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
index 7fa4f42767..839a034b7d 100644
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) {
 	if n == nil || len(n.Child) == 0 {
 		return "", nil
 	}
-	if len(n.Child) > 1 {
-		return "too many children", nil
-	}
 	b := bytes.NewBuffer(nil)
-	if err := dumpLevel(b, n.Child[0], 0); err != nil {
-		return "", err
+	for _, child := range n.Child {
+		if err := dumpLevel(b, child, 0); err != nil {
+			return "", err
+		}
 	}
 	return b.String(), nil
 }
@@ -124,8 +123,8 @@ func TestParser(t *testing.T) {
 	for _, filename := range filenames {
 		rc := make(chan io.Reader)
 		go readDat(filename, rc)
-		// TODO(nigeltao): Process all test cases, not just the first three.
-		for i := 0; i < 3; i++ {
+		// TODO(nigeltao): Process all test cases, not just a subset.
+		for i := 0; i < 19; i++ {
 			// Parse the #data section.
 			doc, err := Parse(<-rc)
 			if err != nil {
author	Nigel Tao <nigeltao@golang.org>	2010-12-08 08:59:20 +1100
committer	Nigel Tao <nigeltao@golang.org>	2010-12-08 08:59:20 +1100
commit	49014c5b12bd00817203c5ae5dbd4ec1c8f6d157 (patch)
tree	773b51c86dd6b0248cd1982d61e6272e5da32b41
parent	8d50557979b30c277c7c846bb61f6d1a0466db4a (diff)
download	go-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.tar.gz go-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.zip