aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNigel Tao <nigeltao@golang.org>2010-12-08 08:59:20 +1100
committerNigel Tao <nigeltao@golang.org>2010-12-08 08:59:20 +1100
commit49014c5b12bd00817203c5ae5dbd4ec1c8f6d157 (patch)
tree773b51c86dd6b0248cd1982d61e6272e5da32b41
parent8d50557979b30c277c7c846bb61f6d1a0466db4a (diff)
downloadgo-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.tar.gz
go-49014c5b12bd00817203c5ae5dbd4ec1c8f6d157.zip
html: handle unexpected EOF during parsing.
This lets us parse HTML like "<html>foo". R=gri CC=golang-dev https://golang.org/cl/3460043
-rw-r--r--src/pkg/html/parse.go59
-rw-r--r--src/pkg/html/parse_test.go13
2 files changed, 48 insertions, 24 deletions
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
index d3c1f12135..acc3eccbcc 100644
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -32,11 +32,6 @@ type Node struct {
Attr []Attribute
}
-// An insertion mode (section 10.2.3.1) is the state transition function from
-// a particular state in the HTML5 parser's state machine. In addition to
-// returning the next state, it also returns whether the token was consumed.
-type insertionMode func(*parser) (insertionMode, bool)
-
// A parser implements the HTML5 parsing algorithm:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
type parser struct {
@@ -121,11 +116,12 @@ func (p *parser) read() os.Error {
p.tok.Attr = nil
return nil
}
- if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
- return p.tokenizer.Error()
- }
+ p.tokenizer.Next()
p.tok = p.tokenizer.Token()
- if p.tok.Type == SelfClosingTagToken {
+ switch p.tok.Type {
+ case ErrorToken:
+ return p.tokenizer.Error()
+ case SelfClosingTagToken:
p.hasSelfClosingToken = true
p.tok.Type = StartTagToken
}
@@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() {
p.hasSelfClosingToken = false
}
+// An insertion mode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. It updates the
+// parser's fields depending on parser.token (where ErrorToken means EOF). In
+// addition to returning the next insertionMode state, it also returns whether
+// the token was consumed.
+type insertionMode func(*parser) (insertionMode, bool)
+
// Section 10.2.5.4.
func initialInsertionMode(p *parser) (insertionMode, bool) {
// TODO(nigeltao): check p.tok for DOCTYPE.
@@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
implied bool
)
switch p.tok.Type {
+ case ErrorToken:
+ implied = true
case TextToken:
// TODO(nigeltao): distinguish whitespace text from others.
implied = true
@@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
implied = true
}
case EndTagToken:
- // TODO.
+ switch p.tok.Data {
+ case "head", "body", "html", "br":
+ implied = true
+ default:
+ // Ignore the token.
+ }
}
if add || implied {
p.addChild(&Node{
@@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool
)
switch p.tok.Type {
+ case ErrorToken:
+ implied = true
case TextToken:
// TODO(nigeltao): distinguish whitespace text from others.
implied = true
@@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
add = true
attr = p.tok.Attr
case "html":
- // TODO.
+ return inBodyInsertionMode, false
default:
implied = true
}
case EndTagToken:
- // TODO.
+ switch p.tok.Data {
+ case "head", "body", "html", "br":
+ implied = true
+ default:
+ // Ignore the token.
+ }
}
if add || implied {
p.addChild(&Node{
@@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool
)
switch p.tok.Type {
- case TextToken:
+ case ErrorToken, TextToken:
implied = true
case StartTagToken:
switch p.tok.Data {
@@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool
)
switch p.tok.Type {
- case TextToken:
+ case ErrorToken, TextToken:
implied = true
framesetOK = true
case StartTagToken:
@@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
func inBodyInsertionMode(p *parser) (insertionMode, bool) {
var endP bool
switch p.tok.Type {
+ case ErrorToken:
+ // No-op.
case TextToken:
p.addText(p.tok.Data)
p.framesetOK = false
@@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) {
// Section 10.2.5.22.
func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
switch p.tok.Type {
+ case ErrorToken:
+ // TODO.
case TextToken:
// TODO.
case StartTagToken:
@@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) {
scripting: true,
framesetOK: true,
}
+ // Iterate until EOF. Any other error will cause an early return.
im, consumed := initialInsertionMode, true
for {
if consumed {
@@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) {
}
im, consumed = im(p)
}
- // TODO(nigeltao): clean up, depending on the value of im.
- // The specification's algorithm does clean up on reading an EOF 'token',
- // but in go we represent EOF by an os.Error instead.
+ // Loop until the final token (the ErrorToken signifying EOF) is consumed.
+ for {
+ if im, consumed = im(p); consumed {
+ break
+ }
+ }
return p.doc, nil
}
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
index 7fa4f42767..839a034b7d 100644
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) {
if n == nil || len(n.Child) == 0 {
return "", nil
}
- if len(n.Child) > 1 {
- return "too many children", nil
- }
b := bytes.NewBuffer(nil)
- if err := dumpLevel(b, n.Child[0], 0); err != nil {
- return "", err
+ for _, child := range n.Child {
+ if err := dumpLevel(b, child, 0); err != nil {
+ return "", err
+ }
}
return b.String(), nil
}
@@ -124,8 +123,8 @@ func TestParser(t *testing.T) {
for _, filename := range filenames {
rc := make(chan io.Reader)
go readDat(filename, rc)
- // TODO(nigeltao): Process all test cases, not just the first three.
- for i := 0; i < 3; i++ {
+ // TODO(nigeltao): Process all test cases, not just a subset.
+ for i := 0; i < 19; i++ {
// Parse the #data section.
doc, err := Parse(<-rc)
if err != nil {