diff options
author | Andrew Balholm <andybalholm@gmail.com> | 2011-10-23 18:36:01 +1100 |
---|---|---|
committer | Nigel Tao <nigeltao@golang.org> | 2011-10-23 18:36:01 +1100 |
commit | 2aa589c8438debaef249e7fbcd9dd3fa0546c9c8 (patch) | |
tree | 75cd9555b2200bd70abf1158982842dd1dc9ef7f | |
parent | 2f352ae48abf1a714f7b3bfb097fab6451067599 (diff) | |
download | go-2aa589c8438debaef249e7fbcd9dd3fa0546c9c8.tar.gz go-2aa589c8438debaef249e7fbcd9dd3fa0546c9c8.zip |
html: implement foster parenting
Implement the foster-parenting algorithm for content that is inside a table
but not in a cell.
Also fix a bug in reconstructing the active formatting elements.
Pass test 30 in tests1.dat:
<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y
R=nigeltao
CC=golang-dev
https://golang.org/cl/5309052
-rw-r--r-- | src/pkg/html/parse.go | 123 | ||||
-rw-r--r-- | src/pkg/html/parse_test.go | 9 |
2 files changed, 103 insertions, 29 deletions
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index 5b25e2620d..2c7294b4f3 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -32,6 +32,9 @@ type parser struct { // originalIM is the insertion mode to go back to after completing a text // or inTableText insertion mode. originalIM insertionMode + // fosterParenting is whether new elements should be inserted according to + // the foster parenting rules (section 11.2.5.3). + fosterParenting bool } func (p *parser) top() *Node { @@ -103,12 +106,56 @@ func (p *parser) elementInScope(stopTags []string, matchTags ...string) bool { // addChild adds a child node n to the top element, and pushes n onto the stack // of open elements if it is an element node. func (p *parser) addChild(n *Node) { - p.top().Add(n) + if p.fosterParenting { + p.fosterParent(n) + } else { + p.top().Add(n) + } + if n.Type == ElementNode { p.oe = append(p.oe, n) } } +// fosterParent adds a child node according to the foster parenting rules. +// Section 11.2.5.3, "foster parenting". +func (p *parser) fosterParent(n *Node) { + var table, parent *Node + var i int + for i = len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].Data == "table" { + table = p.oe[i] + break + } + } + + if table == nil { + // The foster parent is the html element. + parent = p.oe[0] + } else { + parent = table.Parent + } + if parent == nil { + parent = p.oe[i-1] + } + + var child *Node + for i, child = range parent.Child { + if child == table { + break + } + } + + if i == len(parent.Child) { + parent.Add(n) + } else { + // Insert n into parent.Child at index i. + parent.Child = append(parent.Child[:i+1], parent.Child[i:]...) + parent.Child[i] = n + n.Parent = parent + } +} + // addText adds text to the preceding node if it is a text node, or else it // calls addChild with a new text node. func (p *parser) addText(text string) { @@ -170,9 +217,9 @@ func (p *parser) reconstructActiveFormattingElements() { } for { i++ - n = p.afe[i] - p.addChild(n.clone()) - p.afe[i] = n + clone := p.afe[i].clone() + p.addChild(clone) + p.afe[i] = clone if i == len(p.afe)-1 { break } @@ -536,10 +583,7 @@ func inBodyIM(p *parser) (insertionMode, bool) { case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u": p.inBodyEndTagFormatting(p.tok.Data) default: - // TODO: any other end tag - if p.tok.Data == p.top().Data { - p.oe.pop() - } + p.inBodyEndTagOther(p.tok.Data) } case CommentToken: p.addChild(&Node{ @@ -573,6 +617,7 @@ func (p *parser) inBodyEndTagFormatting(tag string) { } } if formattingElement == nil { + p.inBodyEndTagOther(tag) return } feIndex := p.oe.index(formattingElement) @@ -645,8 +690,7 @@ func (p *parser) inBodyEndTagFormatting(tag string) { } switch commonAncestor.Data { case "table", "tbody", "tfoot", "thead", "tr": - // TODO: fix up misnested table nodes; find the foster parent. - fallthrough + p.fosterParent(lastNode) default: commonAncestor.Add(lastNode) } @@ -667,6 +711,19 @@ func (p *parser) inBodyEndTagFormatting(tag string) { } } +// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. +func (p *parser) inBodyEndTagOther(tag string) { + for i := len(p.oe) - 1; i >= 0; i-- { + if p.oe[i].Data == tag { + p.oe = p.oe[:i] + break + } + if isSpecialElement[p.oe[i].Data] { + break + } + } +} + // Section 11.2.5.4.8. func textIM(p *parser) (insertionMode, bool) { switch p.tok.Type { @@ -683,12 +740,6 @@ func textIM(p *parser) (insertionMode, bool) { // Section 11.2.5.4.9. func inTableIM(p *parser) (insertionMode, bool) { - var ( - add bool - data string - attr []Attribute - consumed bool - ) switch p.tok.Type { case ErrorToken: // Stop parsing. @@ -698,13 +749,19 @@ func inTableIM(p *parser) (insertionMode, bool) { case StartTagToken: switch p.tok.Data { case "tbody", "tfoot", "thead": - add = true - data = p.tok.Data - attr = p.tok.Attr - consumed = true + p.clearStackToTableContext() + p.addElement(p.tok.Data, p.tok.Attr) + return inTableBodyIM, true case "td", "th", "tr": - add = true - data = "tbody" + p.clearStackToTableContext() + p.addElement("tbody", nil) + return inTableBodyIM, false + case "table": + if p.popUntil(tableScopeStopTags, "table") { + return p.resetInsertionMode(), false + } + // Ignore the token. + return inTableIM, true default: // TODO. } @@ -727,13 +784,23 @@ func inTableIM(p *parser) (insertionMode, bool) { }) return inTableIM, true } - if add { - // TODO: clear the stack back to a table context. - p.addElement(data, attr) - return inTableBodyIM, consumed + + switch p.top().Data { + case "table", "tbody", "tfoot", "thead", "tr": + p.fosterParenting = true + defer func() { p.fosterParenting = false }() + } + + return useTheRulesFor(p, inTableIM, inBodyIM) +} + +func (p *parser) clearStackToTableContext() { + for i := len(p.oe) - 1; i >= 0; i-- { + if x := p.oe[i].Data; x == "table" || x == "html" { + p.oe = p.oe[:i+1] + return + } } - // TODO: return useTheRulesFor(inTableIM, inBodyIM, p) unless etc. etc. foster parenting. - return inTableIM, true } // Section 11.2.5.4.13. diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go index 2c56ffd6ad..652bf805de 100644 --- a/src/pkg/html/parse_test.go +++ b/src/pkg/html/parse_test.go @@ -123,7 +123,7 @@ func TestParser(t *testing.T) { rc := make(chan io.Reader) go readDat(filename, rc) // TODO(nigeltao): Process all test cases, not just a subset. - for i := 0; i < 30; i++ { + for i := 0; i < 31; i++ { // Parse the #data section. b, err := ioutil.ReadAll(<-rc) if err != nil { @@ -152,6 +152,13 @@ func TestParser(t *testing.T) { continue } // Check that rendering and re-parsing results in an identical tree. + if filename == "tests1.dat" && i == 30 { + // Test 30 in tests1.dat is such messed-up markup that a correct parse + // results in a non-conforming tree (one <a> element nested inside another). + // Therefore when it is rendered and re-parsed, it isn't the same. + // So we skip rendering on that test. + continue + } pr, pw := io.Pipe() go func() { pw.CloseWithError(Render(pw, doc)) |