aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Balholm <andybalholm@gmail.com>2011-12-01 12:47:57 +1100
committerNigel Tao <nigeltao@golang.org>2011-12-01 12:47:57 +1100
commitce27b00f48bf3b90445bb4bcd28f6115c129d75b (patch)
treeadbe15432fc6becd8074fb0939678b3dbebe24da
parent595efd0d205b2a1fe143440088f8f394b09c3b8c (diff)
downloadgo-ce27b00f48bf3b90445bb4bcd28f6115c129d75b.tar.gz
go-ce27b00f48bf3b90445bb4bcd28f6115c129d75b.zip
html: implement fragment parsing algorithm
Pass the tests in tests4.dat. R=nigeltao CC=golang-dev https://golang.org/cl/5447055
-rw-r--r--src/pkg/html/parse.go95
-rw-r--r--src/pkg/html/parse_test.go62
2 files changed, 127 insertions, 30 deletions
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
index 45dc19150c..97fbc514d8 100644
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -39,6 +39,9 @@ type parser struct {
fosterParenting bool
// quirks is whether the parser is operating in "quirks mode."
quirks bool
+ // context is the context element when parsing an HTML fragment
+ // (section 11.4).
+ context *Node
}
func (p *parser) top() *Node {
@@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() {
func (p *parser) resetInsertionMode() {
for i := len(p.oe) - 1; i >= 0; i-- {
n := p.oe[i]
- if i == 0 {
- // TODO: set n to the context element, for HTML fragment parsing.
+ if i == 0 && p.context != nil {
+ n = p.context
}
+
switch n.Data {
case "select":
p.im = inSelectIM
@@ -1516,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool {
return true
}
-// Parse returns the parse tree for the HTML from the given Reader.
-// The input is assumed to be UTF-8 encoded.
-func Parse(r io.Reader) (*Node, error) {
- p := &parser{
- tokenizer: NewTokenizer(r),
- doc: &Node{
- Type: DocumentNode,
- },
- scripting: true,
- framesetOK: true,
- im: initialIM,
- }
+func (p *parser) parse() error {
// Iterate until EOF. Any other error will cause an early return.
consumed := true
for {
@@ -1536,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) {
if err == io.EOF {
break
}
- return nil, err
+ return err
}
}
consumed = p.im(p)
@@ -1547,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) {
break
}
}
+ return nil
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, error) {
+ p := &parser{
+ tokenizer: NewTokenizer(r),
+ doc: &Node{
+ Type: DocumentNode,
+ },
+ scripting: true,
+ framesetOK: true,
+ im: initialIM,
+ }
+ err := p.parse()
+ if err != nil {
+ return nil, err
+ }
return p.doc, nil
}
+
+// ParseFragment parses a fragment of HTML and returns the nodes that were
+// found. If the fragment is the InnerHTML for an existing element, pass that
+// element in context.
+func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
+ p := &parser{
+ tokenizer: NewTokenizer(r),
+ doc: &Node{
+ Type: DocumentNode,
+ },
+ scripting: true,
+ context: context,
+ }
+
+ if context != nil {
+ switch context.Data {
+ case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
+ p.tokenizer.rawTag = context.Data
+ }
+ }
+
+ root := &Node{
+ Type: ElementNode,
+ Data: "html",
+ }
+ p.doc.Add(root)
+ p.oe = nodeStack{root}
+ p.resetInsertionMode()
+
+ for n := context; n != nil; n = n.Parent {
+ if n.Type == ElementNode && n.Data == "form" {
+ p.form = n
+ break
+ }
+ }
+
+ err := p.parse()
+ if err != nil {
+ return nil, err
+ }
+
+ parent := p.doc
+ if context != nil {
+ parent = root
+ }
+
+ result := parent.Child
+ parent.Child = nil
+ for _, n := range result {
+ n.Parent = nil
+ }
+ return result, nil
+}
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
index ea72557a0b..e0c19cff6d 100644
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@@ -16,21 +16,21 @@ import (
)
// readParseTest reads a single test case from r.
-func readParseTest(r *bufio.Reader) (text, want string, err error) {
+func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
line, err := r.ReadSlice('\n')
if err != nil {
- return "", "", err
+ return "", "", "", err
}
var b []byte
// Read the HTML.
if string(line) != "#data\n" {
- return "", "", fmt.Errorf(`got %q want "#data\n"`, line)
+ return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
- return "", "", err
+ return "", "", "", err
}
if line[0] == '#' {
break
@@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) {
// Skip the error list.
if string(line) != "#errors\n" {
- return "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
+ return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
- return "", "", err
+ return "", "", "", err
}
if line[0] == '#' {
break
}
}
+ if string(line) == "#document-fragment\n" {
+ line, err = r.ReadSlice('\n')
+ if err != nil {
+ return "", "", "", err
+ }
+ context = strings.TrimSpace(string(line))
+ line, err = r.ReadSlice('\n')
+ if err != nil {
+ return "", "", "", err
+ }
+ }
+
// Read the dump of what the parse tree should be.
if string(line) != "#document\n" {
- return "", "", fmt.Errorf(`got %q want "#document\n"`, line)
+ return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil && err != io.EOF {
- return "", "", err
+ return "", "", "", err
}
if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
break
}
b = append(b, line...)
}
- return text, string(b), nil
+ return text, string(b), context, nil
}
func dumpIndent(w io.Writer, level int) {
@@ -153,7 +165,7 @@ func TestParser(t *testing.T) {
{"tests1.dat", -1},
{"tests2.dat", -1},
{"tests3.dat", -1},
- // tests4.dat is fragment cases.
+ {"tests4.dat", -1},
{"tests5.dat", -1},
}
for _, tf := range testFiles {
@@ -164,17 +176,37 @@ func TestParser(t *testing.T) {
defer f.Close()
r := bufio.NewReader(f)
for i := 0; i != tf.n; i++ {
- text, want, err := readParseTest(r)
+ text, want, context, err := readParseTest(r)
if err == io.EOF && tf.n == -1 {
break
}
if err != nil {
t.Fatal(err)
}
- doc, err := Parse(strings.NewReader(text))
- if err != nil {
- t.Fatal(err)
+
+ var doc *Node
+ if context == "" {
+ doc, err = Parse(strings.NewReader(text))
+ if err != nil {
+ t.Fatal(err)
+ }
+ } else {
+ contextNode := &Node{
+ Type: ElementNode,
+ Data: context,
+ }
+ nodes, err := ParseFragment(strings.NewReader(text), contextNode)
+ if err != nil {
+ t.Fatal(err)
+ }
+ doc = &Node{
+ Type: DocumentNode,
+ }
+ for _, n := range nodes {
+ doc.Add(n)
+ }
}
+
got, err := dump(doc)
if err != nil {
t.Fatal(err)
@@ -184,7 +216,7 @@ func TestParser(t *testing.T) {
t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
continue
}
- if renderTestBlacklist[text] {
+ if renderTestBlacklist[text] || context != "" {
continue
}
// Check that rendering and re-parsing results in an identical tree.