aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNigel Tao <nigeltao@golang.org>2011-08-10 13:39:07 +1000
committerNigel Tao <nigeltao@golang.org>2011-08-10 13:39:07 +1000
commit37afff2978f3bb12170076192676e9348a7936fc (patch)
treee46f9fa1585e04ad0c054610987571ab3a47591d
parent1ac7a6970198b2448b1165d5e0d651ced4bf73d7 (diff)
downloadgo-37afff2978f3bb12170076192676e9348a7936fc.tar.gz
go-37afff2978f3bb12170076192676e9348a7936fc.zip
html: parse malformed tags missing a '>', such as `<p id=0</p>`.
The additional token_test.go cases matches html5lib behavior. Fixes #2124. R=gri CC=golang-dev https://golang.org/cl/4844055
-rw-r--r--src/pkg/html/token.go58
-rw-r--r--src/pkg/html/token_test.go17
2 files changed, 52 insertions, 23 deletions
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index d280ff2256..fddc922d60 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() {
if z.err != nil {
return
}
- var tt TokenType
switch {
case c == '/':
- tt = EndTagToken
+ z.tt = EndTagToken
// Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
- tt = StartTagToken
+ z.tt = StartTagToken
case c == '!':
z.nextMarkupDeclaration()
return
@@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() {
return
}
case '>':
- z.tt = tt
- if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+ if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
z.tt = SelfClosingTagToken
}
return
@@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int {
return k
}
-// word finds the largest alphabetic [0-9A-Za-z]* word at the start
-// of z.buf[i:] and returns that word (optionally lower-cased), as
-// well as the trimmed cursor location after that word.
-func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
+// tagName finds the tag name at the start of z.buf[i:] and returns that name
+// lower-cased, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) tagName(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
- switch {
- case '0' <= c && c <= '9':
- // No-op.
- case 'A' <= c && c <= 'Z':
- if lower {
- z.buf[i] = c + 'a' - 'A'
- }
- case 'a' <= c && c <= 'z':
- // No-op.
- default:
+ switch c {
+ case ' ', '\n', '\t', '\f', '/', '>':
break loop
}
+ if 'A' <= c && c <= 'Z' {
+ z.buf[i] = c + 'a' - 'A'
+ }
+ }
+ return z.buf[i0:i], z.trim(i)
+}
+
+// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
+// and returns that value, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
+ i0 := i
+loop:
+ for ; i < z.p1; i++ {
+ switch z.buf[i] {
+ case ' ', '\n', '\t', '\f', '>':
+ break loop
+ case '&':
+ // TODO: unescape the entity.
+ }
}
return z.buf[i0:i], z.trim(i)
}
// attrName finds the largest attribute name at the start
// of z.buf[i:] and returns it lower-cased, as well
-// as the trimmed cursor location after that word.
+// as the trimmed cursor location after that name.
//
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
// TODO: unicode characters
func (z *Tokenizer) attrName(i int) ([]byte, int) {
+ for z.buf[i] == '/' {
+ i++
+ if z.buf[i] == '>' {
+ return nil, z.trim(i)
+ }
+ }
i0 := i
loop:
for ; i < z.p1; i++ {
@@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
if z.buf[i] == '/' {
i++
}
- name, z.p0 = z.word(i, true)
+ name, z.p0 = z.tagName(i)
hasAttr = z.p0 != z.p1
return
}
@@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
}
closeQuote := z.buf[i]
if closeQuote != '\'' && closeQuote != '"' {
- val, z.p0 = z.word(i, false)
+ val, z.p0 = z.unquotedAttrVal(i)
moreAttr = z.p0 != z.p1
return
}
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index c8dcc88648..1330f3247a 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -41,6 +41,22 @@ var tokenTests = []tokenTest{
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
+ // Some malformed tags that are missing a '>'.
+ {
+ "malformed tag #0",
+ `<p</p>`,
+ `<p< p="">`,
+ },
+ {
+ "malformed tag #1",
+ `<p id=0</p>`,
+ `<p id="0&lt;/p">`,
+ },
+ {
+ "malformed tag #2",
+ `<p id="0</p>`,
+ `<p id="0&lt;/p&gt;">`,
+ },
// Comments.
{
"comment0",
@@ -117,7 +133,6 @@ var tokenTests = []tokenTest{
"&frac12;",
"½",
},
-
// Attribute tests:
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
{