aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrad Fitzpatrick <bradfitz@golang.org>2011-06-06 15:56:15 -0700
committerBrad Fitzpatrick <bradfitz@golang.org>2011-06-06 15:56:15 -0700
commit5e03143c1aff5e62a8181f73fc52e5b7684d76de (patch)
tree0347e9cb5cc29cf0f22eb6c87abde5f77af626bd
parent9e857dbdcc66b8fa2da927e07531c6b8dc894cae (diff)
downloadgo-5e03143c1aff5e62a8181f73fc52e5b7684d76de.tar.gz
go-5e03143c1aff5e62a8181f73fc52e5b7684d76de.zip
html: improve attribute parsing, note package status
Fixes #1890 R=nigeltao CC=golang-dev https://golang.org/cl/4528102
-rw-r--r--src/pkg/html/doc.go1
-rw-r--r--src/pkg/html/token.go29
-rw-r--r--src/pkg/html/token_test.go10
3 files changed, 39 insertions, 1 deletions
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index 55135c3d05..5bc0630861 100644
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -4,6 +4,7 @@
/*
Package html implements an HTML5-compliant tokenizer and parser.
+INCOMPLETE.
Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
caller's responsibility to ensure that r provides UTF-8 encoded HTML.
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 6d8eb604ef..23c95ece6f 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -355,6 +355,33 @@ loop:
return z.buf[i0:i], z.trim(i)
}
+// attrName finds the largest attribute name at the start
+// of z.buf[i:] and returns it lower-cased, as well
+// as the trimmed cursor location after that word.
+//
+// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
+// TODO: unicode characters
+func (z *Tokenizer) attrName(i int) ([]byte, int) {
+ i0 := i
+loop:
+ for ; i < z.p1; i++ {
+ c := z.buf[i]
+ switch c {
+ case '<', '>', '"', '\'', '/', '=':
+ break loop
+ }
+ switch {
+ case 'A' <= c && c <= 'Z':
+ z.buf[i] = c + 'a' - 'A'
+ case c > ' ' && c < 0x7f:
+ // No-op.
+ default:
+ break loop
+ }
+ }
+ return z.buf[i0:i], z.trim(i)
+}
+
// Text returns the unescaped text of a TextToken or a CommentToken.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
@@ -399,7 +426,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
- key, i := z.word(z.p0, true)
+ key, i := z.attrName(z.p0)
// Check for an empty attribute value.
if i == z.p1 {
z.p0 = i
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 6291af6005..c17b436aab 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -126,6 +126,11 @@ var tokenTests = []tokenTest{
`<input value="yes" foo="BAR">`,
},
{
+ "Unquoted attribute value, spaces",
+ `<input value = yes FOO = BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
"Unquoted attribute value, trailing space",
`<input value=yes FOO=BAR >`,
`<input value="yes" foo="BAR">`,
@@ -145,6 +150,11 @@ var tokenTests = []tokenTest{
`<input value="I'm an attribute" FOO="BAR">`,
`<input value="I&apos;m an attribute" foo="BAR">`,
},
+ {
+ "Attribute name characters",
+ `<meta http-equiv="content-type">`,
+ `<meta http-equiv="content-type">`,
+ },
}
func TestTokenizer(t *testing.T) {