diff options
author | Brad Fitzpatrick <bradfitz@golang.org> | 2011-06-06 15:56:15 -0700 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2011-06-06 15:56:15 -0700 |
commit | 5e03143c1aff5e62a8181f73fc52e5b7684d76de (patch) | |
tree | 0347e9cb5cc29cf0f22eb6c87abde5f77af626bd | |
parent | 9e857dbdcc66b8fa2da927e07531c6b8dc894cae (diff) | |
download | go-5e03143c1aff5e62a8181f73fc52e5b7684d76de.tar.gz go-5e03143c1aff5e62a8181f73fc52e5b7684d76de.zip |
html: improve attribute parsing, note package status
Fixes #1890
R=nigeltao
CC=golang-dev
https://golang.org/cl/4528102
-rw-r--r-- | src/pkg/html/doc.go | 1 | ||||
-rw-r--r-- | src/pkg/html/token.go | 29 | ||||
-rw-r--r-- | src/pkg/html/token_test.go | 10 |
3 files changed, 39 insertions, 1 deletions
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go index 55135c3d05..5bc0630861 100644 --- a/src/pkg/html/doc.go +++ b/src/pkg/html/doc.go @@ -4,6 +4,7 @@ /* Package html implements an HTML5-compliant tokenizer and parser. +INCOMPLETE. Tokenization is done by creating a Tokenizer for an io.Reader r. It is the caller's responsibility to ensure that r provides UTF-8 encoded HTML. diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index 6d8eb604ef..23c95ece6f 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -355,6 +355,33 @@ loop: return z.buf[i0:i], z.trim(i) } +// attrName finds the largest attribute name at the start +// of z.buf[i:] and returns it lower-cased, as well +// as the trimmed cursor location after that word. +// +// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name +// TODO: unicode characters +func (z *Tokenizer) attrName(i int) ([]byte, int) { + i0 := i +loop: + for ; i < z.p1; i++ { + c := z.buf[i] + switch c { + case '<', '>', '"', '\'', '/', '=': + break loop + } + switch { + case 'A' <= c && c <= 'Z': + z.buf[i] = c + 'a' - 'A' + case c > ' ' && c < 0x7f: + // No-op. + default: + break loop + } + } + return z.buf[i0:i], z.trim(i) +} + // Text returns the unescaped text of a TextToken or a CommentToken. // The contents of the returned slice may change on the next call to Next. func (z *Tokenizer) Text() []byte { @@ -399,7 +426,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { // attribute for the current tag token and whether there are more attributes. // The contents of the returned slices may change on the next call to Next. func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { - key, i := z.word(z.p0, true) + key, i := z.attrName(z.p0) // Check for an empty attribute value. if i == z.p1 { z.p0 = i diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index 6291af6005..c17b436aab 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -126,6 +126,11 @@ var tokenTests = []tokenTest{ `<input value="yes" foo="BAR">`, }, { + "Unquoted attribute value, spaces", + `<input value = yes FOO = BAR>`, + `<input value="yes" foo="BAR">`, + }, + { "Unquoted attribute value, trailing space", `<input value=yes FOO=BAR >`, `<input value="yes" foo="BAR">`, @@ -145,6 +150,11 @@ var tokenTests = []tokenTest{ `<input value="I'm an attribute" FOO="BAR">`, `<input value="I'm an attribute" foo="BAR">`, }, + { + "Attribute name characters", + `<meta http-equiv="content-type">`, + `<meta http-equiv="content-type">`, + }, } func TestTokenizer(t *testing.T) { |