author    | Nigel Tao <nigeltao@golang.org> | 2010-08-10 16:08:21 +1000
committer | Nigel Tao <nigeltao@golang.org> | 2010-08-10 16:08:21 +1000
commit    | 56b989f1b9d5290ab38bcdd96be80600ea97b61b (patch)
tree      | 3f8a810ea57f0a3da41ead5ae5b01184867e98dd
parent    | 96d7c8d4a85e7c5bb317b6c737d5b2dbf0e69c25 (diff)
First cut of an HTML tokenizer (and eventually a parser).
R=r, rsc, gri, rsc1
CC=golang-dev
https://golang.org/cl/1814044
-rw-r--r-- | src/pkg/Makefile           |   1
-rw-r--r-- | src/pkg/html/Makefile      |  14
-rw-r--r-- | src/pkg/html/doc.go        |  87
-rw-r--r-- | src/pkg/html/entity.go     |  38
-rw-r--r-- | src/pkg/html/escape.go     |  89
-rw-r--r-- | src/pkg/html/token.go      | 406
-rw-r--r-- | src/pkg/html/token_test.go | 162
7 files changed, 797 insertions, 0 deletions
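
Before the diff itself, a minimal sketch of how the new package is meant to be driven, assembled from the doc.go added below. This is illustrative only: the demo program and its commented output are ours, not part of the change, and it assumes the Go libraries of this era, where errors are os.Error values and z.Error() returns os.EOF at normal end of input.

	// tokendump is a hypothetical demo, not part of this commit. It uses the
	// high-level Token API: Token returns an independent copy whose Data and
	// Attr fields stay valid after later calls to Next.
	package main

	import (
		"bytes"
		"fmt"
		"html" // the package added by this change
		"os"
	)

	func main() {
		z := html.NewTokenizer(bytes.NewBuffer([]byte(`<a HREF="/">a&lt;b</a>`)))
		for {
			if z.Next() == html.Error {
				// os.EOF means the input was tokenized to completion.
				if err := z.Error(); err != os.EOF {
					fmt.Println("error:", err)
				}
				return
			}
			// Should print <a href="/">, then a&lt;b, then </a>: the tag
			// name and attribute key are lower-cased, and String re-escapes
			// the unescaped Data.
			fmt.Println(z.Token().String())
		}
	}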
diff --git a/src/pkg/Makefile b/src/pkg/Makefile
index c410697abf..7d135962f1 100644
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@@ -82,6 +82,7 @@ DIRS=\
 	hash/adler32\
 	hash/crc32\
 	hash/crc64\
+	html\
 	http\
 	http/pprof\
 	image\
diff --git a/src/pkg/html/Makefile b/src/pkg/html/Makefile
new file mode 100644
index 0000000000..63000e01b4
--- /dev/null
+++ b/src/pkg/html/Makefile
@@ -0,0 +1,14 @@
+# Copyright 2010 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include ../../Make.$(GOARCH)
+
+TARG=html
+GOFILES=\
+	doc.go\
+	entity.go\
+	escape.go\
+	token.go\
+
+include ../../Make.pkg
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
new file mode 100644
index 0000000000..9f5d478b42
--- /dev/null
+++ b/src/pkg/html/doc.go
@@ -0,0 +1,87 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+The html package implements an HTML5-compliant tokenizer and parser.
+
+Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
+caller's responsibility to ensure that r provides UTF-8 encoded HTML.
+
+	z := html.NewTokenizer(r)
+
+Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
+which parses the next token and returns its type, or an error:
+
+	for {
+		tt := z.Next()
+		if tt == html.Error {
+			// ...
+			return ...
+		}
+		// Process the current token.
+	}
+
+There are two APIs for retrieving the current token. The high-level API is to
+call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
+allow optionally calling Raw after Next but before Token, Text, TagName, or
+TagAttr. In EBNF notation, the valid call sequence per token is:
+
+	Next {Raw} [ Token | Text | TagName {TagAttr} ]
+
+Token returns an independent data structure that completely describes a token.
+Entities (such as "&lt;") are unescaped, tag names and attribute keys are
+lower-cased, and attributes are collected into a []Attribute. For example:
+
+	for {
+		if z.Next() == html.Error {
+			// Returning os.EOF indicates success.
+			return z.Error()
+		}
+		emitToken(z.Token())
+	}
+
+The low-level API performs fewer allocations and copies, but the contents of
+the []byte values returned by Text, TagName and TagAttr may change on the next
+call to Next. For example, to extract an HTML page's anchor text:
+
+	depth := 0
+	for {
+		tt := z.Next()
+		switch tt {
+		case Error:
+			return z.Error()
+		case Text:
+			if depth > 0 {
+				// emitBytes should copy the []byte it receives,
+				// if it doesn't process it immediately.
+				emitBytes(z.Text())
+			}
+		case StartTag, EndTag:
+			tn, _ := z.TagName()
+			if len(tn) == 1 && tn[0] == 'a' {
+				if tt == StartTag {
+					depth++
+				} else {
+					depth--
+				}
+			}
+		}
+	}
+
+The relevant specifications include:
+http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
+http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
+*/
+package html
+
+// The tokenization algorithm implemented by this package is not a line-by-line
+// transliteration of the relatively verbose state-machine in the WHATWG
+// specification. A more direct approach is used instead, where the program
+// counter implies the state, such as whether it is tokenizing a tag or a text
+// node. Specification compliance is verified by checking expected and actual
+// outputs over a test suite rather than aiming for algorithmic fidelity.
+
+// TODO(nigeltao): Implement a parser, not just a tokenizer.
+// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
+// TODO(nigeltao): How does parsing interact with a JavaScript engine?
diff --git a/src/pkg/html/entity.go b/src/pkg/html/entity.go
new file mode 100644
index 0000000000..e9f27b9041
--- /dev/null
+++ b/src/pkg/html/entity.go
@@ -0,0 +1,38 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"utf8"
+)
+
+// entity is a map from HTML entity names to their values. The semicolon matters:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
+// lists both "amp" and "amp;" as two separate entries.
+//
+// TODO(nigeltao): Take the complete map from the HTML5 spec section 10.5 "Named character references".
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
+// Note that the HTML5 list is larger than the HTML4 list at
+// http://www.w3.org/TR/html4/sgml/entities.html
+var entity = map[string]int{
+	"aacute":  '\U000000E1',
+	"aacute;": '\U000000E1',
+	"amp;":    '\U00000026',
+	"apos;":   '\U00000027',
+	"gt;":     '\U0000003E',
+	"lt;":     '\U0000003C',
+	"quot;":   '\U00000022',
+}
+
+func init() {
+	// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
+	// The +1 comes from the leading "&". This property implies that the length of
+	// unescaped text is <= the length of escaped text.
+	for k, v := range entity {
+		if 1+len(k) < utf8.RuneLen(v) {
+			panic("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
+		}
+	}
+}
diff --git a/src/pkg/html/escape.go b/src/pkg/html/escape.go
new file mode 100644
index 0000000000..f9fdf8c4d9
--- /dev/null
+++ b/src/pkg/html/escape.go
@@ -0,0 +1,89 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"strings"
+	"utf8"
+)
+
+// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
+// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
+// Precondition: src[0] == '&' && dst <= src.
+func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
+	// TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
+	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
+	// TODO(nigeltao): Handle things like "&#x4e2d;" or "&#20013;".
+
+	// i starts at 1 because we already know that s[0] == '&'.
+	i, s := 1, b[src:]
+	for i < len(s) {
+		c := s[i]
+		i++
+		// Lower-cased characters are more common in entities, so we check for them first.
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+			continue
+		}
+		if c != ';' {
+			i--
+		}
+		x := entity[string(s[1:i])]
+		if x != 0 {
+			return dst + utf8.EncodeRune(x, b[dst:]), src + i
+		}
+		break
+	}
+	dst1, src1 = dst+i, src+i
+	copy(b[dst:dst1], b[src:src1])
+	return dst1, src1
+}
+
+// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
+func unescape(b []byte) []byte {
+	for i, c := range b {
+		if c == '&' {
+			dst, src := unescapeEntity(b, i, i)
+			for src < len(b) {
+				c := b[src]
+				if c == '&' {
+					dst, src = unescapeEntity(b, dst, src)
+				} else {
+					b[dst] = c
+					dst, src = dst+1, src+1
+				}
+			}
+			return b[0:dst]
+		}
+	}
+	return b
+}
+
+// EscapeString escapes special characters like "<" to become "&lt;". It
+// escapes only five such characters: amp, apos, lt, gt and quot.
+// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
+// always true.
+func EscapeString(s string) string {
+	// TODO(nigeltao): Do this much more efficiently.
+	s = strings.Replace(s, `&`, `&amp;`, -1)
+	s = strings.Replace(s, `'`, `&apos;`, -1)
+	s = strings.Replace(s, `<`, `&lt;`, -1)
+	s = strings.Replace(s, `>`, `&gt;`, -1)
+	s = strings.Replace(s, `"`, `&quot;`, -1)
+	return s
+}
+
+// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
+// larger range of entities than EscapeString escapes. For example, "&aacute;"
+// unescapes to "á", as does "&#225;" and "&#xE1;".
+// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
+// always true.
+func UnescapeString(s string) string {
+	for _, c := range s {
+		if c == '&' {
+			return string(unescape([]byte(s)))
+		}
+	}
+	return s
+}
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
new file mode 100644
index 0000000000..0681af44a4
--- /dev/null
+++ b/src/pkg/html/token.go
@@ -0,0 +1,406 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"io"
+	"log"
+	"os"
+	"strconv"
+)
+
+func init() {
+	// TODO(nigeltao): Remove this when ready.
+	log.Stderr("The html package is incomplete; do not use for production software.")
+}
+
+// A TokenType is the type of a Token.
+type TokenType int
+
+const (
+	// Error means that an error occurred during tokenization.
+	Error TokenType = iota
+	// Text means a text node.
+	Text
+	// A StartTag looks like <a>.
+	StartTag
+	// An EndTag looks like </a>.
+	EndTag
+	// A SelfClosingTag tag looks like <br/>.
+	SelfClosingTag
+)
+
+// String returns a string representation of the TokenType.
+func (t TokenType) String() string {
+	switch t {
+	case Error:
+		return "Error"
+	case Text:
+		return "Text"
+	case StartTag:
+		return "StartTag"
+	case EndTag:
+		return "EndTag"
+	case SelfClosingTag:
+		return "SelfClosingTag"
+	}
+	return "Invalid(" + strconv.Itoa(int(t)) + ")"
+}
+
+// An Attribute is an attribute key-value pair. Key is alphabetic (and hence
+// does not contain escapable characters like '&', '<' or '>'), and Val is
+// unescaped (it looks like "a<b" rather than "a&lt;b").
+type Attribute struct {
+	Key, Val string
+}
+
+// A Token consists of a TokenType and some Data (tag name for start and end
+// tags, content for text). A tag Token may also contain a slice of Attributes.
+// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// than "a&lt;b").
+type Token struct {
+	Type TokenType
+	Data string
+	Attr []Attribute
+}
+
+// tagString returns a string representation of a tag Token's Data and Attr.
+func (t Token) tagString() string {
+	// TODO(nigeltao): Don't use string concatenation; it is inefficient.
+	s := string(t.Data)
+	for _, a := range t.Attr {
+		s += ` ` + a.Key + `="` + EscapeString(a.Val) + `"`
+	}
+	return s
+}
+
+// String returns a string representation of the Token.
+func (t Token) String() string {
+	switch t.Type {
+	case Error:
+		return ""
+	case Text:
+		return EscapeString(t.Data)
+	case StartTag:
+		return "<" + t.tagString() + ">"
+	case EndTag:
+		return "</" + t.tagString() + ">"
+	case SelfClosingTag:
+		return "<" + t.tagString() + "/>"
+	}
+	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
+}
+
+// A Tokenizer returns a stream of HTML Tokens.
+type Tokenizer struct {
+	// r is the source of the HTML text.
+	r io.Reader
+	// tt is the TokenType of the most recently read token. If tt == Error
+	// then err is the error associated with trying to read that token.
+	tt  TokenType
+	err os.Error
+	// buf[p0:p1] holds the raw data of the most recent token.
+	// buf[p1:] is buffered input that will yield future tokens.
+	p0, p1 int
+	buf    []byte
+}
+
+// Error returns the error associated with the most recent Error token. This is
+// typically os.EOF, meaning the end of tokenization.
+func (z *Tokenizer) Error() os.Error {
+	if z.tt != Error {
+		return nil
+	}
+	return z.err
+}
+
+// Raw returns the unmodified text of the current token. Calling Next, Token,
+// Text, TagName or TagAttr may change the contents of the returned slice.
+func (z *Tokenizer) Raw() []byte {
+	return z.buf[z.p0:z.p1]
+}
+
+// readByte returns the next byte from the input stream, doing a buffered read
+// from z.r into z.buf if necessary. z.buf[z.p0:z.p1] remains a contiguous byte
+// slice that holds all the bytes read so far for the current token.
+func (z *Tokenizer) readByte() (byte, os.Error) {
+	if z.p1 >= len(z.buf) {
+		// Our buffer is exhausted and we have to read from z.r.
+		// We copy z.buf[z.p0:z.p1] to the beginning of z.buf. If the length
+		// z.p1 - z.p0 is more than half the capacity of z.buf, then we
+		// allocate a new buffer before the copy.
+		c := cap(z.buf)
+		d := z.p1 - z.p0
+		var buf1 []byte
+		if 2*d > c {
+			buf1 = make([]byte, d, 2*c)
+		} else {
+			buf1 = z.buf[0:d]
+		}
+		copy(buf1, z.buf[z.p0:z.p1])
+		z.p0, z.p1, z.buf = 0, d, buf1[0:d]
+		// Now that we have copied the live bytes to the start of the buffer,
+		// we read from z.r into the remainder.
+		n, err := z.r.Read(buf1[d:cap(buf1)])
+		if err != nil {
+			return 0, err
+		}
+		z.buf = buf1[0 : d+n]
+	}
+	x := z.buf[z.p1]
+	z.p1++
+	return x, nil
+}
+
+// readTo keeps reading bytes until x is found.
+func (z *Tokenizer) readTo(x uint8) os.Error {
+	for {
+		c, err := z.readByte()
+		if err != nil {
+			return err
+		}
+		switch c {
+		case x:
+			return nil
+		case '\\':
+			_, err = z.readByte()
+			if err != nil {
+				return err
+			}
+		}
+	}
+	panic("unreachable")
+}
+
+// nextTag returns the next TokenType starting from the tag open state.
+func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
+	c, err := z.readByte()
+	if err != nil {
+		return Error, err
+	}
+	switch {
+	case c == '/':
+		tt = EndTag
+	// Lower-cased characters are more common in tag names, so we check for them first.
+	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
+		tt = StartTag
+	case c == '!':
+		return Error, os.NewError("html: TODO(nigeltao): implement comments")
+	case c == '?':
+		return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+	default:
+		return Error, os.NewError("html: TODO(nigeltao): handle malformed tags")
+	}
+	for {
+		c, err := z.readByte()
+		if err != nil {
+			return Text, err
+		}
+		switch c {
+		case '"':
+			err = z.readTo('"')
+			if err != nil {
+				return Text, err
+			}
+		case '\'':
+			err = z.readTo('\'')
+			if err != nil {
+				return Text, err
+			}
+		case '>':
+			if z.buf[z.p1-2] == '/' && tt == StartTag {
+				return SelfClosingTag, nil
+			}
+			return tt, nil
+		}
+	}
+	panic("unreachable")
+}
+
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	if z.err != nil {
+		z.tt = Error
+		return z.tt
+	}
+	z.p0 = z.p1
+	c, err := z.readByte()
+	if err != nil {
+		z.tt, z.err = Error, err
+		return z.tt
+	}
+	if c == '<' {
+		z.tt, z.err = z.nextTag()
+		return z.tt
+	}
+	for {
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = Error, err
+			if err == os.EOF {
+				z.tt = Text
+			}
+			return z.tt
+		}
+		if c == '<' {
+			z.p1--
+			z.tt = Text
+			return z.tt
+		}
+	}
+	panic("unreachable")
+}
+
+// trim returns the largest j such that z.buf[i:j] contains only white space,
+// or only white space plus the final ">" or "/>" of the raw data.
+func (z *Tokenizer) trim(i int) int {
+	k := z.p1
+	for ; i < k; i++ {
+		switch z.buf[i] {
+		case ' ', '\n', '\t', '\f':
+			continue
+		case '>':
+			if i == k-1 {
+				return k
+			}
+		case '/':
+			if i == k-2 {
+				return k
+			}
+		}
+		return i
+	}
+	return k
+}
+
+// lower finds the largest alphabetic [a-zA-Z]* word at the start of z.buf[i:]
+// and returns that word lower-cased, as well as the trimmed cursor location
+// after that word.
+func (z *Tokenizer) lower(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		c := z.buf[i]
+		// TODO(nigeltao): Check what '0' <= c && c <= '9' should do.
+		switch {
+		case 'A' <= c && c <= 'Z':
+			z.buf[i] = c + 'a' - 'A'
+		case 'a' <= c && c <= 'z':
+			// No-op.
+		default:
+			break loop
+		}
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
+// Text returns the raw data after unescaping.
+// The contents of the returned slice may change on the next call to Next.
+func (z *Tokenizer) Text() []byte {
+	s := unescape(z.Raw())
+	z.p0 = z.p1
+	return s
+}
+
+// TagName returns the lower-cased name of a tag token (the `img` out of
+// `<IMG SRC="foo">`), and whether the tag has attributes.
+// The contents of the returned slice may change on the next call to Next.
+func (z *Tokenizer) TagName() (name []byte, remaining bool) {
+	i := z.p0 + 1
+	if i >= z.p1 {
+		z.p0 = z.p1
+		return nil, false
+	}
+	if z.buf[i] == '/' {
+		i++
+	}
+	name, z.p0 = z.lower(i)
+	remaining = z.p0 != z.p1
+	return
+}
+
+// TagAttr returns the lower-cased key and unescaped value of the next unparsed
+// attribute for the current tag token, and whether there are more attributes.
+// The contents of the returned slices may change on the next call to Next.
+func (z *Tokenizer) TagAttr() (key, val []byte, remaining bool) {
+	key, i := z.lower(z.p0)
+	// Get past the "=\"".
+	if i == z.p1 || z.buf[i] != '=' {
+		return
+	}
+	i = z.trim(i + 1)
+	if i == z.p1 || z.buf[i] != '"' {
+		return
+	}
+	i = z.trim(i + 1)
+	// Copy and unescape everything up to the closing '"'.
+	dst, src := i, i
+loop:
+	for src < z.p1 {
+		c := z.buf[src]
+		switch c {
+		case '"':
+			src++
+			break loop
+		case '&':
+			dst, src = unescapeEntity(z.buf, dst, src)
+		case '\\':
+			if src == z.p1 {
+				z.buf[dst] = '\\'
+				dst++
+			} else {
+				z.buf[dst] = z.buf[src+1]
+				dst, src = dst+1, src+2
+			}
+		default:
+			z.buf[dst] = c
+			dst, src = dst+1, src+1
+		}
+	}
+	val, z.p0 = z.buf[i:dst], z.trim(src)
+	remaining = z.p0 != z.p1
+	return
+}
+
+// Token returns the next Token. The result's Data and Attr values remain valid
+// after subsequent Next calls.
+func (z *Tokenizer) Token() Token {
+	t := Token{Type: z.tt}
+	switch z.tt {
+	case Text:
+		t.Data = string(z.Text())
+	case StartTag, EndTag, SelfClosingTag:
+		var (
+			attr []Attribute
+			a    int
+		)
+		name, remaining := z.TagName()
+		for remaining {
+			var key, val []byte
+			key, val, remaining = z.TagAttr()
+			if a == len(attr) {
+				// Grow the attr slice.
+				n := 4 + 2*a
+				attr1 := make([]Attribute, n, n)
+				copy(attr1, attr)
+				attr = attr1
+			}
+			attr[a] = Attribute{string(key), string(val)}
+			a++
+		}
+		t.Data = string(name)
+		t.Attr = attr[0:a]
+	}
+	return t
+}
+
+// NewTokenizer returns a new HTML Tokenizer for the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func NewTokenizer(r io.Reader) *Tokenizer {
+	return &Tokenizer{
+		r:   r,
+		buf: make([]byte, 0, 4096),
+	}
+}
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
new file mode 100644
index 0000000000..0ab2aac248
--- /dev/null
+++ b/src/pkg/html/token_test.go
@@ -0,0 +1,162 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bytes"
+	"os"
+	"testing"
+)
+
+type tokenTest struct {
+	// A short description of the test case.
+	desc string
+	// The HTML to parse.
+	html string
+	// The string representations of the expected tokens.
+	tokens []string
+}
+
+var tokenTests = []tokenTest{
+	// A single text node. The tokenizer should not break text nodes on whitespace,
+	// nor should it normalize whitespace within a text node.
+	tokenTest{
+		"text",
+		"foo  bar",
+		[]string{
+			"foo  bar",
+		},
+	},
+	// An entity.
+	tokenTest{
+		"entity",
+		"one &lt; two",
+		[]string{
+			"one &lt; two",
+		},
+	},
+	// A start, self-closing and end tag. The tokenizer does not care if the start
+	// and end tokens don't match; that is the job of the parser.
+	tokenTest{
+		"tags",
+		"<a>b<c/>d</e>",
+		[]string{
+			"<a>",
+			"b",
+			"<c/>",
+			"d",
+			"</e>",
+		},
+	},
+	// An attribute with a backslash.
+	tokenTest{
+		"backslash",
+		`<p id="a\"b">`,
+		[]string{
+			`<p id="a&quot;b">`,
+		},
+	},
+	// Entities, tag name and attribute key lower-casing, and whitespace
+	// normalization within a tag.
+	tokenTest{
+		"tricky",
+		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
+		[]string{
+			`<p id="a&quot;B" foo="bar">`,
+			"<em>",
+			"te&lt;&amp;;xt",
+			"</em>",
+			"</p>",
+		},
+	},
+	// A non-existant entity. Tokenizing and converting back to a string should
+	// escape the "&" to become "&amp;".
+	tokenTest{
+		"noSuchEntity",
+		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
+		[]string{
+			`<a b="c&amp;noSuchEntity;d">`,
+			"&lt;&amp;alsoDoesntExist;&amp;",
+		},
+	},
+}
+
+func TestTokenizer(t *testing.T) {
+loop:
+	for _, tt := range tokenTests {
+		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+		for i, s := range tt.tokens {
+			if z.Next() == Error {
+				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+				continue loop
+			}
+			actual := z.Token().String()
+			if s != actual {
+				t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+				continue loop
+			}
+		}
+		z.Next()
+		if z.Error() != os.EOF {
+			t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
+		}
+	}
+}
+
+func TestUnescapeEscape(t *testing.T) {
+	ss := []string{
+		``,
+		`abc def`,
+		`a & b`,
+		`a&amp;b`,
+		`a &amp; b`,
+		`&quot;`,
+		`"`,
+		`"<&>"`,
+		`&quot;&lt;&amp;&gt;&quot;`,
+		`3&5==1 && 0<1, "0<1", a+acute=&aacute;`,
+	}
+	for _, s := range ss {
+		if s != UnescapeString(EscapeString(s)) {
+			t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
+		}
+	}
+}
+
+func TestBufAPI(t *testing.T) {
+	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
+	z := NewTokenizer(bytes.NewBuffer([]byte(s)))
+	result := bytes.NewBuffer(nil)
+	depth := 0
+loop:
+	for {
+		tt := z.Next()
+		switch tt {
+		case Error:
+			if z.Error() != os.EOF {
+				t.Error(z.Error())
+			}
+			break loop
+		case Text:
+			if depth > 0 {
+				result.Write(z.Text())
+			}
+		case StartTag, EndTag:
+			tn, _ := z.TagName()
+			if len(tn) == 1 && tn[0] == 'a' {
+				if tt == StartTag {
+					depth++
+				} else {
+					depth--
+				}
+			}
+		}
+	}
+	u := "14567"
+	v := string(result.Bytes())
+	if u != v {
+		t.Errorf("TestBufAPI: want %q got %q", u, v)
+	}
+}
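
Two properties of the change are worth spelling out for reviewers. First, the asymmetry documented in escape.go: UnescapeString(EscapeString(s)) == s always holds, but the converse does not, because UnescapeString recognizes entities (such as "&aacute;") that EscapeString never emits. A hypothetical extra test in the style of token_test.go above (the test name and strings are ours, not part of the change):

	func TestRoundTripAsymmetry(t *testing.T) {
		// The documented direction always holds.
		s := `3&5==1 && 0<1`
		if got := UnescapeString(EscapeString(s)); got != s {
			t.Errorf("round trip failed: %q", got)
		}
		// The converse fails: "&aacute;" unescapes to "á", and EscapeString
		// leaves "á" alone (it escapes only &, ', <, > and "), so the
		// original entity spelling cannot be recovered.
		e := `&aacute;`
		if got := EscapeString(UnescapeString(e)); got == e {
			t.Errorf("unexpectedly recovered %q", e)
		}
	}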
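
Second, the init check in entity.go is what licenses the in-place rewriting done by unescape and TagAttr: every entry's UTF-8 value is no longer than the "&name" text it replaces, so the write cursor dst can never overtake the read cursor src in unescapeEntity. A sketch of that invariant as a hypothetical test (our name; it reuses the era's "utf8" import path that entity.go itself uses, which token_test.go would additionally need to import):

	func TestEntityLengths(t *testing.T) {
		for k, v := range entity {
			escaped := 1 + len(k) // the leading '&' plus the name (and any ';')
			if unescaped := utf8.RuneLen(v); unescaped > escaped {
				t.Errorf("entity &%s: %d byte rune does not fit in %d escaped bytes",
					k, unescaped, escaped)
			}
		}
	}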