diff options
author | Kyle Consalus <consalus@gmail.com> | 2010-08-12 09:45:34 +1000 |
---|---|---|
committer | Nigel Tao <nigeltao@golang.org> | 2010-08-12 09:45:34 +1000 |
commit | 8fcdc6a1e2cd91390130f7122be427466db000e9 (patch) | |
tree | 7cb24788a8c196c9321a614a887bafc596eb9d3e | |
parent | bca3151042ab7c81d2edda17749bc2613c84edd0 (diff) | |
download | go-8fcdc6a1e2cd91390130f7122be427466db000e9.tar.gz go-8fcdc6a1e2cd91390130f7122be427466db000e9.zip |
Small performance improvements to the HTML tokenizer based on your 'TODO's.
R=nigeltao_golang
CC=golang-dev
https://golang.org/cl/1941042
-rw-r--r-- | src/pkg/html/escape.go | 42 | ||||
-rw-r--r-- | src/pkg/html/token.go | 16 |
2 files changed, 47 insertions, 11 deletions
diff --git a/src/pkg/html/escape.go b/src/pkg/html/escape.go index f9fdf8c4d9..f30086f367 100644 --- a/src/pkg/html/escape.go +++ b/src/pkg/html/escape.go @@ -5,6 +5,7 @@ package html import ( + "bytes" "strings" "utf8" ) @@ -60,18 +61,45 @@ func unescape(b []byte) []byte { return b } +const escapedChars = `&'<>"` + +func escape(buf *bytes.Buffer, s string) { + i := strings.IndexAny(s, escapedChars) + for i != -1 { + buf.WriteString(s[0:i]) + var esc string + switch s[i] { + case '&': + esc = "&amp;" + case '\'': + esc = "&apos;" + case '<': + esc = "&lt;" + case '>': + esc = "&gt;" + case '"': + esc = "&quot;" + default: + panic("unrecognized escape character") + } + s = s[i+1:] + buf.WriteString(esc) + i = strings.IndexAny(s, escapedChars) + } + buf.WriteString(s) +} + // EscapeString escapes special characters like "<" to become "&lt;". It // escapes only five such characters: amp, apos, lt, gt and quot. // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't // always true. func EscapeString(s string) string { - // TODO(nigeltao): Do this much more efficiently. - s = strings.Replace(s, `&`, `&amp;`, -1) - s = strings.Replace(s, `'`, `&apos;`, -1) - s = strings.Replace(s, `<`, `&lt;`, -1) - s = strings.Replace(s, `>`, `&gt;`, -1) - s = strings.Replace(s, `"`, `&quot;`, -1) - return s + if strings.IndexAny(s, escapedChars) == -1 { + return s + } + buf := bytes.NewBuffer(nil) + escape(buf, s) + return buf.String() } // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index 0681af44a4..39f6700321 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -5,6 +5,7 @@ package html import ( + "bytes" "io" "log" "os" @@ -68,12 +69,19 @@ type Token struct { // tagString returns a string representation of a tag Token's Data and Attr. func (t Token) tagString() string { - // TODO(nigeltao): Don't use string concatenation; it is inefficient.
- s := string(t.Data) + if len(t.Attr) == 0 { + return t.Data + } + buf := bytes.NewBuffer(nil) + buf.WriteString(t.Data) for _, a := range t.Attr { - s += ` ` + a.Key + `="` + EscapeString(a.Val) + `"` + buf.WriteByte(' ') + buf.WriteString(a.Key) + buf.WriteString(`="`) + escape(buf, a.Val) + buf.WriteByte('"') } - return s + return buf.String() } // String returns a string representation of the Token. |