aboutsummaryrefslogtreecommitdiff
path: root/normalize/normalize.go
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2022-01-15 10:09:56 -0700
committerJordan <me@jordan.im>2022-01-15 10:09:56 -0700
commit4a92c26fbc739734ce5227c1be8efd440306cebf (patch)
tree3a009e9ae3543725a597b77a838d0b41690b2e0e /normalize/normalize.go
parent995a8fad1461734108b5266f51a3c913936c7203 (diff)
downloadkeep-4a92c26fbc739734ce5227c1be8efd440306cebf.tar.gz
keep-4a92c26fbc739734ce5227c1be8efd440306cebf.zip
keep, archive: normalize URLs (RFC 3986), improve logging
Diffstat (limited to 'normalize/normalize.go')
-rw-r--r--normalize/normalize.go379
1 files changed, 379 insertions, 0 deletions
diff --git a/normalize/normalize.go b/normalize/normalize.go
new file mode 100644
index 0000000..0220f65
--- /dev/null
+++ b/normalize/normalize.go
@@ -0,0 +1,379 @@
+/*
+Package normalize offers URL normalization as described on the wikipedia page:
+http://en.wikipedia.org/wiki/URL_normalization
+*/
+package normalize
+
+import (
+ "bytes"
+ "fmt"
+ "net/url"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+
+ "golang.org/x/net/idna"
+ "golang.org/x/text/unicode/norm"
+ "golang.org/x/text/width"
+ "keep/urlesc"
+)
+
+// A set of normalization flags determines how a URL will
+// be normalized.
+type NormalizationFlags uint
+
+const (
+ // Safe normalizations
+ FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
+ FlagLowercaseHost // http://HOST -> http://host
+ FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF
+ FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA
+ FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$
+ FlagRemoveDefaultPort // http://host:80 -> http://host
+ FlagRemoveEmptyQuerySeparator // http://host/path? -> http://host/path
+
+ // Usually safe normalizations
+ FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
+ FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
+ FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c
+
+ // Unsafe normalizations
+ FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/
+ FlagRemoveFragment // http://host/path#fragment -> http://host/path
+ FlagForceHTTP // https://host -> http://host
+ FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
+ FlagRemoveWWW // http://www.host/ -> http://host/
+ FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
+ FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3
+
+ // Normalizations not in the wikipedia article, required to cover tests cases
+ // submitted by jehiah
+ FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147
+ FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147
+ FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147
+ FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
+ FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path
+
+ // Convenience set of safe normalizations
+ FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator
+
+ // For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
+ // while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".
+
+ // Convenience set of usually safe normalizations (includes FlagsSafe)
+ FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
+ FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments
+
+ // Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
+ FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
+ FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery
+
+ // Convenience set of all available flags
+ FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
+ FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
+)
+
+const (
+ defaultHttpPort = ":80"
+ defaultHttpsPort = ":443"
+)
+
+// Regular expressions used by the normalizations
+var rxPort = regexp.MustCompile(`(:\d+)/?$`)
+var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)
+var rxDupSlashes = regexp.MustCompile(`/{2,}`)
+var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)
+var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)
+var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)
+var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)
+var rxEmptyPort = regexp.MustCompile(`:+$`)
+
+// Map of flags to implementation function.
+// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
+// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
+
+// Since maps have undefined traversing order, make a slice of ordered keys
+var flagsOrder = []NormalizationFlags{
+ FlagLowercaseScheme,
+ FlagLowercaseHost,
+ FlagRemoveDefaultPort,
+ FlagRemoveDirectoryIndex,
+ FlagRemoveDotSegments,
+ FlagRemoveFragment,
+ FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
+ FlagRemoveDuplicateSlashes,
+ FlagRemoveWWW,
+ FlagAddWWW,
+ FlagSortQuery,
+ FlagDecodeDWORDHost,
+ FlagDecodeOctalHost,
+ FlagDecodeHexHost,
+ FlagRemoveUnnecessaryHostDots,
+ FlagRemoveEmptyPortSeparator,
+ FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
+ FlagAddTrailingSlash,
+}
+
+// ... and then the map, where order is unimportant
+var flags = map[NormalizationFlags]func(*url.URL){
+ FlagLowercaseScheme: lowercaseScheme,
+ FlagLowercaseHost: lowercaseHost,
+ FlagRemoveDefaultPort: removeDefaultPort,
+ FlagRemoveDirectoryIndex: removeDirectoryIndex,
+ FlagRemoveDotSegments: removeDotSegments,
+ FlagRemoveFragment: removeFragment,
+ FlagForceHTTP: forceHTTP,
+ FlagRemoveDuplicateSlashes: removeDuplicateSlashes,
+ FlagRemoveWWW: removeWWW,
+ FlagAddWWW: addWWW,
+ FlagSortQuery: sortQuery,
+ FlagDecodeDWORDHost: decodeDWORDHost,
+ FlagDecodeOctalHost: decodeOctalHost,
+ FlagDecodeHexHost: decodeHexHost,
+ FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
+ FlagRemoveEmptyPortSeparator: removeEmptyPortSeparator,
+ FlagRemoveTrailingSlash: removeTrailingSlash,
+ FlagAddTrailingSlash: addTrailingSlash,
+}
+
+// MustNormalizeURLString returns the normalized string, and panics if an error occurs.
+// It takes an URL string as input, as well as the normalization flags.
+func MustNormalizeURLString(u string, f NormalizationFlags) string {
+ result, e := NormalizeURLString(u, f)
+ if e != nil {
+ panic(e)
+ }
+ return result
+}
+
+// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
+// It takes an URL string as input, as well as the normalization flags.
+func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
+ parsed, err := url.Parse(u)
+ if err != nil {
+ return "", err
+ }
+
+ if f&FlagLowercaseHost == FlagLowercaseHost {
+ parsed.Host = strings.ToLower(parsed.Host)
+ }
+
+ // The idna package doesn't fully conform to RFC 5895
+ // (https://tools.ietf.org/html/rfc5895), so we do it here.
+ // Taken from Go 1.8 cycle source, courtesy of bradfitz.
+ // TODO: Remove when (if?) idna package conforms to RFC 5895.
+ parsed.Host = width.Fold.String(parsed.Host)
+ parsed.Host = norm.NFC.String(parsed.Host)
+ if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
+ return "", err
+ }
+
+ return NormalizeURL(parsed, f), nil
+}
+
+// NormalizeURL returns the normalized string.
+// It takes a parsed URL object as input, as well as the normalization flags.
+func NormalizeURL(u *url.URL, f NormalizationFlags) string {
+ for _, k := range flagsOrder {
+ if f&k == k {
+ flags[k](u)
+ }
+ }
+ return urlesc.Escape(u)
+}
+
+func lowercaseScheme(u *url.URL) {
+ if len(u.Scheme) > 0 {
+ u.Scheme = strings.ToLower(u.Scheme)
+ }
+}
+
+func lowercaseHost(u *url.URL) {
+ if len(u.Host) > 0 {
+ u.Host = strings.ToLower(u.Host)
+ }
+}
+
+func removeDefaultPort(u *url.URL) {
+ if len(u.Host) > 0 {
+ scheme := strings.ToLower(u.Scheme)
+ u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
+ if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
+ return ""
+ }
+ return val
+ })
+ }
+}
+
+func removeTrailingSlash(u *url.URL) {
+ if l := len(u.Path); l > 0 {
+ if strings.HasSuffix(u.Path, "/") {
+ u.Path = u.Path[:l-1]
+ }
+ } else if l = len(u.Host); l > 0 {
+ if strings.HasSuffix(u.Host, "/") {
+ u.Host = u.Host[:l-1]
+ }
+ }
+}
+
+func addTrailingSlash(u *url.URL) {
+ if l := len(u.Path); l > 0 {
+ if !strings.HasSuffix(u.Path, "/") {
+ u.Path += "/"
+ }
+ } else if l = len(u.Host); l > 0 {
+ if !strings.HasSuffix(u.Host, "/") {
+ u.Host += "/"
+ }
+ }
+}
+
+func removeDotSegments(u *url.URL) {
+ if len(u.Path) > 0 {
+ var dotFree []string
+ var lastIsDot bool
+
+ sections := strings.Split(u.Path, "/")
+ for _, s := range sections {
+ if s == ".." {
+ if len(dotFree) > 0 {
+ dotFree = dotFree[:len(dotFree)-1]
+ }
+ } else if s != "." {
+ dotFree = append(dotFree, s)
+ }
+ lastIsDot = (s == "." || s == "..")
+ }
+ // Special case if host does not end with / and new path does not begin with /
+ u.Path = strings.Join(dotFree, "/")
+ if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") {
+ u.Path = "/" + u.Path
+ }
+ // Special case if the last segment was a dot, make sure the path ends with a slash
+ if lastIsDot && !strings.HasSuffix(u.Path, "/") {
+ u.Path += "/"
+ }
+ }
+}
+
+func removeDirectoryIndex(u *url.URL) {
+ if len(u.Path) > 0 {
+ u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
+ }
+}
+
+func removeFragment(u *url.URL) {
+ u.Fragment = ""
+}
+
+func forceHTTP(u *url.URL) {
+ if strings.ToLower(u.Scheme) == "https" {
+ u.Scheme = "http"
+ }
+}
+
+func removeDuplicateSlashes(u *url.URL) {
+ if len(u.Path) > 0 {
+ u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
+ }
+}
+
+func removeWWW(u *url.URL) {
+ if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") {
+ u.Host = u.Host[4:]
+ }
+}
+
+func addWWW(u *url.URL) {
+ if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") {
+ u.Host = "www." + u.Host
+ }
+}
+
+func sortQuery(u *url.URL) {
+ q := u.Query()
+
+ if len(q) > 0 {
+ arKeys := make([]string, len(q))
+ i := 0
+ for k := range q {
+ arKeys[i] = k
+ i++
+ }
+ sort.Strings(arKeys)
+ buf := new(bytes.Buffer)
+ for _, k := range arKeys {
+ sort.Strings(q[k])
+ for _, v := range q[k] {
+ if buf.Len() > 0 {
+ buf.WriteRune('&')
+ }
+ buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
+ }
+ }
+
+ // Rebuild the raw query string
+ u.RawQuery = buf.String()
+ }
+}
+
+func decodeDWORDHost(u *url.URL) {
+ if len(u.Host) > 0 {
+ if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
+ var parts [4]int64
+
+ dword, _ := strconv.ParseInt(matches[1], 10, 0)
+ for i, shift := range []uint{24, 16, 8, 0} {
+ parts[i] = dword >> shift & 0xFF
+ }
+ u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
+ }
+ }
+}
+
+func decodeOctalHost(u *url.URL) {
+ if len(u.Host) > 0 {
+ if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
+ var parts [4]int64
+
+ for i := 1; i <= 4; i++ {
+ parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
+ }
+ u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
+ }
+ }
+}
+
+func decodeHexHost(u *url.URL) {
+ if len(u.Host) > 0 {
+ if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
+ // Conversion is safe because of regex validation
+ parsed, _ := strconv.ParseInt(matches[1], 16, 0)
+ // Set host as DWORD (base 10) encoded host
+ u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
+ // The rest is the same as decoding a DWORD host
+ decodeDWORDHost(u)
+ }
+ }
+}
+
+func removeUnncessaryHostDots(u *url.URL) {
+ if len(u.Host) > 0 {
+ if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
+ // Trim the leading and trailing dots
+ u.Host = strings.Trim(matches[1], ".")
+ if len(matches) > 2 {
+ u.Host += matches[2]
+ }
+ }
+ }
+}
+
+func removeEmptyPortSeparator(u *url.URL) {
+ if len(u.Host) > 0 {
+ u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
+ }
+}