From 4a92c26fbc739734ce5227c1be8efd440306cebf Mon Sep 17 00:00:00 2001 From: Jordan Date: Sat, 15 Jan 2022 10:09:56 -0700 Subject: keep, archive: normalize URLs (RFC 3986), improve logging --- normalize/normalize.go | 379 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 normalize/normalize.go (limited to 'normalize/normalize.go') diff --git a/normalize/normalize.go b/normalize/normalize.go new file mode 100644 index 0000000..0220f65 --- /dev/null +++ b/normalize/normalize.go @@ -0,0 +1,379 @@ +/* +Package normalize offers URL normalization as described on the wikipedia page: +http://en.wikipedia.org/wiki/URL_normalization +*/ +package normalize + +import ( + "bytes" + "fmt" + "net/url" + "regexp" + "sort" + "strconv" + "strings" + + "golang.org/x/net/idna" + "golang.org/x/text/unicode/norm" + "golang.org/x/text/width" + "keep/urlesc" +) + +// A set of normalization flags determines how a URL will +// be normalized. +type NormalizationFlags uint + +const ( + // Safe normalizations + FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1 + FlagLowercaseHost // http://HOST -> http://host + FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF + FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA + FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$ + FlagRemoveDefaultPort // http://host:80 -> http://host + FlagRemoveEmptyQuerySeparator // http://host/path? -> http://host/path + + // Usually safe normalizations + FlagRemoveTrailingSlash // http://host/path/ -> http://host/path + FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags) + FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c + + // Unsafe normalizations + FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/ + FlagRemoveFragment // http://host/path#fragment -> http://host/path + FlagForceHTTP // https://host -> http://host + FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b + FlagRemoveWWW // http://www.host/ -> http://host/ + FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags) + FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3 + + // Normalizations not in the wikipedia article, required to cover tests cases + // submitted by jehiah + FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147 + FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147 + FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147 + FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path + FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path + + // Convenience set of safe normalizations + FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator + + // For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags, + // while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix". + + // Convenience set of usually safe normalizations (includes FlagsSafe) + FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments + FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments + + // Convenience set of unsafe normalizations (includes FlagsUsuallySafe) + FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery + FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery + + // Convenience set of all available flags + FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator + FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator +) + +const ( + defaultHttpPort = ":80" + defaultHttpsPort = ":443" +) + +// Regular expressions used by the normalizations +var rxPort = regexp.MustCompile(`(:\d+)/?$`) +var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`) +var rxDupSlashes = regexp.MustCompile(`/{2,}`) +var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`) +var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`) +var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`) +var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`) +var rxEmptyPort = regexp.MustCompile(`:+$`) + +// Map of flags to implementation function. +// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically +// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator. + +// Since maps have undefined traversing order, make a slice of ordered keys +var flagsOrder = []NormalizationFlags{ + FlagLowercaseScheme, + FlagLowercaseHost, + FlagRemoveDefaultPort, + FlagRemoveDirectoryIndex, + FlagRemoveDotSegments, + FlagRemoveFragment, + FlagForceHTTP, // Must be after remove default port (because https=443/http=80) + FlagRemoveDuplicateSlashes, + FlagRemoveWWW, + FlagAddWWW, + FlagSortQuery, + FlagDecodeDWORDHost, + FlagDecodeOctalHost, + FlagDecodeHexHost, + FlagRemoveUnnecessaryHostDots, + FlagRemoveEmptyPortSeparator, + FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last + FlagAddTrailingSlash, +} + +// ... and then the map, where order is unimportant +var flags = map[NormalizationFlags]func(*url.URL){ + FlagLowercaseScheme: lowercaseScheme, + FlagLowercaseHost: lowercaseHost, + FlagRemoveDefaultPort: removeDefaultPort, + FlagRemoveDirectoryIndex: removeDirectoryIndex, + FlagRemoveDotSegments: removeDotSegments, + FlagRemoveFragment: removeFragment, + FlagForceHTTP: forceHTTP, + FlagRemoveDuplicateSlashes: removeDuplicateSlashes, + FlagRemoveWWW: removeWWW, + FlagAddWWW: addWWW, + FlagSortQuery: sortQuery, + FlagDecodeDWORDHost: decodeDWORDHost, + FlagDecodeOctalHost: decodeOctalHost, + FlagDecodeHexHost: decodeHexHost, + FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots, + FlagRemoveEmptyPortSeparator: removeEmptyPortSeparator, + FlagRemoveTrailingSlash: removeTrailingSlash, + FlagAddTrailingSlash: addTrailingSlash, +} + +// MustNormalizeURLString returns the normalized string, and panics if an error occurs. +// It takes an URL string as input, as well as the normalization flags. +func MustNormalizeURLString(u string, f NormalizationFlags) string { + result, e := NormalizeURLString(u, f) + if e != nil { + panic(e) + } + return result +} + +// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object. +// It takes an URL string as input, as well as the normalization flags. +func NormalizeURLString(u string, f NormalizationFlags) (string, error) { + parsed, err := url.Parse(u) + if err != nil { + return "", err + } + + if f&FlagLowercaseHost == FlagLowercaseHost { + parsed.Host = strings.ToLower(parsed.Host) + } + + // The idna package doesn't fully conform to RFC 5895 + // (https://tools.ietf.org/html/rfc5895), so we do it here. + // Taken from Go 1.8 cycle source, courtesy of bradfitz. + // TODO: Remove when (if?) idna package conforms to RFC 5895. + parsed.Host = width.Fold.String(parsed.Host) + parsed.Host = norm.NFC.String(parsed.Host) + if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil { + return "", err + } + + return NormalizeURL(parsed, f), nil +} + +// NormalizeURL returns the normalized string. +// It takes a parsed URL object as input, as well as the normalization flags. +func NormalizeURL(u *url.URL, f NormalizationFlags) string { + for _, k := range flagsOrder { + if f&k == k { + flags[k](u) + } + } + return urlesc.Escape(u) +} + +func lowercaseScheme(u *url.URL) { + if len(u.Scheme) > 0 { + u.Scheme = strings.ToLower(u.Scheme) + } +} + +func lowercaseHost(u *url.URL) { + if len(u.Host) > 0 { + u.Host = strings.ToLower(u.Host) + } +} + +func removeDefaultPort(u *url.URL) { + if len(u.Host) > 0 { + scheme := strings.ToLower(u.Scheme) + u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string { + if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) { + return "" + } + return val + }) + } +} + +func removeTrailingSlash(u *url.URL) { + if l := len(u.Path); l > 0 { + if strings.HasSuffix(u.Path, "/") { + u.Path = u.Path[:l-1] + } + } else if l = len(u.Host); l > 0 { + if strings.HasSuffix(u.Host, "/") { + u.Host = u.Host[:l-1] + } + } +} + +func addTrailingSlash(u *url.URL) { + if l := len(u.Path); l > 0 { + if !strings.HasSuffix(u.Path, "/") { + u.Path += "/" + } + } else if l = len(u.Host); l > 0 { + if !strings.HasSuffix(u.Host, "/") { + u.Host += "/" + } + } +} + +func removeDotSegments(u *url.URL) { + if len(u.Path) > 0 { + var dotFree []string + var lastIsDot bool + + sections := strings.Split(u.Path, "/") + for _, s := range sections { + if s == ".." { + if len(dotFree) > 0 { + dotFree = dotFree[:len(dotFree)-1] + } + } else if s != "." { + dotFree = append(dotFree, s) + } + lastIsDot = (s == "." || s == "..") + } + // Special case if host does not end with / and new path does not begin with / + u.Path = strings.Join(dotFree, "/") + if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") { + u.Path = "/" + u.Path + } + // Special case if the last segment was a dot, make sure the path ends with a slash + if lastIsDot && !strings.HasSuffix(u.Path, "/") { + u.Path += "/" + } + } +} + +func removeDirectoryIndex(u *url.URL) { + if len(u.Path) > 0 { + u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1") + } +} + +func removeFragment(u *url.URL) { + u.Fragment = "" +} + +func forceHTTP(u *url.URL) { + if strings.ToLower(u.Scheme) == "https" { + u.Scheme = "http" + } +} + +func removeDuplicateSlashes(u *url.URL) { + if len(u.Path) > 0 { + u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/") + } +} + +func removeWWW(u *url.URL) { + if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") { + u.Host = u.Host[4:] + } +} + +func addWWW(u *url.URL) { + if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") { + u.Host = "www." + u.Host + } +} + +func sortQuery(u *url.URL) { + q := u.Query() + + if len(q) > 0 { + arKeys := make([]string, len(q)) + i := 0 + for k := range q { + arKeys[i] = k + i++ + } + sort.Strings(arKeys) + buf := new(bytes.Buffer) + for _, k := range arKeys { + sort.Strings(q[k]) + for _, v := range q[k] { + if buf.Len() > 0 { + buf.WriteRune('&') + } + buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v))) + } + } + + // Rebuild the raw query string + u.RawQuery = buf.String() + } +} + +func decodeDWORDHost(u *url.URL) { + if len(u.Host) > 0 { + if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 { + var parts [4]int64 + + dword, _ := strconv.ParseInt(matches[1], 10, 0) + for i, shift := range []uint{24, 16, 8, 0} { + parts[i] = dword >> shift & 0xFF + } + u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2]) + } + } +} + +func decodeOctalHost(u *url.URL) { + if len(u.Host) > 0 { + if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 { + var parts [4]int64 + + for i := 1; i <= 4; i++ { + parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0) + } + u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5]) + } + } +} + +func decodeHexHost(u *url.URL) { + if len(u.Host) > 0 { + if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 { + // Conversion is safe because of regex validation + parsed, _ := strconv.ParseInt(matches[1], 16, 0) + // Set host as DWORD (base 10) encoded host + u.Host = fmt.Sprintf("%d%s", parsed, matches[2]) + // The rest is the same as decoding a DWORD host + decodeDWORDHost(u) + } + } +} + +func removeUnncessaryHostDots(u *url.URL) { + if len(u.Host) > 0 { + if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 { + // Trim the leading and trailing dots + u.Host = strings.Trim(matches[1], ".") + if len(matches) > 2 { + u.Host += matches[2] + } + } + } +} + +func removeEmptyPortSeparator(u *url.URL) { + if len(u.Host) > 0 { + u.Host = rxEmptyPort.ReplaceAllString(u.Host, "") + } +} -- cgit v1.2.3-54-g00ecf