diff options
Diffstat (limited to 'normalize/normalize.go')
-rw-r--r-- | normalize/normalize.go | 379 |
1 files changed, 0 insertions, 379 deletions
diff --git a/normalize/normalize.go b/normalize/normalize.go deleted file mode 100644 index 0220f65..0000000 --- a/normalize/normalize.go +++ /dev/null @@ -1,379 +0,0 @@ -/* -Package normalize offers URL normalization as described on the wikipedia page: -http://en.wikipedia.org/wiki/URL_normalization -*/ -package normalize - -import ( - "bytes" - "fmt" - "net/url" - "regexp" - "sort" - "strconv" - "strings" - - "golang.org/x/net/idna" - "golang.org/x/text/unicode/norm" - "golang.org/x/text/width" - "keep/urlesc" -) - -// A set of normalization flags determines how a URL will -// be normalized. -type NormalizationFlags uint - -const ( - // Safe normalizations - FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1 - FlagLowercaseHost // http://HOST -> http://host - FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF - FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA - FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$ - FlagRemoveDefaultPort // http://host:80 -> http://host - FlagRemoveEmptyQuerySeparator // http://host/path? -> http://host/path - - // Usually safe normalizations - FlagRemoveTrailingSlash // http://host/path/ -> http://host/path - FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags) - FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c - - // Unsafe normalizations - FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/ - FlagRemoveFragment // http://host/path#fragment -> http://host/path - FlagForceHTTP // https://host -> http://host - FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b - FlagRemoveWWW // http://www.host/ -> http://host/ - FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags) - FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3 - - // Normalizations not in the wikipedia article, required to cover tests cases - // submitted by jehiah - FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147 - FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147 - FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147 - FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path - FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path - - // Convenience set of safe normalizations - FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator - - // For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags, - // while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix". - - // Convenience set of usually safe normalizations (includes FlagsSafe) - FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments - FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments - - // Convenience set of unsafe normalizations (includes FlagsUsuallySafe) - FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery - FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery - - // Convenience set of all available flags - FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator - FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator -) - -const ( - defaultHttpPort = ":80" - defaultHttpsPort = ":443" -) - -// Regular expressions used by the normalizations -var rxPort = regexp.MustCompile(`(:\d+)/?$`) -var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`) -var rxDupSlashes = regexp.MustCompile(`/{2,}`) -var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`) -var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`) -var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`) -var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`) -var rxEmptyPort = regexp.MustCompile(`:+$`) - -// Map of flags to implementation function. -// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically -// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator. - -// Since maps have undefined traversing order, make a slice of ordered keys -var flagsOrder = []NormalizationFlags{ - FlagLowercaseScheme, - FlagLowercaseHost, - FlagRemoveDefaultPort, - FlagRemoveDirectoryIndex, - FlagRemoveDotSegments, - FlagRemoveFragment, - FlagForceHTTP, // Must be after remove default port (because https=443/http=80) - FlagRemoveDuplicateSlashes, - FlagRemoveWWW, - FlagAddWWW, - FlagSortQuery, - FlagDecodeDWORDHost, - FlagDecodeOctalHost, - FlagDecodeHexHost, - FlagRemoveUnnecessaryHostDots, - FlagRemoveEmptyPortSeparator, - FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last - FlagAddTrailingSlash, -} - -// ... and then the map, where order is unimportant -var flags = map[NormalizationFlags]func(*url.URL){ - FlagLowercaseScheme: lowercaseScheme, - FlagLowercaseHost: lowercaseHost, - FlagRemoveDefaultPort: removeDefaultPort, - FlagRemoveDirectoryIndex: removeDirectoryIndex, - FlagRemoveDotSegments: removeDotSegments, - FlagRemoveFragment: removeFragment, - FlagForceHTTP: forceHTTP, - FlagRemoveDuplicateSlashes: removeDuplicateSlashes, - FlagRemoveWWW: removeWWW, - FlagAddWWW: addWWW, - FlagSortQuery: sortQuery, - FlagDecodeDWORDHost: decodeDWORDHost, - FlagDecodeOctalHost: decodeOctalHost, - FlagDecodeHexHost: decodeHexHost, - FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots, - FlagRemoveEmptyPortSeparator: removeEmptyPortSeparator, - FlagRemoveTrailingSlash: removeTrailingSlash, - FlagAddTrailingSlash: addTrailingSlash, -} - -// MustNormalizeURLString returns the normalized string, and panics if an error occurs. -// It takes an URL string as input, as well as the normalization flags. -func MustNormalizeURLString(u string, f NormalizationFlags) string { - result, e := NormalizeURLString(u, f) - if e != nil { - panic(e) - } - return result -} - -// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object. -// It takes an URL string as input, as well as the normalization flags. -func NormalizeURLString(u string, f NormalizationFlags) (string, error) { - parsed, err := url.Parse(u) - if err != nil { - return "", err - } - - if f&FlagLowercaseHost == FlagLowercaseHost { - parsed.Host = strings.ToLower(parsed.Host) - } - - // The idna package doesn't fully conform to RFC 5895 - // (https://tools.ietf.org/html/rfc5895), so we do it here. - // Taken from Go 1.8 cycle source, courtesy of bradfitz. - // TODO: Remove when (if?) idna package conforms to RFC 5895. - parsed.Host = width.Fold.String(parsed.Host) - parsed.Host = norm.NFC.String(parsed.Host) - if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil { - return "", err - } - - return NormalizeURL(parsed, f), nil -} - -// NormalizeURL returns the normalized string. -// It takes a parsed URL object as input, as well as the normalization flags. -func NormalizeURL(u *url.URL, f NormalizationFlags) string { - for _, k := range flagsOrder { - if f&k == k { - flags[k](u) - } - } - return urlesc.Escape(u) -} - -func lowercaseScheme(u *url.URL) { - if len(u.Scheme) > 0 { - u.Scheme = strings.ToLower(u.Scheme) - } -} - -func lowercaseHost(u *url.URL) { - if len(u.Host) > 0 { - u.Host = strings.ToLower(u.Host) - } -} - -func removeDefaultPort(u *url.URL) { - if len(u.Host) > 0 { - scheme := strings.ToLower(u.Scheme) - u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string { - if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) { - return "" - } - return val - }) - } -} - -func removeTrailingSlash(u *url.URL) { - if l := len(u.Path); l > 0 { - if strings.HasSuffix(u.Path, "/") { - u.Path = u.Path[:l-1] - } - } else if l = len(u.Host); l > 0 { - if strings.HasSuffix(u.Host, "/") { - u.Host = u.Host[:l-1] - } - } -} - -func addTrailingSlash(u *url.URL) { - if l := len(u.Path); l > 0 { - if !strings.HasSuffix(u.Path, "/") { - u.Path += "/" - } - } else if l = len(u.Host); l > 0 { - if !strings.HasSuffix(u.Host, "/") { - u.Host += "/" - } - } -} - -func removeDotSegments(u *url.URL) { - if len(u.Path) > 0 { - var dotFree []string - var lastIsDot bool - - sections := strings.Split(u.Path, "/") - for _, s := range sections { - if s == ".." { - if len(dotFree) > 0 { - dotFree = dotFree[:len(dotFree)-1] - } - } else if s != "." { - dotFree = append(dotFree, s) - } - lastIsDot = (s == "." || s == "..") - } - // Special case if host does not end with / and new path does not begin with / - u.Path = strings.Join(dotFree, "/") - if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") { - u.Path = "/" + u.Path - } - // Special case if the last segment was a dot, make sure the path ends with a slash - if lastIsDot && !strings.HasSuffix(u.Path, "/") { - u.Path += "/" - } - } -} - -func removeDirectoryIndex(u *url.URL) { - if len(u.Path) > 0 { - u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1") - } -} - -func removeFragment(u *url.URL) { - u.Fragment = "" -} - -func forceHTTP(u *url.URL) { - if strings.ToLower(u.Scheme) == "https" { - u.Scheme = "http" - } -} - -func removeDuplicateSlashes(u *url.URL) { - if len(u.Path) > 0 { - u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/") - } -} - -func removeWWW(u *url.URL) { - if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") { - u.Host = u.Host[4:] - } -} - -func addWWW(u *url.URL) { - if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") { - u.Host = "www." + u.Host - } -} - -func sortQuery(u *url.URL) { - q := u.Query() - - if len(q) > 0 { - arKeys := make([]string, len(q)) - i := 0 - for k := range q { - arKeys[i] = k - i++ - } - sort.Strings(arKeys) - buf := new(bytes.Buffer) - for _, k := range arKeys { - sort.Strings(q[k]) - for _, v := range q[k] { - if buf.Len() > 0 { - buf.WriteRune('&') - } - buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v))) - } - } - - // Rebuild the raw query string - u.RawQuery = buf.String() - } -} - -func decodeDWORDHost(u *url.URL) { - if len(u.Host) > 0 { - if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 { - var parts [4]int64 - - dword, _ := strconv.ParseInt(matches[1], 10, 0) - for i, shift := range []uint{24, 16, 8, 0} { - parts[i] = dword >> shift & 0xFF - } - u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2]) - } - } -} - -func decodeOctalHost(u *url.URL) { - if len(u.Host) > 0 { - if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 { - var parts [4]int64 - - for i := 1; i <= 4; i++ { - parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0) - } - u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5]) - } - } -} - -func decodeHexHost(u *url.URL) { - if len(u.Host) > 0 { - if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 { - // Conversion is safe because of regex validation - parsed, _ := strconv.ParseInt(matches[1], 16, 0) - // Set host as DWORD (base 10) encoded host - u.Host = fmt.Sprintf("%d%s", parsed, matches[2]) - // The rest is the same as decoding a DWORD host - decodeDWORDHost(u) - } - } -} - -func removeUnncessaryHostDots(u *url.URL) { - if len(u.Host) > 0 { - if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 { - // Trim the leading and trailing dots - u.Host = strings.Trim(matches[1], ".") - if len(matches) > 2 { - u.Host += matches[2] - } - } - } -} - -func removeEmptyPortSeparator(u *url.URL) { - if len(u.Host) > 0 { - u.Host = rxEmptyPort.ReplaceAllString(u.Host, "") - } -} |