diff options
author | ale <ale@incal.net> | 2015-06-29 10:07:40 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2015-06-29 10:07:40 +0100 |
commit | 9fbc656c6cd2ad610986a265c6b346bc234bb881 (patch) | |
tree | a5aa8a44c63b239f194617dd09cfa92cf47495e0 /scope.go | |
parent | 63bd51e06b32d48878da68df8931809d42996df1 (diff) | |
download | crawl-9fbc656c6cd2ad610986a265c6b346bc234bb881.tar.gz crawl-9fbc656c6cd2ad610986a265c6b346bc234bb881.zip |
improve queue code; golint fixes
The queuing code now performs proper lease accounting, and it will not
return a URL twice if the page load is slow.
Diffstat (limited to 'scope.go')
-rw-r--r-- | scope.go | 10 |
1 file changed, 8 insertions(+), 2 deletions(-)
@@ -7,7 +7,9 @@ import ( "strings" ) +// Scope defines the crawling scope. type Scope interface { + // Check a URL to see if it's in scope for crawling. Check(*url.URL, int) bool } @@ -48,14 +50,16 @@ func NewSchemeScope(schemes []string) Scope { // eventual "www." prefix. type URLPrefixMap map[string]struct{} -func normalizeUrlPrefix(uri *url.URL) string { +func normalizeURLPrefix(uri *url.URL) string { return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/") } +// Add an URL to the prefix map. func (m URLPrefixMap) Add(uri *url.URL) { - m[normalizeUrlPrefix(uri)] = struct{}{} + m[normalizeURLPrefix(uri)] = struct{}{} } +// Contains returns true if the given URL matches the prefix map. func (m URLPrefixMap) Contains(uri *url.URL) bool { s := strings.TrimPrefix(uri.Host, "www.") if _, ok := m[s]; ok { @@ -111,6 +115,8 @@ func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool { return true } +// NewRegexpIgnoreScope returns a Scope that filters out URLs +// according to a list of regular expressions. func NewRegexpIgnoreScope(ignores []string) Scope { if ignores == nil { ignores = defaultIgnorePatterns |