about | summary | refs | log | tree | commit | diff
path: root/scope.go
diff options
context:
space:
mode:
author: ale <ale@incal.net> 2015-06-29 10:07:40 +0100
committer: ale <ale@incal.net> 2015-06-29 10:07:40 +0100
commit: 9fbc656c6cd2ad610986a265c6b346bc234bb881 (patch)
tree: a5aa8a44c63b239f194617dd09cfa92cf47495e0 /scope.go
parent: 63bd51e06b32d48878da68df8931809d42996df1 (diff)
download: crawl-9fbc656c6cd2ad610986a265c6b346bc234bb881.tar.gz
crawl-9fbc656c6cd2ad610986a265c6b346bc234bb881.zip
improve queue code; golint fixes
The queuing code now performs proper lease accounting, and it will not return a URL twice if the page load is slow.
Diffstat (limited to 'scope.go')
-rw-r--r-- scope.go 10
1 files changed, 8 insertions, 2 deletions
diff --git a/scope.go b/scope.go
index ccba5f5..6a63018 100644
--- a/scope.go
+++ b/scope.go
@@ -7,7 +7,9 @@ import (
"strings"
)
+// Scope defines the crawling scope.
type Scope interface {
+ // Check a URL to see if it's in scope for crawling.
Check(*url.URL, int) bool
}
@@ -48,14 +50,16 @@ func NewSchemeScope(schemes []string) Scope {
// eventual "www." prefix.
type URLPrefixMap map[string]struct{}
-func normalizeUrlPrefix(uri *url.URL) string {
+func normalizeURLPrefix(uri *url.URL) string {
return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
}
+// Add an URL to the prefix map.
func (m URLPrefixMap) Add(uri *url.URL) {
- m[normalizeUrlPrefix(uri)] = struct{}{}
+ m[normalizeURLPrefix(uri)] = struct{}{}
}
+// Contains returns true if the given URL matches the prefix map.
func (m URLPrefixMap) Contains(uri *url.URL) bool {
s := strings.TrimPrefix(uri.Host, "www.")
if _, ok := m[s]; ok {
@@ -111,6 +115,8 @@ func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
return true
}
+// NewRegexpIgnoreScope returns a Scope that filters out URLs
+// according to a list of regular expressions.
func NewRegexpIgnoreScope(ignores []string) Scope {
if ignores == nil {
ignores = defaultIgnorePatterns