aboutsummaryrefslogtreecommitdiff
path: root/scope.go
diff options
context:
space:
mode:
author ale <ale@incal.net> 2015-06-28 21:57:59 +0100
committer ale <ale@incal.net> 2015-06-28 21:57:59 +0100
commit 63bd51e06b32d48878da68df8931809d42996df1 (patch)
tree 40a6e124545015654c56d0600221530301b78fce /scope.go
parent aa6e67d7b2996b3b3c4e93ad6608c5753f03f03b (diff)
download crawl-63bd51e06b32d48878da68df8931809d42996df1.tar.gz
crawl-63bd51e06b32d48878da68df8931809d42996df1.zip
add ignore list from ArchiveBot
Diffstat (limited to 'scope.go')
-rw-r--r-- scope.go 28
1 files changed, 28 insertions, 0 deletions
diff --git a/scope.go b/scope.go
index a2c06b6..ccba5f5 100644
--- a/scope.go
+++ b/scope.go
@@ -3,6 +3,7 @@ package crawl
import (
"fmt"
"net/url"
+ "regexp"
"strings"
)
@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
}
return NewURLPrefixScope(pfx)
}
+
+// regexpIgnoreScope is a scope backed by a list of compiled regular
+// expressions. Its Check method rejects any URL whose string form is
+// matched by at least one of them.
+type regexpIgnoreScope struct {
+ // ignores holds the compiled ignore patterns that Check iterates over.
+ ignores []*regexp.Regexp
+}
+
+// Check reports whether uri is allowed by this scope. The URL is
+// rendered to its string form and tested against every ignore pattern;
+// the first pattern that matches excludes it (false is returned). If no
+// pattern matches, the URL is accepted. The depth argument is not
+// consulted.
+func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
+	target := uri.String()
+	for idx := range s.ignores {
+		if s.ignores[idx].MatchString(target) {
+			return false
+		}
+	}
+	return true
+}
+
+// NewRegexpIgnoreScope builds a Scope that rejects URLs matching any of
+// the given regular expressions. Passing a nil list selects
+// defaultIgnorePatterns instead. Patterns are compiled with
+// regexp.MustCompile, so an invalid pattern causes a panic.
+func NewRegexpIgnoreScope(ignores []string) Scope {
+	patterns := ignores
+	if patterns == nil {
+		patterns = defaultIgnorePatterns
+	}
+	// The final length is known up front, so fill the slice by index.
+	compiled := make([]*regexp.Regexp, len(patterns))
+	for idx, expr := range patterns {
+		compiled[idx] = regexp.MustCompile(expr)
+	}
+	return &regexpIgnoreScope{ignores: compiled}
+}