diff options
author | ale <ale@incal.net> | 2015-06-28 21:57:59 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2015-06-28 21:57:59 +0100 |
commit | 63bd51e06b32d48878da68df8931809d42996df1 (patch) | |
tree | 40a6e124545015654c56d0600221530301b78fce /scope.go | |
parent | aa6e67d7b2996b3b3c4e93ad6608c5753f03f03b (diff) | |
download | crawl-63bd51e06b32d48878da68df8931809d42996df1.tar.gz crawl-63bd51e06b32d48878da68df8931809d42996df1.zip |
add ignore list from ArchiveBot
Diffstat (limited to 'scope.go')
-rw-r--r-- | scope.go | 28 |
1 files changed, 28 insertions, 0 deletions
```diff
@@ -3,6 +3,7 @@ package crawl
 import (
 	"fmt"
 	"net/url"
+	"regexp"
 	"strings"
 )
 
@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
 	}
 	return NewURLPrefixScope(pfx)
 }
+
+type regexpIgnoreScope struct {
+	ignores []*regexp.Regexp
+}
+
+func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
+	uriStr := uri.String()
+	for _, i := range s.ignores {
+		if i.MatchString(uriStr) {
+			return false
+		}
+	}
+	return true
+}
+
+func NewRegexpIgnoreScope(ignores []string) Scope {
+	if ignores == nil {
+		ignores = defaultIgnorePatterns
+	}
+	r := regexpIgnoreScope{
+		ignores: make([]*regexp.Regexp, 0, len(ignores)),
+	}
+	for _, i := range ignores {
+		r.ignores = append(r.ignores, regexp.MustCompile(i))
+	}
+	return &r
+}
```