From 63bd51e06b32d48878da68df8931809d42996df1 Mon Sep 17 00:00:00 2001 From: ale Date: Sun, 28 Jun 2015 21:57:59 +0100 Subject: add ignore list from ArchiveBot --- scope.go | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'scope.go') diff --git a/scope.go b/scope.go index a2c06b6..ccba5f5 100644 --- a/scope.go +++ b/scope.go @@ -3,6 +3,7 @@ package crawl import ( "fmt" "net/url" + "regexp" "strings" ) @@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope { } return NewURLPrefixScope(pfx) } + +type regexpIgnoreScope struct { + ignores []*regexp.Regexp +} + +func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool { + uriStr := uri.String() + for _, i := range s.ignores { + if i.MatchString(uriStr) { + return false + } + } + return true +} + +func NewRegexpIgnoreScope(ignores []string) Scope { + if ignores == nil { + ignores = defaultIgnorePatterns + } + r := regexpIgnoreScope{ + ignores: make([]*regexp.Regexp, 0, len(ignores)), + } + for _, i := range ignores { + r.ignores = append(r.ignores, regexp.MustCompile(i)) + } + return &r +} -- cgit v1.2.3-54-g00ecf