From 4c82422d2e75b9b8f4d034b1f43fda566416d6af Mon Sep 17 00:00:00 2001 From: ale Date: Sat, 20 Dec 2014 11:41:24 +0000 Subject: make Scope checking more modular --- scope.go | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 scope.go (limited to 'scope.go') diff --git a/scope.go b/scope.go new file mode 100644 index 0000000..de909f4 --- /dev/null +++ b/scope.go @@ -0,0 +1,94 @@ +package crawl + +import ( + "fmt" + "net/url" + "strings" +) + +type Scope interface { + Check(*url.URL, int) bool +} + +type maxDepthScope struct { + maxDepth int +} + +func (s *maxDepthScope) Check(uri *url.URL, depth int) bool { + return depth < s.maxDepth +} + +// NewDepthScope returns a Scope that accepts a URL only while its link +// depth, relative to the crawl seeds, is strictly below maxDepth. +func NewDepthScope(maxDepth int) Scope { + return &maxDepthScope{maxDepth} +} + +type schemeScope struct { + allowedSchemes map[string]struct{} +} + +func (s *schemeScope) Check(uri *url.URL, depth int) bool { + _, ok := s.allowedSchemes[uri.Scheme] + return ok +} + +// NewSchemeScope returns a Scope that only accepts URLs whose scheme is listed in schemes (exact match). +func NewSchemeScope(schemes []string) Scope { + m := make(map[string]struct{}) + for _, s := range schemes { + m[s] = struct{}{} + } + return &schemeScope{m} +} + +// A URLPrefixMap makes it easy to check for URL prefixes (even for +// very large lists). The URL scheme is ignored, along with an +// optional leading "www." prefix on the host. 
+type URLPrefixMap map[string]struct{} + +func normalizeUrlPrefix(uri *url.URL) string { + return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/") +} + +func (m URLPrefixMap) Add(uri *url.URL) { + m[normalizeUrlPrefix(uri)] = struct{}{} +} + +func (m URLPrefixMap) Contains(uri *url.URL) bool { + s := strings.TrimPrefix(uri.Host, "www.") + for _, p := range strings.Split(uri.Path, "/") { + if p == "" { + continue + } + s = fmt.Sprintf("%s/%s", s, p) + if _, ok := m[s]; ok { + return true + } + } + return false +} + +type urlPrefixScope struct { + prefixes URLPrefixMap +} + +func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool { + return s.prefixes.Contains(uri) +} + +// NewURLPrefixScope returns a Scope that limits the crawl to a set of +// allowed URL prefixes. +func NewURLPrefixScope(prefixes URLPrefixMap) Scope { + return &urlPrefixScope{prefixes} +} + +// NewSeedScope returns a Scope that will only allow crawling the seed +// prefixes. +func NewSeedScope(seeds []*url.URL) Scope { + pfx := make(URLPrefixMap) + for _, s := range seeds { + pfx.Add(s) + } + return NewURLPrefixScope(pfx) +} -- cgit v1.2.3-54-g00ecf