aboutsummaryrefslogtreecommitdiff
path: root/scope.go
diff options
context:
space:
mode:
author    ale <ale@incal.net>    2014-12-20 11:41:24 +0000
committer ale <ale@incal.net>    2014-12-20 11:41:24 +0000
commit    4c82422d2e75b9b8f4d034b1f43fda566416d6af (patch)
tree      91a66ec0d3ebd7739658794bca3e6a2ff4f6b400 /scope.go
parent    efe98903c17a9103d7830361d6ff6f98bb9e0faa (diff)
download  crawl-4c82422d2e75b9b8f4d034b1f43fda566416d6af.tar.gz
          crawl-4c82422d2e75b9b8f4d034b1f43fda566416d6af.zip
make Scope checking more modular
Diffstat (limited to 'scope.go')
-rw-r--r--  scope.go  94
1 file changed, 94 insertions, 0 deletions
diff --git a/scope.go b/scope.go
new file mode 100644
index 0000000..de909f4
--- /dev/null
+++ b/scope.go
@@ -0,0 +1,94 @@
+package crawl
+
+import (
+ "fmt"
+ "net/url"
+ "strings"
+)
+
// Scope decides which URLs are part of the crawl. Implementations
// receive a candidate URL together with its link depth (the distance,
// in links, from the crawl seeds) and report whether it should be
// fetched.
type Scope interface {
	// Check returns true if the given URL, found at the given
	// link depth, is within the scope of the crawl.
	Check(*url.URL, int) bool
}

// maxDepthScope limits the crawl by link depth.
type maxDepthScope struct {
	maxDepth int
}

// Check allows URLs whose depth is strictly below the configured
// maximum, so the seeds (depth 0) pass for any maxDepth > 0. The URL
// itself is ignored.
func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
	return depth < s.maxDepth
}

// NewDepthScope returns a Scope that will limit crawls to a
// maximum link depth with respect to the crawl seeds.
func NewDepthScope(maxDepth int) Scope {
	return &maxDepthScope{maxDepth}
}
+
+type schemeScope struct {
+ allowedSchemes map[string]struct{}
+}
+
+func (s *schemeScope) Check(uri *url.URL, depth int) bool {
+ _, ok := s.allowedSchemes[uri.Scheme]
+ return ok
+}
+
+// NewSchemeScope limits the crawl to the specified URL schemes.
+func NewSchemeScope(schemes []string) Scope {
+ m := make(map[string]struct{})
+ for _, s := range schemes {
+ m[s] = struct{}{}
+ }
+ return &schemeScope{m}
+}
+
+// A URLPrefixMap makes it easy to check for URL prefixes (even for
+// very large lists). The URL scheme is ignored, along with an
+// eventual "www." prefix.
+type URLPrefixMap map[string]struct{}
+
+func normalizeUrlPrefix(uri *url.URL) string {
+ return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
+}
+
+func (m URLPrefixMap) Add(uri *url.URL) {
+ m[normalizeUrlPrefix(uri)] = struct{}{}
+}
+
+func (m URLPrefixMap) Contains(uri *url.URL) bool {
+ s := strings.TrimPrefix(uri.Host, "www.")
+ for _, p := range strings.Split(uri.Path, "/") {
+ if p == "" {
+ continue
+ }
+ s = fmt.Sprintf("%s/%s", s, p)
+ if _, ok := m[s]; ok {
+ return true
+ }
+ }
+ return false
+}
+
+type urlPrefixScope struct {
+ prefixes URLPrefixMap
+}
+
+func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
+ return s.prefixes.Contains(uri)
+}
+
+// NewURLPrefixScope returns a Scope that limits the crawl to a set of
+// allowed URL prefixes.
+func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
+ return &urlPrefixScope{prefixes}
+}
+
+// NewSeedScope returns a Scope that will only allow crawling the seed
+// prefixes.
+func NewSeedScope(seeds []*url.URL) Scope {
+ pfx := make(URLPrefixMap)
+ for _, s := range seeds {
+ pfx.Add(s)
+ }
+ return NewURLPrefixScope(pfx)
+}