aboutsummaryrefslogtreecommitdiff
path: root/scope.go
diff options
context:
space:
mode:
authorale <ale@incal.net>2017-12-19 00:12:11 +0000
committerale <ale@incal.net>2017-12-19 00:12:11 +0000
commit4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
treec3bf3e88729291ecf0e371d0dd43977cdd1d08ea /scope.go
parent77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
downloadcrawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.tar.gz
crawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.zip
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired).
Diffstat (limited to 'scope.go')
-rw-r--r--scope.go63
1 files changed, 55 insertions, 8 deletions
diff --git a/scope.go b/scope.go
index 6a63018..b2e90ea 100644
--- a/scope.go
+++ b/scope.go
@@ -10,14 +10,14 @@ import (
// Scope defines the crawling scope.
type Scope interface {
// Check a URL to see if it's in scope for crawling.
- Check(*url.URL, int) bool
+ Check(Outlink, int) bool
}
type maxDepthScope struct {
maxDepth int
}
-func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
return depth < s.maxDepth
}
@@ -31,8 +31,8 @@ type schemeScope struct {
allowedSchemes map[string]struct{}
}
-func (s *schemeScope) Check(uri *url.URL, depth int) bool {
- _, ok := s.allowedSchemes[uri.Scheme]
+func (s *schemeScope) Check(link Outlink, depth int) bool {
+ _, ok := s.allowedSchemes[link.URL.Scheme]
return ok
}
@@ -81,8 +81,8 @@ type urlPrefixScope struct {
prefixes URLPrefixMap
}
-func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
- return s.prefixes.Contains(uri)
+func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
+ return s.prefixes.Contains(link.URL)
}
// NewURLPrefixScope returns a Scope that limits the crawl to a set of
@@ -105,8 +105,8 @@ type regexpIgnoreScope struct {
ignores []*regexp.Regexp
}
-func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
- uriStr := uri.String()
+func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
+ uriStr := link.URL.String()
for _, i := range s.ignores {
if i.MatchString(uriStr) {
return false
@@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope {
}
return &r
}
+
+// NewIncludeRelatedScope returns a Scope that accepts only links whose Tag is TagRelated and rejects all others, at any depth.
+func NewIncludeRelatedScope() Scope {
+ return &includeRelatedScope{}
+}
+
+type includeRelatedScope struct{}
+
+func (s *includeRelatedScope) Check(link Outlink, _ int) bool {
+ return link.Tag == TagRelated
+}
+
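+// AND returns a Scope that accepts a link only if every element accepts it, stopping at the first rejection; with no elements it accepts everything.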
+func AND(elems ...Scope) Scope {
+ return &andScope{elems: elems}
+}
+
+type andScope struct {
+ elems []Scope
+}
+
+func (s *andScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if !e.Check(link, depth) {
+ return false
+ }
+ }
+ return true
+}
+
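+// OR returns a Scope that accepts a link if at least one element accepts it, stopping at the first acceptance; with no elements it rejects everything.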
+func OR(elems ...Scope) Scope {
+ return &orScope{elems: elems}
+}
+
+type orScope struct {
+ elems []Scope
+}
+
+func (s *orScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if e.Check(link, depth) {
+ return true
+ }
+ }
+ return false
+}