author     ale <ale@incal.net>    2014-12-20 11:41:24 +0000
committer  ale <ale@incal.net>    2014-12-20 11:41:24 +0000
commit     4c82422d2e75b9b8f4d034b1f43fda566416d6af (patch)
tree       91a66ec0d3ebd7739658794bca3e6a2ff4f6b400
parent     efe98903c17a9103d7830361d6ff6f98bb9e0faa (diff)
make Scope checking more modular
-rw-r--r--  cmd/crawl/crawl.go  |  6
-rw-r--r--  cmd/links/links.go  |  6
-rw-r--r--  crawler.go          | 61
-rw-r--r--  scope.go            | 94
4 files changed, 112 insertions, 55 deletions
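In short: the monolithic seedScope is removed from crawler.go, an exported Scope interface moves into the new scope.go together with small single-purpose implementations (scheme, depth and seed-prefix scopes), and Crawler now takes a []Scope whose checks are ANDed in Enqueue. As a rough summary of those semantics only (the helper below is an illustration with an invented name, not part of the patch), the combined check behaves like:

package crawl

import "net/url"

// inScope is a hypothetical helper mirroring the loop this commit adds to
// Crawler.Enqueue: a URL is kept only if every configured Scope accepts it.
func inScope(scopes []Scope, u *url.URL, depth int) bool {
	for _, sc := range scopes {
		if !sc.Check(u, depth) {
			return false
		}
	}
	return true
}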
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 1abeca6..8c02089 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -124,7 +124,11 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	w := warc.NewWriter(outf)
 	defer w.Close()
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 95388ce..e89e22d 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -38,7 +38,11 @@ func main() {
 	flag.Parse()
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
 	if err != nil {
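Both commands now assemble the same three scopes by hand. A caller that wants to keep the convenience of the old single call could use a small wrapper along these lines; this is a hypothetical sketch (the newCombinedScope name is invented here), written as if it lived in the crawl package so the constructors introduced by this patch can be used directly:

package crawl

import (
	"net/url"
	"strings"
)

// newCombinedScope is a hypothetical convenience wrapper, not part of this
// commit, mirroring the scope slice now built in cmd/crawl and cmd/links.
func newCombinedScope(seeds []*url.URL, maxDepth int, schemes string) []Scope {
	return []Scope{
		NewSchemeScope(strings.Split(schemes, ",")),
		NewDepthScope(maxDepth),
		NewSeedScope(seeds),
	}
}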
diff --git a/crawler.go b/crawler.go
index 709ff3c..fad8b9e 100644
--- a/crawler.go
+++ b/crawler.go
@@ -8,7 +8,6 @@ import (
"log"
"net/http"
"net/url"
- "strings"
"sync"
"time"
@@ -58,10 +57,6 @@ type URLInfo struct {
 	Error error
 }
 
-type Scope interface {
-	Check(*url.URL, int) bool
-}
-
 type Fetcher interface {
 	Fetch(string) (*http.Response, error)
 }
@@ -86,7 +81,7 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons
 type Crawler struct {
 	db      *gobDB
 	seeds   []*url.URL
-	scope   Scope
+	scopes  []Scope
 	fetcher Fetcher
 	handler Handler
 
@@ -111,9 +106,11 @@ func (c *Crawler) Enqueue(u *url.URL, depth int) {
 	// Normalize the URL.
 	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
 
-	// See if it's in scope.
-	if !c.scope.Check(u, depth) {
-		return
+	// See if it's in scope. Checks are ANDed.
+	for _, sc := range c.scopes {
+		if !sc.Check(u, depth) {
+			return
+		}
 	}
 
 	c.enqueueMx.Lock()
@@ -202,46 +199,6 @@ func (c *Crawler) urlHandler(queue <-chan QueuePair) {
 	}
 }
 
-type seedScope struct {
-	seeds    []*url.URL
-	schemes  map[string]struct{}
-	maxDepth int
-}
-
-func (s *seedScope) Check(u *url.URL, depth int) bool {
-	// Ignore non-allowed schemes.
-	if _, ok := s.schemes[u.Scheme]; !ok {
-		return false
-	}
-
-	// Do not crawl beyond maxDepth.
-	if depth > s.maxDepth {
-		return false
-	}
-
-	// Check each seed prefix.
-	for _, seed := range s.seeds {
-		if u.Host == seed.Host && strings.HasPrefix(u.Path, seed.Path) {
-			return true
-		}
-	}
-	return false
-}
-
-// NewSeedScope returns a Scope that will only allow crawling the seed
-// domains, and not beyond the specified maximum link depth.
-func NewSeedScope(seeds []*url.URL, maxDepth int, allowedSchemes []string) Scope {
-	scope := &seedScope{
-		seeds:    seeds,
-		maxDepth: maxDepth,
-		schemes:  make(map[string]struct{}),
-	}
-	for _, s := range allowedSchemes {
-		scope.schemes[s] = struct{}{}
-	}
-	return scope
-}
-
 func MustParseURLs(urls []string) []*url.URL {
 	// Parse the seed URLs.
 	var parsed []*url.URL
@@ -256,7 +213,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -267,7 +224,7 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
 		fetcher: f,
 		handler: &standardPageHandler{h},
 		seeds:   seeds,
-		scope:   scope,
+		scopes:  scopes,
 	}
 	return c, nil
 }
@@ -321,8 +278,6 @@ func (wrap *standardPageHandler) Handle(c *Crawler, u string, depth int, resp *h
 	}
 	info.Error = err
 
-	//log.Printf("[CRAWL] %+v", info)
-
 	c.UpdateURL(info)
 	return nil
 }
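With the crawler only ranging over a slice of Scope values, extra restrictions can be composed in without touching crawler.go. For example, a hypothetical scope that skips a fixed set of hosts (an illustration only, not part of this patch) would just need to implement the Check method:

package crawl

import "net/url"

// hostBlacklistScope is a hypothetical Scope rejecting URLs whose host is
// on a fixed deny list; it is not part of this commit.
type hostBlacklistScope struct {
	hosts map[string]struct{}
}

func (s *hostBlacklistScope) Check(uri *url.URL, depth int) bool {
	_, denied := s.hosts[uri.Host]
	return !denied
}

// NewHostBlacklistScope builds the hypothetical deny-list scope.
func NewHostBlacklistScope(hosts []string) Scope {
	m := make(map[string]struct{})
	for _, h := range hosts {
		m[h] = struct{}{}
	}
	return &hostBlacklistScope{m}
}

Appending the result of NewHostBlacklistScope to the slice passed to NewCrawler is then enough for the ANDed checks in Enqueue to enforce it.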
diff --git a/scope.go b/scope.go
new file mode 100644
index 0000000..de909f4
--- /dev/null
+++ b/scope.go
@@ -0,0 +1,94 @@
+package crawl
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+type Scope interface {
+	Check(*url.URL, int) bool
+}
+
+type maxDepthScope struct {
+	maxDepth int
+}
+
+func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+	return depth < s.maxDepth
+}
+
+// NewDepthScope returns a Scope that will limit crawls to a
+// maximum link depth with respect to the crawl seeds.
+func NewDepthScope(maxDepth int) Scope {
+	return &maxDepthScope{maxDepth}
+}
+
+type schemeScope struct {
+	allowedSchemes map[string]struct{}
+}
+
+func (s *schemeScope) Check(uri *url.URL, depth int) bool {
+	_, ok := s.allowedSchemes[uri.Scheme]
+	return ok
+}
+
+// NewSchemeScope limits the crawl to the specified URL schemes.
+func NewSchemeScope(schemes []string) Scope {
+	m := make(map[string]struct{})
+	for _, s := range schemes {
+		m[s] = struct{}{}
+	}
+	return &schemeScope{m}
+}
+
+// A URLPrefixMap makes it easy to check for URL prefixes (even for
+// very large lists). The URL scheme is ignored, along with an
+// eventual "www." prefix.
+type URLPrefixMap map[string]struct{}
+
+func normalizeUrlPrefix(uri *url.URL) string {
+	return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
+}
+
+func (m URLPrefixMap) Add(uri *url.URL) {
+	m[normalizeUrlPrefix(uri)] = struct{}{}
+}
+
+func (m URLPrefixMap) Contains(uri *url.URL) bool {
+	s := strings.TrimPrefix(uri.Host, "www.")
+	for _, p := range strings.Split(uri.Path, "/") {
+		if p == "" {
+			continue
+		}
+		s = fmt.Sprintf("%s/%s", s, p)
+		if _, ok := m[s]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+type urlPrefixScope struct {
+	prefixes URLPrefixMap
+}
+
+func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
+	return s.prefixes.Contains(uri)
+}
+
+// NewURLPrefixScope returns a Scope that limits the crawl to a set of
+// allowed URL prefixes.
+func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
+	return &urlPrefixScope{prefixes}
+}
+
+// NewSeedScope returns a Scope that will only allow crawling the seed
+// prefixes.
+func NewSeedScope(seeds []*url.URL) Scope {
+	pfx := make(URLPrefixMap)
+	for _, s := range seeds {
+		pfx.Add(s)
+	}
+	return NewURLPrefixScope(pfx)
+}
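Finally, a test-style sketch (hypothetical, not part of the patch) of how the relocated pieces fit together, including the "www." stripping and path-prefix matching performed by URLPrefixMap:

package crawl

import (
	"net/url"
	"testing"
)

func TestScopeComposition(t *testing.T) {
	seeds := MustParseURLs([]string{"http://example.com/docs/"})
	scopes := []Scope{
		NewSchemeScope([]string{"http", "https"}),
		NewDepthScope(2),
		NewSeedScope(seeds),
	}

	// A page one level below the seed, reached through a "www." host,
	// should pass every check at depth 1.
	u, err := url.Parse("http://www.example.com/docs/page.html")
	if err != nil {
		t.Fatal(err)
	}
	for _, sc := range scopes {
		if !sc.Check(u, 1) {
			t.Errorf("expected %s to be in scope at depth 1", u)
		}
	}

	// A URL on an unrelated host should be rejected by the seed scope.
	other, _ := url.Parse("http://other.example.org/docs/")
	if NewSeedScope(seeds).Check(other, 1) {
		t.Errorf("expected %s to be out of scope", other)
	}
}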