commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca
author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired).
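For illustration, a tag-aware scope along these lines could treat the two tags differently. This is a hypothetical sketch, not code from this commit: only Outlink, TagPrimary/TagRelated, and the Check(link, depth) call used by Enqueue in the diff below are real; the Scope interface shown is inferred from that call, and relatedScope is an invented name.

type Scope interface {
	Check(link Outlink, depth int) bool
}

// relatedScope loosens an inner scope's edges: primary links (other
// pages) must pass the inner check, while related resources of a
// page (images, stylesheets, scripts) are accepted unconditionally.
type relatedScope struct {
	inner Scope
}

func (s relatedScope) Check(link Outlink, depth int) bool {
	if link.Tag == TagRelated {
		return true
	}
	return s.inner.Check(link, depth)
}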
Diffstat (limited to 'crawler.go')
-rw-r--r--  crawler.go | 40 ++++++++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/crawler.go b/crawler.go
index f1edc2d..9fad2ef 100644
--- a/crawler.go
+++ b/crawler.go
@@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error {
 	return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj)
 }
 
+// Outlink is a tagged outbound link.
+type Outlink struct {
+	URL *url.URL
+	Tag int
+}
+
+const (
+	// TagPrimary is a primary reference (another web page).
+	TagPrimary = iota
+
+	// TagRelated is a secondary resource, related to a page.
+	TagRelated
+)
+
 // URLInfo stores information about a crawled URL.
 type URLInfo struct {
 	URL        string
@@ -118,7 +132,7 @@ type Crawler struct {
 	db      *gobDB
 	queue   *queue
 	seeds   []*url.URL
-	scopes  []Scope
+	scope   Scope
 	fetcher Fetcher
 	handler Handler
 
@@ -126,17 +140,15 @@ type Crawler struct {
 }
 
 // Enqueue a (possibly new) URL for processing.
-func (c *Crawler) Enqueue(u *url.URL, depth int) {
-	// Normalize the URL.
-	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
-
-	// See if it's in scope. Checks are ANDed.
-	for _, sc := range c.scopes {
-		if !sc.Check(u, depth) {
-			return
-		}
+func (c *Crawler) Enqueue(link Outlink, depth int) {
+	// See if it's in scope.
+	if !c.scope.Check(link, depth) {
+		return
 	}
 
+	// Normalize the URL.
+	urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
+
 	// Protect the read-modify-update below with a mutex.
 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
 		fetcher: f,
 		handler: h,
 		seeds:   seeds,
-		scopes:  scopes,
+		scope:   scope,
 	}
 
 	// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
 func (c *Crawler) Run(concurrency int) {
 	// Load initial seeds into the queue.
 	for _, u := range c.seeds {
-		c.Enqueue(u, 0)
+		c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
 	}
 
 	// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.
 				if err != nil {
 					log.Printf("error parsing Location header: %v", err)
 				} else {
-					c.Enqueue(locationURL, depth+1)
+					c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
 				}
 			}
 		} else {
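Collapsing the []Scope argument into a single Scope moves the old implicit ANDing of checks out of Enqueue, so callers that used to pass several scopes need to combine them first. A minimal sketch of such a combinator, assuming the Scope interface sketched above; the andScope and AND names are illustrative, not part of this diff:

// andScope reproduces the removed behavior: a link is in scope only
// if every sub-scope accepts it.
type andScope struct {
	scopes []Scope
}

func (s andScope) Check(link Outlink, depth int) bool {
	for _, sc := range s.scopes {
		if !sc.Check(link, depth) {
			return false
		}
	}
	return true
}

// AND combines several scopes into one, e.g. for NewCrawler callers
// that previously passed a []Scope.
func AND(scopes ...Scope) Scope {
	return andScope{scopes: scopes}
}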