author     ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer  ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit     4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree       c3bf3e88729291ecf0e371d0dd43977cdd1d08ea
parent     77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, such as loosening the edges a bit to include the resources related to HTML pages (which makes for more complete archives, if desired).
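
As a minimal sketch of how the new pieces compose (the seed and stylesheet URLs below are made-up placeholders; everything else is the API introduced in this diff): primary links still have to pass the usual AND of scheme/seed/depth checks, while -include-related ORs in a scope that accepts anything tagged as a related resource.

	package main

	import (
		"fmt"
		"net/url"

		"git.autistici.org/ale/crawl"
	)

	func main() {
		seeds := crawl.MustParseURLs([]string{"https://example.com/"})

		// The same composition cmd/crawl builds: checks are ANDed...
		scope := crawl.AND(
			crawl.NewSchemeScope([]string{"http", "https"}),
			crawl.NewSeedScope(seeds),
			crawl.NewDepthScope(10),
		)
		// ...and -include-related ORs in a scope that always accepts
		// links tagged as related resources.
		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())

		// An off-site stylesheet (placeholder URL): rejected as a primary
		// link by the seed scope, accepted when tagged as related.
		css, _ := url.Parse("https://cdn.example.net/style.css")
		fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagPrimary}, 1)) // false
		fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagRelated}, 1)) // true
	}
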
-rw-r--r--  analysis/links.go    49
-rw-r--r--  cmd/crawl/crawl.go   18
-rw-r--r--  crawler.go           40
-rw-r--r--  crawler_test.go      58
-rw-r--r--  scope.go             63
-rw-r--r--  scope_test.go         2
6 files changed, 181 insertions(+), 49 deletions(-)
diff --git a/analysis/links.go b/analysis/links.go
index 9fdf8fb..5d61547 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -6,31 +6,39 @@ import (
"fmt"
"io/ioutil"
"net/http"
- "net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
+
+ "git.autistici.org/ale/crawl"
)
var (
- urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+ urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
- tag string
- attr string
+ tag string
+ attr string
+ linkTag int
}{
- {"a", "href"},
- {"link", "href"},
- {"img", "src"},
- {"script", "src"},
+ {"a", "href", crawl.TagPrimary},
+ {"link", "href", crawl.TagRelated},
+ {"img", "src", crawl.TagRelated},
+ {"script", "src", crawl.TagRelated},
}
)
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+ URL string
+ Tag int
+}
+
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
- var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+ var outlinks []rawOutlink
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
- outlinks = append(outlinks, val)
+ outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
// expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
- outlinks = append(outlinks, val[1])
+ outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
}
}
// Parse outbound links relative to the request URI, and
// return unique results.
- var result []*url.URL
- links := make(map[string]*url.URL)
- for _, val := range outlinks {
- if linkurl, err := resp.Request.URL.Parse(val); err == nil {
- links[linkurl.String()] = linkurl
+ var result []crawl.Outlink
+ links := make(map[string]crawl.Outlink)
+ for _, l := range outlinks {
+ if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+ links[linkurl.String()] = crawl.Outlink{
+ URL: linkurl,
+ Tag: l.Tag,
+ }
}
}
- for _, u := range links {
- result = append(result, u)
+ for _, l := range links {
+ result = append(result, l)
}
return result, nil
}
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3954682..e31f63e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -23,12 +23,13 @@ import (
)
var (
- dbPath = flag.String("state", "crawldb", "crawl state database path")
- keepDb = flag.Bool("keep", false, "keep the state database when done")
- concurrency = flag.Int("c", 10, "concurrent workers")
- depth = flag.Int("depth", 10, "maximum link depth")
- validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
- outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
+ dbPath = flag.String("state", "crawldb", "crawl state database path")
+ keepDb = flag.Bool("keep", false, "keep the state database when done")
+ concurrency = flag.Int("c", 10, "concurrent workers")
+ depth = flag.Int("depth", 10, "maximum link depth")
+ validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+ alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+ outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
)
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
}
seeds := crawl.MustParseURLs(flag.Args())
- scope := []crawl.Scope{
+ scope := crawl.AND(
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
+ )
+ if *alwaysIncludeRelated {
+ scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}
w := warc.NewWriter(outf)
diff --git a/crawler.go b/crawler.go
index f1edc2d..9fad2ef 100644
--- a/crawler.go
+++ b/crawler.go
@@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error {
return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj)
}
+// Outlink is a tagged outbound link.
+type Outlink struct {
+ URL *url.URL
+ Tag int
+}
+
+const (
+ // TagPrimary is a primary reference (another web page).
+ TagPrimary = iota
+
+ // TagRelated is a secondary resource, related to a page.
+ TagRelated
+)
+
// URLInfo stores information about a crawled URL.
type URLInfo struct {
URL string
@@ -118,7 +132,7 @@ type Crawler struct {
db *gobDB
queue *queue
seeds []*url.URL
- scopes []Scope
+ scope Scope
fetcher Fetcher
handler Handler
@@ -126,17 +140,15 @@ type Crawler struct {
}
// Enqueue a (possibly new) URL for processing.
-func (c *Crawler) Enqueue(u *url.URL, depth int) {
- // Normalize the URL.
- urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
-
- // See if it's in scope. Checks are ANDed.
- for _, sc := range c.scopes {
- if !sc.Check(u, depth) {
- return
- }
+func (c *Crawler) Enqueue(link Outlink, depth int) {
+ // See if it's in scope.
+ if !c.scope.Check(link, depth) {
+ return
}
+ // Normalize the URL.
+ urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
+
// Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock()
defer c.enqueueMx.Unlock()
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
}
// NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
// Open the crawl database.
db, err := newGobDB(path)
if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
fetcher: f,
handler: h,
seeds: seeds,
- scopes: scopes,
+ scope: scope,
}
// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
func (c *Crawler) Run(concurrency int) {
// Load initial seeds into the queue.
for _, u := range c.seeds {
- c.Enqueue(u, 0)
+ c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
}
// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.
if err != nil {
log.Printf("error parsing Location header: %v", err)
} else {
- c.Enqueue(locationURL, depth+1)
+ c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
}
}
} else {
diff --git a/crawler_test.go b/crawler_test.go
new file mode 100644
index 0000000..66acbe4
--- /dev/null
+++ b/crawler_test.go
@@ -0,0 +1,58 @@
+package crawl
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+)
+
+func TestCrawler(t *testing.T) {
+ dir, err := ioutil.TempDir("", "")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(dir)
+
+ // Run a trivial test http server just so our test Fetcher can
+ // return a real http.Response object.
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ io.WriteString(w, "hello")
+ }))
+ defer srv.Close()
+
+ seeds := MustParseURLs([]string{srv.URL})
+ scope := AND(
+ NewSchemeScope([]string{"http"}),
+ NewSeedScope(seeds),
+ NewDepthScope(2),
+ )
+
+ var crawledPages int
+ h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ crawledPages++
+ next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
+ log.Printf("%s -> %s", u, next)
+ c.Enqueue(Outlink{
+ URL: mustParseURL(next),
+ Tag: TagPrimary,
+ }, depth+1)
+ return nil
+ })
+
+ crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+ if err != nil {
+ t.Fatal("NewCrawler", err)
+ }
+
+ crawler.Run(1)
+ crawler.Close()
+
+ if crawledPages != 2 {
+ t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
+ }
+}
diff --git a/scope.go b/scope.go
index 6a63018..b2e90ea 100644
--- a/scope.go
+++ b/scope.go
@@ -10,14 +10,14 @@ import (
// Scope defines the crawling scope.
type Scope interface {
// Check a URL to see if it's in scope for crawling.
- Check(*url.URL, int) bool
+ Check(Outlink, int) bool
}
type maxDepthScope struct {
maxDepth int
}
-func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
return depth < s.maxDepth
}
@@ -31,8 +31,8 @@ type schemeScope struct {
allowedSchemes map[string]struct{}
}
-func (s *schemeScope) Check(uri *url.URL, depth int) bool {
- _, ok := s.allowedSchemes[uri.Scheme]
+func (s *schemeScope) Check(link Outlink, depth int) bool {
+ _, ok := s.allowedSchemes[link.URL.Scheme]
return ok
}
@@ -81,8 +81,8 @@ type urlPrefixScope struct {
prefixes URLPrefixMap
}
-func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
- return s.prefixes.Contains(uri)
+func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
+ return s.prefixes.Contains(link.URL)
}
// NewURLPrefixScope returns a Scope that limits the crawl to a set of
@@ -105,8 +105,8 @@ type regexpIgnoreScope struct {
ignores []*regexp.Regexp
}
-func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
- uriStr := uri.String()
+func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
+ uriStr := link.URL.String()
for _, i := range s.ignores {
if i.MatchString(uriStr) {
return false
@@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope {
}
return &r
}
+
+// NewIncludeRelatedScope always includes resources with TagRelated.
+func NewIncludeRelatedScope() Scope {
+ return &includeRelatedScope{}
+}
+
+type includeRelatedScope struct{}
+
+func (s *includeRelatedScope) Check(link Outlink, _ int) bool {
+ return link.Tag == TagRelated
+}
+
+// AND performs a boolean AND.
+func AND(elems ...Scope) Scope {
+ return &andScope{elems: elems}
+}
+
+type andScope struct {
+ elems []Scope
+}
+
+func (s *andScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if !e.Check(link, depth) {
+ return false
+ }
+ }
+ return true
+}
+
+// OR performs a boolean OR.
+func OR(elems ...Scope) Scope {
+ return &orScope{elems: elems}
+}
+
+type orScope struct {
+ elems []Scope
+}
+
+func (s *orScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if e.Check(link, depth) {
+ return true
+ }
+ }
+ return false
+}
diff --git a/scope_test.go b/scope_test.go
index bccf93c..95366bb 100644
--- a/scope_test.go
+++ b/scope_test.go
@@ -19,7 +19,7 @@ type testScopeEntry struct {
func runScopeTest(t *testing.T, sc Scope, testdata []testScopeEntry) {
for _, td := range testdata {
uri := mustParseURL(td.uri)
- result := sc.Check(uri, td.depth)
+ result := sc.Check(Outlink{URL: uri, Tag: TagPrimary}, td.depth)
if result != td.expected {
t.Errorf("Check(%s, %d) -> got %v, want %v", td.uri, td.depth, result, td.expected)
}