author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /cmd
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening the scope's edges a bit to pull in resources related to HTML pages (which makes for more complete archives, when desired).
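The primary/related distinction can be pictured as a small classifier over the HTML elements a link was found in: navigation targets are documents in their own right, while page dependencies only matter for rendering. The sketch below is illustrative only; the Outlink type and the TagPrimary/TagRelated names are modeled on the commit message, not shown in this diff.

package sketch

import "net/url"

// Tags attached to extracted links (assumed names, modeled on the
// commit message).
const (
	TagPrimary = iota // a document in its own right (<a>, <frame>, ...)
	TagRelated        // a page dependency (<img>, <script>, <link>, ...)
)

// Outlink pairs a parsed URL with its tag (illustrative type).
type Outlink struct {
	URL *url.URL
	Tag int
}

// tagForElement guesses a tag from the element a URL was found on:
// navigation elements yield primary links, everything else is a
// related resource.
func tagForElement(elem string) int {
	switch elem {
	case "a", "frame", "iframe":
		return TagPrimary
	default:
		return TagRelated
	}
}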
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/crawl/crawl.go  18
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3954682..e31f63e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -23,12 +23,13 @@ import (
 )
 
 var (
-	dbPath       = flag.String("state", "crawldb", "crawl state database path")
-	keepDb       = flag.Bool("keep", false, "keep the state database when done")
-	concurrency  = flag.Int("c", 10, "concurrent workers")
-	depth        = flag.Int("depth", 10, "maximum link depth")
-	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath               = flag.String("state", "crawldb", "crawl state database path")
+	keepDb               = flag.Bool("keep", false, "keep the state database when done")
+	concurrency          = flag.Int("c", 10, "concurrent workers")
+	depth                = flag.Int("depth", 10, "maximum link depth")
+	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
 )
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := []crawl.Scope{
+	scope := crawl.AND(
 		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
 		crawl.NewDepthScope(*depth),
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
+	)
+	if *alwaysIncludeRelated {
+		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}
 
 	w := warc.NewWriter(outf)
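Read as boolean logic, crawl.AND requires every sub-scope to accept a link, while crawl.OR fires as soon as any one does; wrapping the strict scope in an OR with the include-related scope is what loosens the boundary for page dependencies. Below is a minimal sketch of such combinators, assuming a Scope interface with a single Check predicate; the package's real signature may carry more context (for instance the link's tag).

package sketch

import "net/url"

// Scope decides whether a URL is within crawl bounds. The two-argument
// Check predicate here is an assumption for illustration.
type Scope interface {
	Check(u *url.URL, depth int) bool
}

type andScope []Scope

// Check accepts a URL only if every sub-scope accepts it.
func (s andScope) Check(u *url.URL, depth int) bool {
	for _, sc := range s {
		if !sc.Check(u, depth) {
			return false
		}
	}
	return true
}

type orScope []Scope

// Check accepts a URL as soon as any sub-scope accepts it.
func (s orScope) Check(u *url.URL, depth int) bool {
	for _, sc := range s {
		if sc.Check(u, depth) {
			return true
		}
	}
	return false
}

// AND and OR combine scopes the way cmd/crawl composes them above.
func AND(elems ...Scope) Scope { return andScope(elems) }
func OR(elems ...Scope) Scope  { return orScope(elems) }

With this composition, an invocation along the lines of "crawl -include-related https://example.com/" would pull in off-site stylesheets and images that the seed scope alone would reject, while primary links still have to pass the full AND chain.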