From 4cd67e7234943baf31b2e122f8ee3c70c21fb489 Mon Sep 17 00:00:00 2001 From: ale Date: Tue, 19 Dec 2017 00:12:11 +0000 Subject: Add tags (primary/related) to links This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired). --- cmd/crawl/crawl.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'cmd') diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 3954682..e31f63e 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -23,12 +23,13 @@ import ( ) var ( - dbPath = flag.String("state", "crawldb", "crawl state database path") - keepDb = flag.Bool("keep", false, "keep the state database when done") - concurrency = flag.Int("c", 10, "concurrent workers") - depth = flag.Int("depth", 10, "maximum link depth") - validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") - outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") + dbPath = flag.String("state", "crawldb", "crawl state database path") + keepDb = flag.Bool("keep", false, "keep the state database when done") + concurrency = flag.Int("c", 10, "concurrent workers") + depth = flag.Int("depth", 10, "maximum link depth") + validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") + alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)") + outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") ) func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { @@ -196,11 +197,14 @@ func main() { } seeds := crawl.MustParseURLs(flag.Args()) - scope := []crawl.Scope{ + scope := crawl.AND( crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewDepthScope(*depth), crawl.NewSeedScope(seeds), crawl.NewRegexpIgnoreScope(nil), + ) + if *alwaysIncludeRelated { + scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) } w := warc.NewWriter(outf) -- cgit v1.2.3-54-g00ecf