diff options
author | ale <ale@incal.net> | 2017-12-19 00:12:11 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2017-12-19 00:12:11 +0000 |
commit | 4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch) | |
tree | c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /cmd | |
parent | 77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff) | |
download | crawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.tar.gz crawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.zip |
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening
the scope edges to also include resources related to HTML pages (CSS,
images, etc.), which makes for more complete archives if desired.
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/crawl/crawl.go | 18 |
1 file changed, 11 insertions, 7 deletions
```diff
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3954682..e31f63e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -23,12 +23,13 @@ import (
 )
 
 var (
-	dbPath       = flag.String("state", "crawldb", "crawl state database path")
-	keepDb       = flag.Bool("keep", false, "keep the state database when done")
-	concurrency  = flag.Int("c", 10, "concurrent workers")
-	depth        = flag.Int("depth", 10, "maximum link depth")
-	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath               = flag.String("state", "crawldb", "crawl state database path")
+	keepDb               = flag.Bool("keep", false, "keep the state database when done")
+	concurrency          = flag.Int("c", 10, "concurrent workers")
+	depth                = flag.Int("depth", 10, "maximum link depth")
+	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
 )
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := []crawl.Scope{
+	scope := crawl.AND(
 		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
 		crawl.NewDepthScope(*depth),
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
+	)
+	if *alwaysIncludeRelated {
+		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}
 
 	w := warc.NewWriter(outf)
```