author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /cmd
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening the scope's edges a bit to pull in resources related to HTML pages (which makes for more complete archives, when desired).
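The primary/related distinction can be pictured as a small classifier over the HTML elements a link was found in: navigation targets are documents in their own right, while page dependencies only matter for rendering. The sketch below is illustrative only; the Outlink type and the TagPrimary/TagRelated names are modeled on the commit message, not shown in this diff.

package sketch

import "net/url"

// Tags attached to extracted links (assumed names, modeled on the
// commit message).
const (
	TagPrimary = iota // a document in its own right (<a>, <frame>, ...)
	TagRelated        // a page dependency (<img>, <script>, <link>, ...)
)

// Outlink pairs a parsed URL with its tag (illustrative type).
type Outlink struct {
	URL *url.URL
	Tag int
}

// tagForElement guesses a tag from the element a URL was found on:
// navigation elements yield primary links, everything else is a
// related resource.
func tagForElement(elem string) int {
	switch elem {
	case "a", "frame", "iframe":
		return TagPrimary
	default:
		return TagRelated
	}
}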
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/crawl/crawl.go  18
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3954682..e31f63e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -23,12 +23,13 @@ import (
 )
 
 var (
-	dbPath       = flag.String("state", "crawldb", "crawl state database path")
-	keepDb       = flag.Bool("keep", false, "keep the state database when done")
-	concurrency  = flag.Int("c", 10, "concurrent workers")
-	depth        = flag.Int("depth", 10, "maximum link depth")
-	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath               = flag.String("state", "crawldb", "crawl state database path")
+	keepDb               = flag.Bool("keep", false, "keep the state database when done")
+	concurrency          = flag.Int("c", 10, "concurrent workers")
+	depth                = flag.Int("depth", 10, "maximum link depth")
+	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
 )
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := []crawl.Scope{
+	scope := crawl.AND(
 		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
 		crawl.NewDepthScope(*depth),
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
+	)
+	if *alwaysIncludeRelated {
+		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}
 
 	w := warc.NewWriter(outf)
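Read as boolean logic, crawl.AND requires every sub-scope to accept a link, while crawl.OR fires as soon as any one does; wrapping the strict scope in an OR with the include-related scope is what loosens the boundary for page dependencies. Below is a minimal sketch of such combinators, assuming a Scope interface with a single Check predicate; the package's real signature may carry more context (for instance the link's tag).

package sketch

import "net/url"

// Scope decides whether a URL is within crawl bounds. The two-argument
// Check predicate here is an assumption for illustration.
type Scope interface {
	Check(u *url.URL, depth int) bool
}

type andScope []Scope

// Check accepts a URL only if every sub-scope accepts it.
func (s andScope) Check(u *url.URL, depth int) bool {
	for _, sc := range s {
		if !sc.Check(u, depth) {
			return false
		}
	}
	return true
}

type orScope []Scope

// Check accepts a URL as soon as any sub-scope accepts it.
func (s orScope) Check(u *url.URL, depth int) bool {
	for _, sc := range s {
		if sc.Check(u, depth) {
			return true
		}
	}
	return false
}

// AND and OR combine scopes the way cmd/crawl composes them above.
func AND(elems ...Scope) Scope { return andScope(elems) }
func OR(elems ...Scope) Scope  { return orScope(elems) }

With this composition, an invocation along the lines of "crawl -include-related https://example.com/" would pull in off-site stylesheets and images that the seed scope alone would reject, while primary links still have to pass the full AND chain.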