about summary refs log tree commit diff
path: root/cmd
diff options
context:
space:
mode:
author ale <ale@incal.net> 2017-12-19 10:16:08 +0000
committer ale <ale@incal.net> 2017-12-19 10:16:08 +0000
commit df800e154f3265f43a3758ac5071caba026ae585 (patch)
tree 4ddb94ea6ee6c06390860bc31e48f97d9a076caa /cmd
parent b06b0cd46156918b8a9ab3f60328c22dc60582ef (diff)
download crawl-df800e154f3265f43a3758ac5071caba026ae585.tar.gz
crawl-df800e154f3265f43a3758ac5071caba026ae585.zip
Provide better defaults for command-line options
Defaults that are more suitable to real-world site archiving.
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/crawl/crawl.go 16
1 file changed, 8 insertions, 8 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index e7e8582..0e5fc15 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -26,13 +26,13 @@ import (
)
var (
- dbPath = flag.String("state", "crawldb", "crawl state database path")
- keepDb = flag.Bool("keep", false, "keep the state database when done")
- concurrency = flag.Int("c", 10, "concurrent workers")
- depth = flag.Int("depth", 10, "maximum link depth")
- validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
- alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
- outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
+ dbPath = flag.String("state", "crawldb", "crawl state database path")
+ keepDb = flag.Bool("keep", false, "keep the state database when done")
+ concurrency = flag.Int("c", 10, "concurrent workers")
+ depth = flag.Int("depth", 100, "maximum link depth")
+ validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+ excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
+ outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
)
@@ -213,7 +213,7 @@ func main() {
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
)
- if *alwaysIncludeRelated {
+ if !*excludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}