diff options
author | ale <ale@incal.net> | 2017-12-19 10:16:08 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2017-12-19 10:16:08 +0000 |
commit | df800e154f3265f43a3758ac5071caba026ae585 (patch) | |
tree | 4ddb94ea6ee6c06390860bc31e48f97d9a076caa /cmd | |
parent | b06b0cd46156918b8a9ab3f60328c22dc60582ef (diff) | |
download | crawl-df800e154f3265f43a3758ac5071caba026ae585.tar.gz crawl-df800e154f3265f43a3758ac5071caba026ae585.zip |
Provide better defaults for command-line options
Defaults that are more suitable to real-world site archiving.
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/crawl/crawl.go | 16 |
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index e7e8582..0e5fc15 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -26,13 +26,13 @@ import (
 )

 var (
-	dbPath               = flag.String("state", "crawldb", "crawl state database path")
-	keepDb               = flag.Bool("keep", false, "keep the state database when done")
-	concurrency          = flag.Int("c", 10, "concurrent workers")
-	depth                = flag.Int("depth", 10, "maximum link depth")
-	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
-	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath         = flag.String("state", "crawldb", "crawl state database path")
+	keepDb         = flag.Bool("keep", false, "keep the state database when done")
+	concurrency    = flag.Int("c", 10, "concurrent workers")
+	depth          = flag.Int("depth", 100, "maximum link depth")
+	validSchemes   = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
+	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file")

 	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
 )
@@ -213,7 +213,7 @@ func main() {
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
 	)
-	if *alwaysIncludeRelated {
+	if !*excludeRelated {
 		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}

```