From df800e154f3265f43a3758ac5071caba026ae585 Mon Sep 17 00:00:00 2001 From: ale Date: Tue, 19 Dec 2017 10:16:08 +0000 Subject: Provide better defaults for command-line options Defaults that are more suitable to real-world site archiving. --- README.md | 4 ++-- cmd/crawl/crawl.go | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 34360fa..39b0cea 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,8 @@ The crawling scope is controlled with a set of overlapping checks: prefix is implicitly ignored) * maximum crawling depth can be controlled with the *--depth* option * resources related to a page (CSS, JS, etc) will always be fetched, - even if on external domains, if the *--include-related* option is - specified + even if on external domains, unless the *--exclude-related* option + is specified If the program is interrupted, running it again with the same command line from the same directory will cause it to resume crawling from diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index e7e8582..0e5fc15 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -26,13 +26,13 @@ import ( ) var ( - dbPath = flag.String("state", "crawldb", "crawl state database path") - keepDb = flag.Bool("keep", false, "keep the state database when done") - concurrency = flag.Int("c", 10, "concurrent workers") - depth = flag.Int("depth", 10, "maximum link depth") - validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") - alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)") - outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") + dbPath = flag.String("state", "crawldb", "crawl state database path") + keepDb = flag.Bool("keep", false, "keep the state database when done") + concurrency = flag.Int("c", 10, "concurrent workers") + depth = flag.Int("depth", 100, "maximum link depth") + validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") + excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope") + outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") cpuprofile = flag.String("cpuprofile", "", "create cpu profile") ) @@ -213,7 +213,7 @@ func main() { crawl.NewSeedScope(seeds), crawl.NewRegexpIgnoreScope(nil), ) - if *alwaysIncludeRelated { + if !*excludeRelated { scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) } -- cgit v1.2.3-54-g00ecf