From 2191536b9ea9084525f649dff38b88ac40d222f2 Mon Sep 17 00:00:00 2001
From: Jordan
Date: Thu, 10 Feb 2022 11:40:46 -0700
Subject: crawl, scope: recurse infinitely by default

---
 cmd/crawl/crawl.go | 2 +-
 scope.go           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 4a98b02..90f19d6 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -32,7 +32,7 @@ var (
 	dbPath         = flag.String("state", "crawldb", "crawl state database path")
 	keepDb         = flag.Bool("keep", false, "keep the state database when done")
 	concurrency    = flag.Int("c", 10, "concurrent workers")
-	depth          = flag.Int("depth", 100, "maximum link depth")
+	depth          = flag.Int("depth", -1, "maximum link depth")
 	validSchemes   = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 	excludeRelated = flag.Bool("exclude-related", false, "do not include related resources (css, images, etc) if their URL is not in scope")
 	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
diff --git a/scope.go b/scope.go
index bda1035..c53182f 100644
--- a/scope.go
+++ b/scope.go
@@ -18,7 +18,7 @@ type maxDepthScope struct {
 }
 
 func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
-	return depth < s.maxDepth
+	return depth < s.maxDepth || s.maxDepth == -1
 }
 
 // NewDepthScope returns a Scope that will limit crawls to a
-- 
cgit v1.2.3-54-g00ecf