From caadc00d8dfadc0c9e0237fc7377eb632f500926 Mon Sep 17 00:00:00 2001
From: Jordan <me@jordan.im>
Date: Thu, 10 Feb 2022 20:19:27 -0700
Subject: crawl, readme: max default WARC size 100 MB -> 5 GB

---
 README.md          | 1 +
 cmd/crawl/crawl.go | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0f2298e..c7124a0 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Notable changes include:
   a browser
 * store crawl contents in a dated directory
 * update ignore regex set per updates to [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
+* raise the default maximum WARC file size from 100 MB to 5 GB
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index ea88412..8c20901 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -37,7 +37,7 @@ var (
 	validSchemes   = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 	excludeRelated = flag.Bool("exclude-related", false, "do not include related resources (css, images, etc) if their URL is not in scope")
 	resumeDir      = flag.String("resume", "", "path to directory of previous crawl to resume")
-	warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
+	warcFileSizeMB = flag.Int("output-max-size", 5000, "maximum output WARC file size (in MB)")
 	cpuprofile     = flag.String("cpuprofile", "", "create cpu profile")
 	bindIP         = flag.String("bind", "", "IP address from which to make outbound connections")
 
-- 
cgit v1.2.3-54-g00ecf