From a6a6fef1c7cc7d6878e8aa36541565fb3e0c9747 Mon Sep 17 00:00:00 2001
From: Jordan
Date: Mon, 14 Feb 2022 21:02:12 -0700
Subject: crawl, readme: record assembled seed URLs to seed_urls file

---
 README.md          |  1 +
 cmd/crawl/crawl.go | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 5c740fc..128088c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
 * update ignore regex set per updates to
   [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)
--
cgit v1.2.3-54-g00ecf