author     Jordan <me@jordan.im>    2022-02-14 21:02:12 -0700
committer  Jordan <me@jordan.im>    2022-02-14 21:02:12 -0700
commit     a6a6fef1c7cc7d6878e8aa36541565fb3e0c9747
tree       7928f9229c26a12917a2303408dd6ce4fb691432
parent     13996013034f19d0d5ddf00a2926d2a117610170
crawl, readme: record assembled seed URLs to seed_urls file
 README.md          |  1 +
 cmd/crawl/crawl.go | 10 ++++++++++
 2 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
 * update ignore regex set per updates to [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)
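
For context, a minimal standalone sketch of the pattern this hunk adds (create a seed_urls file and record one assembled seed URL per line) is shown below. The example seed values and the file name seed_dump.go are illustrative only; nothing here beyond the diff itself is taken from the crawl codebase.

// seed_dump.go: sketch of dumping a seed list to a file, one URL per
// line, mirroring the change made in cmd/crawl/crawl.go.
package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// Placeholder seeds; in the crawler these are the assembled seed URLs.
	seeds := []string{
		"https://example.com/",
		"https://example.org/page",
	}

	// Create (or truncate) seed_urls in the current working directory.
	f, err := os.Create("seed_urls")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Record each seed URL on its own line.
	for _, v := range seeds {
		fmt.Fprintln(f, v)
	}
}

One trade-off in this pattern: defer f.Close() discards any error from Close. For a small list written once at startup that is usually acceptable; checking the Close error explicitly (or writing through a bufio.Writer and checking Flush) would surface write failures more reliably.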