aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md1
-rw-r--r--cmd/crawl/crawl.go10
2 files changed, 11 insertions, 0 deletions
diff --git a/README.md b/README.md
index 5c740fc..128088c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
* update ignore regex set per updates to
[ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
* max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
This tool can crawl a bunch of URLs for HTML content, and save the
results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
}
}
+ // Write seed list (assembled URLs) to seed_urls file
+ f, err := os.Create("seed_urls")
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ for _, v := range seeds {
+ fmt.Fprintln(f, v)
+ }
+
w, err := warcWriterFromFlags(host)
if err != nil {
log.Fatal(err)