author     Jordan <me@jordan.im>  2022-02-14 21:02:12 -0700
committer  Jordan <me@jordan.im>  2022-02-14 21:02:12 -0700
commit     a6a6fef1c7cc7d6878e8aa36541565fb3e0c9747
tree       7928f9229c26a12917a2303408dd6ce4fb691432
parent     13996013034f19d0d5ddf00a2926d2a117610170
crawl, readme: record assembled seed URLs to seed_urls file
-rw-r--r--  README.md            1
-rw-r--r--  cmd/crawl/crawl.go  10
2 files changed, 11 insertions, 0 deletions
diff --git a/README.md b/README.md
index 5c740fc..128088c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
 * update ignore regex set per updates to
   [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)
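
For reference, a minimal standalone sketch of what the added block does, assuming seeds is a []string of assembled seed URLs (the example values below are placeholders, not taken from the crawler): create a seed_urls file in the working directory and write one URL per line.

package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// Placeholder for the seed list that crawl assembles earlier in main().
	seeds := []string{
		"https://example.com/",
		"https://example.org/page",
	}

	// Create (or truncate) the seed_urls file, as in the diff above.
	f, err := os.Create("seed_urls")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Write one seed URL per line, matching the fmt.Fprintln calls in the patch.
	for _, v := range seeds {
		fmt.Fprintln(f, v)
	}
}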