diff options
-rw-r--r-- | README.md | 1 |
-rw-r--r-- | cmd/crawl/crawl.go | 10 |
2 files changed, 11 insertions, 0 deletions
@@ -15,6 +15,7 @@ Notable changes include:
 
 * update ignore regex set per updates to [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)