From a6a6fef1c7cc7d6878e8aa36541565fb3e0c9747 Mon Sep 17 00:00:00 2001
From: Jordan
Date: Mon, 14 Feb 2022 21:02:12 -0700
Subject: crawl, readme: record assembled seed URLs to seed_urls file

---
 README.md          |  1 +
 cmd/crawl/crawl.go | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 5c740fc..128088c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
 * update ignore regex set per updates to
   [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)
--
cgit v1.2.3-54-g00ecf