author     Jordan <me@jordan.im>    2022-02-14 21:02:12 -0700
committer  Jordan <me@jordan.im>    2022-02-14 21:02:12 -0700
commit     a6a6fef1c7cc7d6878e8aa36541565fb3e0c9747
tree       7928f9229c26a12917a2303408dd6ce4fb691432
parent     13996013034f19d0d5ddf00a2926d2a117610170
crawl, readme: record assembled seed URLs to seed_urls file
 README.md          |  1 +
 cmd/crawl/crawl.go | 10 ++++++++++
 2 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ Notable changes include:
 * update ignore regex set per updates to [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot)
 * max default WARC size 100 MB -> 5 GB
+* record assembled seed URLs to seed_urls file
 
 This tool can crawl a bunch of URLs for HTML content, and save the
 results in a nice WARC file. It has little control over its traffic,
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8c20901..63fda1a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -327,6 +327,16 @@ func main() {
 		}
 	}
 
+	// Write seed list (assembled URLs) to seed_urls file
+	f, err := os.Create("seed_urls")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	for _, v := range seeds {
+		fmt.Fprintln(f, v)
+	}
+
 	w, err := warcWriterFromFlags(host)
 	if err != nil {
 		log.Fatal(err)
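
For context, a minimal standalone sketch of the pattern this hunk adds (create a seed_urls file and record one assembled seed URL per line) is shown below. The example seed values and the file name seed_dump.go are illustrative only; nothing here beyond the diff itself is taken from the crawl codebase.

// seed_dump.go: sketch of dumping a seed list to a file, one URL per
// line, mirroring the change made in cmd/crawl/crawl.go.
package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// Placeholder seeds; in the crawler these are the assembled seed URLs.
	seeds := []string{
		"https://example.com/",
		"https://example.org/page",
	}

	// Create (or truncate) seed_urls in the current working directory.
	f, err := os.Create("seed_urls")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Record each seed URL on its own line.
	for _, v := range seeds {
		fmt.Fprintln(f, v)
	}
}

One trade-off in this pattern: defer f.Close() discards any error from Close. For a small list written once at startup that is usually acceptable; checking the Close error explicitly (or writing through a bufio.Writer and checking Flush) would surface write failures more reliably.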