From 392cd30dda49503c967b1239f72fe53b98ba63cc Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 10 Feb 2022 13:58:06 -0700 Subject: crawl: create new directory to store crawl contents, resume param --- cmd/crawl/crawl.go | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 90f19d6..b3ab049 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -35,7 +35,7 @@ var ( depth = flag.Int("depth", -1, "maximum link depth") validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") excludeRelated = flag.Bool("exclude-related", false, "do not include related resources (css, images, etc) if their URL is not in scope") - outputFile = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)") + resumeDir = flag.String("resume", "", "path to directory of previous crawl to resume") warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns") cpuprofile = flag.String("cpuprofile", "", "create cpu profile") @@ -267,16 +267,8 @@ func (b *byteCounter) Read(buf []byte) (int, error) { return n, err } -func warcWriterFromFlags() (w *warc.Writer, err error) { - if strings.Contains(*outputFile, "%s") { - w, err = warc.NewMultiWriter(*outputFile, uint64(*warcFileSizeMB)*1024*1024) - } else { - var f *os.File - f, err = os.Create(*outputFile) - if err == nil { - w = warc.NewWriter(f) - } - } +func warcWriterFromFlags(host string) (w *warc.Writer, err error) { + w, err = warc.NewMultiWriter(host+"-%s.warc.gz", uint64(*warcFileSizeMB)*1024*1024) return } @@ -296,6 +288,9 @@ func main() { } seeds := crawl.MustParseURLs(flag.Args()) + if len(seeds) == 0 { + log.Fatal("no seed URL provided") + } scope := crawl.AND( crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewDepthScope(*depth), @@ -306,7 +301,27 @@ func main() { scope = crawl.AND(crawl.OR(scope, crawl.NewIncludeRelatedScope()), crawl.NewRegexpIgnoreScope(excludes)) } - w, err := warcWriterFromFlags() + // Use first URL's host in crawl directory/file names + host := seeds[0].Host + if *resumeDir != "" { + err := os.Chdir(*resumeDir) + if err != nil { + log.Fatal(err) + } + } else { + if _, err := os.Stat(host); os.IsNotExist(err) { + err := os.Mkdir(host, 0700) + if err != nil { + log.Fatal(err) + } + } + err := os.Chdir(host) + if err != nil { + log.Fatal(err) + } + } + + w, err := warcWriterFromFlags(host) if err != nil { log.Fatal(err) } -- cgit v1.2.3-54-g00ecf