author    Jordan <me@jordan.im>  2022-02-10 13:58:06 -0700
committer Jordan <me@jordan.im>  2022-02-10 13:58:06 -0700
commit    392cd30dda49503c967b1239f72fe53b98ba63cc (patch)
tree      2f51ea8a6365940953f34131c04b2a043d510562
parent    2191536b9ea9084525f649dff38b88ac40d222f2 (diff)
download  crawl-392cd30dda49503c967b1239f72fe53b98ba63cc.tar.gz
          crawl-392cd30dda49503c967b1239f72fe53b98ba63cc.zip
crawl: create new directory to store crawl contents, resume param
-rw-r--r--  cmd/crawl/crawl.go | 39
1 file changed, 27 insertions, 12 deletions
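
This commit makes the crawler write each crawl into its own directory, named after the first seed URL's host, and adds a -resume flag that re-enters an existing crawl directory instead of creating a new one. Below is a minimal, self-contained sketch of that directory-selection logic; the chooseCrawlDir helper and the example.com host are illustrative only, since the commit inlines this logic in main() and derives the host from seeds[0].Host.

package main

import (
	"flag"
	"log"
	"os"
)

// chooseCrawlDir mirrors the directory-selection logic this commit adds:
// with -resume, change into the previous crawl's directory; otherwise
// create (if missing) and enter a directory named after the first seed's host.
// The helper name is hypothetical; the commit inlines this in main().
func chooseCrawlDir(resumeDir, host string) error {
	if resumeDir != "" {
		return os.Chdir(resumeDir)
	}
	if _, err := os.Stat(host); os.IsNotExist(err) {
		if err := os.Mkdir(host, 0700); err != nil {
			return err
		}
	}
	return os.Chdir(host)
}

func main() {
	resumeDir := flag.String("resume", "", "path to directory of previous crawl to resume")
	flag.Parse()

	// "example.com" stands in for seeds[0].Host in the real crawler.
	if err := chooseCrawlDir(*resumeDir, "example.com"); err != nil {
		log.Fatal(err)
	}
	log.Println("WARC output will be written to the current directory")
}

With no -resume argument, a first run seeded with https://example.com/ creates ./example.com/, changes into it, and writes WARC output there using the host-based example.com-%s.warc.gz naming pattern; re-running with -resume example.com continues in that same directory.
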
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 90f19d6..b3ab049 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -35,7 +35,7 @@ var (
depth = flag.Int("depth", -1, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
excludeRelated = flag.Bool("exclude-related", false, "do not include related resources (css, images, etc) if their URL is not in scope")
- outputFile = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
+ resumeDir = flag.String("resume", "", "path to directory of previous crawl to resume")
warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
@@ -267,16 +267,8 @@ func (b *byteCounter) Read(buf []byte) (int, error) {
return n, err
}
-func warcWriterFromFlags() (w *warc.Writer, err error) {
- if strings.Contains(*outputFile, "%s") {
- w, err = warc.NewMultiWriter(*outputFile, uint64(*warcFileSizeMB)*1024*1024)
- } else {
- var f *os.File
- f, err = os.Create(*outputFile)
- if err == nil {
- w = warc.NewWriter(f)
- }
- }
+func warcWriterFromFlags(host string) (w *warc.Writer, err error) {
+ w, err = warc.NewMultiWriter(host+"-%s.warc.gz", uint64(*warcFileSizeMB)*1024*1024)
return
}
@@ -296,6 +288,9 @@ func main() {
}
seeds := crawl.MustParseURLs(flag.Args())
+ if len(seeds) == 0 {
+ log.Fatal("no seed URL provided")
+ }
scope := crawl.AND(
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
@@ -306,7 +301,27 @@ func main() {
scope = crawl.AND(crawl.OR(scope, crawl.NewIncludeRelatedScope()), crawl.NewRegexpIgnoreScope(excludes))
}
- w, err := warcWriterFromFlags()
+ // Use first URL's host in crawl directory/file names
+ host := seeds[0].Host
+ if *resumeDir != "" {
+ err := os.Chdir(*resumeDir)
+ if err != nil {
+ log.Fatal(err)
+ }
+ } else {
+ if _, err := os.Stat(host); os.IsNotExist(err) {
+ err := os.Mkdir(host, 0700)
+ if err != nil {
+ log.Fatal(err)
+ }
+ }
+ err := os.Chdir(host)
+ if err != nil {
+ log.Fatal(err)
+ }
+ }
+
+ w, err := warcWriterFromFlags(host)
if err != nil {
log.Fatal(err)
}