about summary refs log tree commit diff
diff options
context:
space:
mode:
author ale <ale@incal.net> 2015-06-29 10:24:42 +0100
committer ale <ale@incal.net> 2015-06-29 10:24:42 +0100
commit b06e5a296b16d0080afd36470b0ba59c7b8a5bc2 (patch)
tree 34431afd15443264b45b49272c9d42ca05b1c9f4
parent 9fbc656c6cd2ad610986a265c6b346bc234bb881 (diff)
download crawl-b06e5a296b16d0080afd36470b0ba59c7b8a5bc2.tar.gz
crawl-b06e5a296b16d0080afd36470b0ba59c7b8a5bc2.zip
clean up the state directory when done
-rw-r--r-- cmd/crawl/crawl.go 8
-rw-r--r-- crawler.go 4
2 files changed, 11 insertions, 1 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index d68ac5e..de45494 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -24,6 +24,7 @@ import (
var (
dbPath = flag.String("state", "crawldb", "crawl state database path")
+ keepDb = flag.Bool("keep", false, "keep the state database when done")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
@@ -207,9 +208,14 @@ func main() {
saver := NewSaveHandler(w)
- crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+ crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
if err != nil {
log.Fatal(err)
}
crawler.Run(*concurrency)
+
+ crawler.Close()
+ if !*keepDb {
+ os.RemoveAll(*dbPath)
+ }
}
diff --git a/crawler.go b/crawler.go
index c337d97..d162330 100644
--- a/crawler.go
+++ b/crawler.go
@@ -319,6 +319,10 @@ func (c *Crawler) Run(concurrency int) {
wg.Wait()
}
+func (c *Crawler) Close() {
+ c.db.Close()
+}
+
type redirectHandler struct {
h Handler
}