From b06e5a296b16d0080afd36470b0ba59c7b8a5bc2 Mon Sep 17 00:00:00 2001
From: ale
Date: Mon, 29 Jun 2015 10:24:42 +0100
Subject: clean up the state directory when done

---
Note: line breaks, indentation and blank context lines in this copy were
reconstructed from a whitespace-collapsed export. If a hunk fails to match,
apply with `git apply --ignore-whitespace`. The index hashes below are those
of the original commit; the added lines additionally report a failed
os.RemoveAll and document Crawler.Close.

 cmd/crawl/crawl.go | 11 ++++++++++-
 crawler.go         |  5 +++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index d68ac5e..de45494 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -24,6 +24,7 @@ import (
 
 var (
 	dbPath       = flag.String("state", "crawldb", "crawl state database path")
+	keepDb       = flag.Bool("keep", false, "keep the state database when done")
 	concurrency  = flag.Int("c", 10, "concurrent workers")
 	depth        = flag.Int("depth", 10, "maximum link depth")
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
@@ -207,9 +208,17 @@ func main() {
 
 	saver := NewSaveHandler(w)
 
-	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+	crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
 	if err != nil {
 		log.Fatal(err)
 	}
 	crawler.Run(*concurrency)
+
+	// Close the state database before (optionally) deleting its directory.
+	crawler.Close()
+	if !*keepDb {
+		if err := os.RemoveAll(*dbPath); err != nil {
+			log.Printf("removing state directory %q: %v", *dbPath, err)
+		}
+	}
 }
diff --git a/crawler.go b/crawler.go
index c337d97..d162330 100644
--- a/crawler.go
+++ b/crawler.go
@@ -319,6 +319,11 @@ func (c *Crawler) Run(concurrency int) {
 	wg.Wait()
 }
 
+// Close closes the crawler's state database (c.db).
+func (c *Crawler) Close() {
+	c.db.Close()
+}
+
 type redirectHandler struct {
 	h Handler
 }
-- 
cgit v1.2.3-54-g00ecf