diff options
author | ale <ale@incal.net> | 2015-06-29 10:24:42 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2015-06-29 10:24:42 +0100 |
commit | b06e5a296b16d0080afd36470b0ba59c7b8a5bc2 (patch) | |
tree | 34431afd15443264b45b49272c9d42ca05b1c9f4 | |
parent | 9fbc656c6cd2ad610986a265c6b346bc234bb881 (diff) | |
download | crawl-b06e5a296b16d0080afd36470b0ba59c7b8a5bc2.tar.gz crawl-b06e5a296b16d0080afd36470b0ba59c7b8a5bc2.zip |
clean up the state directory when done
-rw-r--r-- | cmd/crawl/crawl.go | 8 | ||||
-rw-r--r-- | crawler.go | 4 |
2 files changed, 11 insertions, 1 deletion
```diff
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index d68ac5e..de45494 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -24,6 +24,7 @@ import (
 
 var (
 	dbPath       = flag.String("state", "crawldb", "crawl state database path")
+	keepDb       = flag.Bool("keep", false, "keep the state database when done")
 	concurrency  = flag.Int("c", 10, "concurrent workers")
 	depth        = flag.Int("depth", 10, "maximum link depth")
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
@@ -207,9 +208,14 @@ func main() {
 
 	saver := NewSaveHandler(w)
 
-	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+	crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
 	if err != nil {
 		log.Fatal(err)
 	}
 	crawler.Run(*concurrency)
+
+	crawler.Close()
+	if !*keepDb {
+		os.RemoveAll(*dbPath)
+	}
 }
diff --git a/crawler.go b/crawler.go
--- a/crawler.go
+++ b/crawler.go
@@ -319,6 +319,10 @@ func (c *Crawler) Run(concurrency int) {
 	wg.Wait()
 }
 
+func (c *Crawler) Close() {
+	c.db.Close()
+}
+
 type redirectHandler struct {
 	h Handler
 }
```