From 9825334954ec555a9798e8e9be1ac04093595793 Mon Sep 17 00:00:00 2001 From: ale Date: Fri, 31 Aug 2018 10:36:49 +0100 Subject: Explicitly delegate retry logic to handlers Makes it possible to retry requests for temporary HTTP errors (429, 500, etc). --- cmd/crawl/crawl.go | 16 +++++++++------- cmd/links/links.go | 14 ++++++++------ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'cmd') diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 587b64a..cf2af5d 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -81,11 +81,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error { return w.Close() } -func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { - if err != nil { - return nil - } - +func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { // Read the response body (so we can save it to the WARC // output) and replace it with a buffer. data, derr := ioutil.ReadAll(resp.Body) @@ -113,7 +109,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht return werr } - return extractLinks(c, u, depth, resp, err) + return extractLinks(c, u, depth, resp, nil) } func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) { @@ -240,7 +236,13 @@ func main() { log.Fatal(err) } - crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver)) + crawler, err := crawl.NewCrawler( + *dbPath, + seeds, + scope, + crawl.FetcherFunc(fetch), + crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))), + ) if err != nil { log.Fatal(err) } diff --git a/cmd/links/links.go b/cmd/links/links.go index 5f76a6a..bf91f3f 100644 --- a/cmd/links/links.go +++ b/cmd/links/links.go @@ -20,11 +20,7 @@ var ( validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") ) -func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { - if err != nil { - return nil - } - +func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { links, err := analysis.GetLinks(resp) if err != nil { // Not a fatal error, just a bad web page. @@ -50,7 +46,13 @@ func main() { crawl.NewSeedScope(seeds), ) - crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(crawl.HandlerFunc(extractLinks))) + crawler, err := crawl.NewCrawler( + "crawldb", + seeds, + scope, + crawl.FetcherFunc(http.Get), + crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(crawl.HandlerFunc(extractLinks)))), + ) if err != nil { log.Fatal(err) } -- cgit v1.2.3-54-g00ecf