diff options
author | ale <ale@incal.net> | 2018-08-31 10:36:49 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2018-08-31 10:36:49 +0100 |
commit | 9825334954ec555a9798e8e9be1ac04093595793 (patch) | |
tree | af898b7f30294c5edc784de591f083ab21a3ebef | |
parent | 70c12b7a5de3fe635f4f49aa7e249f5d6141d2af (diff) | |
download | crawl-9825334954ec555a9798e8e9be1ac04093595793.tar.gz crawl-9825334954ec555a9798e8e9be1ac04093595793.zip |
Explicitly delegate retry logic to handlers
Makes it possible to retry requests for temporary HTTP errors (status
429 and any 5xx response).
-rw-r--r-- | cmd/crawl/crawl.go | 16 | ||||
-rw-r--r-- | cmd/links/links.go | 14 | ||||
-rw-r--r-- | crawler.go | 86 | ||||
-rw-r--r-- | crawler_test.go | 2 |
4 files changed, 74 insertions, 44 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 587b64a..cf2af5d 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -81,11 +81,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error { return w.Close() } -func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { - if err != nil { - return nil - } - +func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { // Read the response body (so we can save it to the WARC // output) and replace it with a buffer. data, derr := ioutil.ReadAll(resp.Body) @@ -113,7 +109,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht return werr } - return extractLinks(c, u, depth, resp, err) + return extractLinks(c, u, depth, resp, nil) } func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) { @@ -240,7 +236,13 @@ func main() { log.Fatal(err) } - crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver)) + crawler, err := crawl.NewCrawler( + *dbPath, + seeds, + scope, + crawl.FetcherFunc(fetch), + crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))), + ) if err != nil { log.Fatal(err) } diff --git a/cmd/links/links.go b/cmd/links/links.go index 5f76a6a..bf91f3f 100644 --- a/cmd/links/links.go +++ b/cmd/links/links.go @@ -20,11 +20,7 @@ var ( validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") ) -func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { - if err != nil { - return nil - } - +func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { links, err := analysis.GetLinks(resp) if err != nil { // Not a fatal error, just a bad web page. 
@@ -50,7 +46,13 @@ func main() { crawl.NewSeedScope(seeds), ) - crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(crawl.HandlerFunc(extractLinks))) + crawler, err := crawl.NewCrawler( + "crawldb", + seeds, + scope, + crawl.FetcherFunc(http.Get), + crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(crawl.HandlerFunc(extractLinks)))), + ) if err != nil { log.Fatal(err) } @@ -115,8 +115,9 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) { } // A Handler processes crawled contents. Any errors returned by public -// implementations of this interface are considered permanent and will -// not cause the URL to be fetched again. +// implementations of this interface are considered fatal and will +// cause the crawl to abort. The URL will be removed from the queue +// unless the handler returns the special error ErrRetryRequest. type Handler interface { // Handle the response from a URL. Handle(*Crawler, string, int, *http.Response, error) error @@ -130,6 +131,10 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons return f(db, u, depth, resp, err) } +// ErrRetryRequest is returned by a Handler when the request should be +// retried after some time. +var ErrRetryRequest = errors.New("retry_request") + // The Crawler object contains the crawler state. type Crawler struct { db *gobDB @@ -234,21 +239,22 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { // Invoke the handler (even if the fetcher errored // out). Errors in handling requests are fatal, crawl // will be aborted. - Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)) - - // Write the result in our database. - wb := new(leveldb.Batch) + err := c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr) if httpErr == nil { respBody.Close() // nolint + } - // Remove the URL from the queue if the fetcher was successful. 
+ wb := new(leveldb.Batch) + switch err { + case nil: c.queue.Release(wb, p) - } else { - info.Error = httpErr.Error() - log.Printf("error retrieving %s: %v", p.URL, httpErr) + case ErrRetryRequest: Must(c.queue.Retry(wb, p, errorRetryDelay)) + default: + log.Fatalf("fatal error in handling %s: %v", p.URL, err) } + // Write the result in our database. Must(c.db.PutObjBatch(wb, urlkey, &info)) Must(c.db.Write(wb, nil)) } @@ -327,37 +333,57 @@ func (c *Crawler) Close() { c.db.Close() // nolint } -type redirectHandler struct { - h Handler -} +// FollowRedirects returns a Handler that follows HTTP redirects +// and adds them to the queue for crawling. It will call the wrapped +// handler on all requests regardless. +func FollowRedirects(wrap Handler) Handler { + return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + if herr := wrap.Handle(c, u, depth, resp, err); herr != nil { + return herr + } -func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error { - if err != nil { - return err - } + if err != nil { + return nil + } - if resp.StatusCode == 200 { - err = wrap.h.Handle(c, u, depth, resp, err) - } else if resp.StatusCode > 300 && resp.StatusCode < 400 { location := resp.Header.Get("Location") - if location != "" { + if resp.StatusCode >= 300 && resp.StatusCode < 400 && location != "" { locationURL, uerr := resp.Request.URL.Parse(location) if uerr != nil { log.Printf("error parsing Location header: %v", uerr) } else { - Must(c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)) + return c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1) } } - } else { - err = errors.New(resp.Status) - } - return err + return nil + }) } -// NewRedirectHandler returns a Handler that follows HTTP redirects, -// and will call the wrapped handler on every request with HTTP status 200. 
-func NewRedirectHandler(wrap Handler) Handler { - return &redirectHandler{wrap} +// FilterErrors returns a Handler that forwards only requests with a +// "successful" HTTP status code (anything < 400). When using this +// wrapper, subsequent Handle calls will always have err set to nil. +func FilterErrors(wrap Handler) Handler { + return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + if err != nil { + return nil + } + if resp.StatusCode >= 400 { + return nil + } + return wrap.Handle(c, u, depth, resp, nil) + }) +} + +// HandleRetries returns a Handler that will retry requests on +// temporary errors (all transport-level errors are considered +// temporary, as well as any HTTP status code >= 500). +func HandleRetries(wrap Handler) Handler { + return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 { + return ErrRetryRequest + } + return wrap.Handle(c, u, depth, resp, nil) + }) } // Must will abort the program with a message when we encounter an diff --git a/crawler_test.go b/crawler_test.go index 66acbe4..fecc850 100644 --- a/crawler_test.go +++ b/crawler_test.go @@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) { return nil }) - crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h)) + crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), FollowRedirects(h)) if err != nil { t.Fatal("NewCrawler", err) } |