From 70c12b7a5de3fe635f4f49aa7e249f5d6141d2af Mon Sep 17 00:00:00 2001
From: ale
Date: Fri, 31 Aug 2018 09:57:06 +0100
Subject: Improve error handling, part two

Handler errors are fatal, so that an error writing the WARC output will
cause the crawl to abort.
---
 cmd/crawl/crawl.go | 12 +++++++-----
 cmd/links/links.go |  5 +++--
 crawler.go         | 14 ++++++++++----
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3d1120c..587b64a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -40,7 +40,8 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// This is not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
@@ -82,7 +83,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 
 func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	// Read the response body (so we can save it to the WARC
@@ -104,9 +105,10 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 
 	// Dump the response.
 	statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
-	respPayload := bytes.Join([][]byte{
-		[]byte(statusLine), hdr2str(resp.Header), data},
-		[]byte{'\r', '\n'})
+	respPayload := bytes.Join(
+		[][]byte{[]byte(statusLine), hdr2str(resp.Header), data},
+		[]byte{'\r', '\n'},
+	)
 	if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
 		return werr
 	}
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 9cd741f..5f76a6a 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -22,12 +22,13 @@ var (
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// Not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
diff --git a/crawler.go b/crawler.go
index f6670c1..d91d5b4 100644
--- a/crawler.go
+++ b/crawler.go
@@ -20,6 +20,8 @@ import (
 	lutil "github.com/syndtr/goleveldb/leveldb/util"
 )
 
+var errorRetryDelay = 180 * time.Second
+
 type gobDB struct {
 	*leveldb.DB
 }
@@ -95,7 +97,7 @@ type URLInfo struct {
 	URL        string
 	StatusCode int
 	CrawledAt  time.Time
-	Error      error
+	Error      string
 }
 
 // A Fetcher retrieves contents from remote URLs.
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		info.StatusCode = httpResp.StatusCode
 	}
 
-	// Invoke the handler (even if the fetcher errored out).
-	info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+	// Invoke the handler (even if the fetcher errored
+	// out). Errors in handling requests are fatal, crawl
+	// will be aborted.
+	Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
 
+	// Write the result in our database.
 	wb := new(leveldb.Batch)
 	if httpErr == nil {
 		respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		// Remove the URL from the queue if the fetcher was successful.
 		c.queue.Release(wb, p)
 	} else {
+		info.Error = httpErr.Error()
 		log.Printf("error retrieving %s: %v", p.URL, httpErr)
-		Must(c.queue.Retry(wb, p, 300*time.Second))
+		Must(c.queue.Retry(wb, p, errorRetryDelay))
 	}
 
 	Must(c.db.PutObjBatch(wb, urlkey, &info))
-- 
cgit v1.2.3-54-g00ecf
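
Note on the Must() helper the diff relies on: it is defined elsewhere in
crawler.go and is not shown in this patch. A minimal sketch, assuming it
simply aborts the process on a non-nil error, could look like the
following (hypothetical reconstruction; the real definition may differ):

    package crawl

    import "log"

    // Must aborts the crawl when passed a non-nil error, so that
    // handler failures (such as a failed WARC write) stop the run.
    func Must(err error) {
        if err != nil {
            log.Fatalf("fatal error: %v", err)
        }
    }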