diff options
author | ale <ale@incal.net> | 2018-08-31 09:57:06 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2018-08-31 09:57:06 +0100 |
commit | 70c12b7a5de3fe635f4f49aa7e249f5d6141d2af (patch) | |
tree | bb532cda2f759f77e61508600cfc1f23e37bb7ba /crawler.go | |
parent | 98e2528f410908e50b4be3a2d5f6ed2b5f32bd2c (diff) | |
download | crawl-70c12b7a5de3fe635f4f49aa7e249f5d6141d2af.tar.gz crawl-70c12b7a5de3fe635f4f49aa7e249f5d6141d2af.zip |
Improve error handling, part two
Handler errors are fatal, so that an error writing the WARC output
will cause the crawl to abort.
Diffstat (limited to 'crawler.go')
-rw-r--r-- | crawler.go | 14 |
1 file changed, 10 insertions, 4 deletions
```diff
@@ -20,6 +20,8 @@ import (
 	lutil "github.com/syndtr/goleveldb/leveldb/util"
 )
 
+var errorRetryDelay = 180 * time.Second
+
 type gobDB struct {
 	*leveldb.DB
 }
@@ -95,7 +97,7 @@ type URLInfo struct {
 	URL        string
 	StatusCode int
 	CrawledAt  time.Time
-	Error      error
+	Error      string
 }
 
 // A Fetcher retrieves contents from remote URLs.
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 			info.StatusCode = httpResp.StatusCode
 		}
 
-		// Invoke the handler (even if the fetcher errored out).
-		info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+		// Invoke the handler (even if the fetcher errored
+		// out). Errors in handling requests are fatal, crawl
+		// will be aborted.
+		Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
 
+		// Write the result in our database.
 		wb := new(leveldb.Batch)
 		if httpErr == nil {
 			respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 			// Remove the URL from the queue if the fetcher was successful.
 			c.queue.Release(wb, p)
 		} else {
+			info.Error = httpErr.Error()
 			log.Printf("error retrieving %s: %v", p.URL, httpErr)
-			Must(c.queue.Retry(wb, p, 300*time.Second))
+			Must(c.queue.Retry(wb, p, errorRetryDelay))
 		}
 
 		Must(c.db.PutObjBatch(wb, urlkey, &info))
```