about | summary | refs | log | tree | commit | diff
path: root/crawler.go
diff options
context:
space:
mode:
author ale <ale@incal.net> 2018-08-31 09:57:06 +0100
committer ale <ale@incal.net> 2018-08-31 09:57:06 +0100
commit 70c12b7a5de3fe635f4f49aa7e249f5d6141d2af (patch)
tree bb532cda2f759f77e61508600cfc1f23e37bb7ba /crawler.go
parent 98e2528f410908e50b4be3a2d5f6ed2b5f32bd2c (diff)
download crawl-70c12b7a5de3fe635f4f49aa7e249f5d6141d2af.tar.gz
crawl-70c12b7a5de3fe635f4f49aa7e249f5d6141d2af.zip
Improve error handling, part two
Handler errors are fatal, so that an error writing the WARC output will cause the crawl to abort.
Diffstat (limited to 'crawler.go')
-rw-r--r-- crawler.go 14
1 files changed, 10 insertions, 4 deletions
diff --git a/crawler.go b/crawler.go
index f6670c1..d91d5b4 100644
--- a/crawler.go
+++ b/crawler.go
@@ -20,6 +20,8 @@ import (
lutil "github.com/syndtr/goleveldb/leveldb/util"
)
+var errorRetryDelay = 180 * time.Second
+
type gobDB struct {
*leveldb.DB
}
@@ -95,7 +97,7 @@ type URLInfo struct {
URL string
StatusCode int
CrawledAt time.Time
- Error error
+ Error string
}
// A Fetcher retrieves contents from remote URLs.
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
info.StatusCode = httpResp.StatusCode
}
- // Invoke the handler (even if the fetcher errored out).
- info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+ // Invoke the handler (even if the fetcher errored
+ // out). Errors in handling requests are fatal, crawl
+ // will be aborted.
+ Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
+ // Write the result in our database.
wb := new(leveldb.Batch)
if httpErr == nil {
respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
// Remove the URL from the queue if the fetcher was successful.
c.queue.Release(wb, p)
} else {
+ info.Error = httpErr.Error()
log.Printf("error retrieving %s: %v", p.URL, httpErr)
- Must(c.queue.Retry(wb, p, 300*time.Second))
+ Must(c.queue.Retry(wb, p, errorRetryDelay))
}
Must(c.db.PutObjBatch(wb, urlkey, &info))