aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/crawl/crawl.go16
-rw-r--r--cmd/links/links.go14
-rw-r--r--crawler.go86
-rw-r--r--crawler_test.go2
4 files changed, 74 insertions, 44 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 587b64a..cf2af5d 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -81,11 +81,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
return w.Close()
}
-func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
- if err != nil {
- return nil
- }
-
+func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
// Read the response body (so we can save it to the WARC
// output) and replace it with a buffer.
data, derr := ioutil.ReadAll(resp.Body)
@@ -113,7 +109,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
return werr
}
- return extractLinks(c, u, depth, resp, err)
+ return extractLinks(c, u, depth, resp, nil)
}
func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {
@@ -240,7 +236,13 @@ func main() {
log.Fatal(err)
}
- crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+ crawler, err := crawl.NewCrawler(
+ *dbPath,
+ seeds,
+ scope,
+ crawl.FetcherFunc(fetch),
+ crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+ )
if err != nil {
log.Fatal(err)
}
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 5f76a6a..bf91f3f 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -20,11 +20,7 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
)
-func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
- if err != nil {
- return nil
- }
-
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
// Not a fatal error, just a bad web page.
@@ -50,7 +46,13 @@ func main() {
crawl.NewSeedScope(seeds),
)
- crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(crawl.HandlerFunc(extractLinks)))
+ crawler, err := crawl.NewCrawler(
+ "crawldb",
+ seeds,
+ scope,
+ crawl.FetcherFunc(http.Get),
+ crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(crawl.HandlerFunc(extractLinks)))),
+ )
if err != nil {
log.Fatal(err)
}
diff --git a/crawler.go b/crawler.go
index d91d5b4..9776cfc 100644
--- a/crawler.go
+++ b/crawler.go
@@ -115,8 +115,9 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) {
}
// A Handler processes crawled contents. Any errors returned by public
-// implementations of this interface are considered permanent and will
-// not cause the URL to be fetched again.
+// implementations of this interface are considered fatal and will
+// cause the crawl to abort. The URL will be removed from the queue
+// unless the handler returns the special error ErrRetryRequest.
type Handler interface {
// Handle the response from a URL.
Handle(*Crawler, string, int, *http.Response, error) error
@@ -130,6 +131,10 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons
return f(db, u, depth, resp, err)
}
+// ErrRetryRequest is returned by a Handler when the request should be
+// retried after some time.
+var ErrRetryRequest = errors.New("retry_request")
+
// The Crawler object contains the crawler state.
type Crawler struct {
db *gobDB
@@ -234,21 +239,22 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
// Invoke the handler (even if the fetcher errored
// out). Errors in handling requests are fatal, crawl
// will be aborted.
- Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
-
- // Write the result in our database.
- wb := new(leveldb.Batch)
+ err := c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
if httpErr == nil {
respBody.Close() // nolint
+ }
- // Remove the URL from the queue if the fetcher was successful.
+ wb := new(leveldb.Batch)
+ switch err {
+ case nil:
c.queue.Release(wb, p)
- } else {
- info.Error = httpErr.Error()
- log.Printf("error retrieving %s: %v", p.URL, httpErr)
+ case ErrRetryRequest:
Must(c.queue.Retry(wb, p, errorRetryDelay))
+ default:
+ log.Fatalf("fatal error in handling %s: %v", p.URL, err)
}
+ // Write the result in our database.
Must(c.db.PutObjBatch(wb, urlkey, &info))
Must(c.db.Write(wb, nil))
}
@@ -327,37 +333,57 @@ func (c *Crawler) Close() {
c.db.Close() // nolint
}
-type redirectHandler struct {
- h Handler
-}
+// FollowRedirects returns a Handler that follows HTTP redirects
+// and adds them to the queue for crawling. It will call the wrapped
+// handler on all requests regardless.
+func FollowRedirects(wrap Handler) Handler {
+ return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ if herr := wrap.Handle(c, u, depth, resp, err); herr != nil {
+ return herr
+ }
-func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error {
- if err != nil {
- return err
- }
+ if err != nil {
+ return nil
+ }
- if resp.StatusCode == 200 {
- err = wrap.h.Handle(c, u, depth, resp, err)
- } else if resp.StatusCode > 300 && resp.StatusCode < 400 {
location := resp.Header.Get("Location")
- if location != "" {
+ if resp.StatusCode >= 300 && resp.StatusCode < 400 && location != "" {
locationURL, uerr := resp.Request.URL.Parse(location)
if uerr != nil {
log.Printf("error parsing Location header: %v", uerr)
} else {
- Must(c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1))
+ return c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
}
}
- } else {
- err = errors.New(resp.Status)
- }
- return err
+ return nil
+ })
}
-// NewRedirectHandler returns a Handler that follows HTTP redirects,
-// and will call the wrapped handler on every request with HTTP status 200.
-func NewRedirectHandler(wrap Handler) Handler {
- return &redirectHandler{wrap}
+// FilterErrors returns a Handler that forwards only requests with a
+// "successful" HTTP status code (anything < 400). When using this
+// wrapper, subsequent Handle calls will always have err set to nil.
+func FilterErrors(wrap Handler) Handler {
+ return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ if err != nil {
+ return nil
+ }
+ if resp.StatusCode >= 400 {
+ return nil
+ }
+ return wrap.Handle(c, u, depth, resp, nil)
+ })
+}
+
+// HandleRetries returns a Handler that will retry requests on
+// temporary errors (all transport-level errors are considered
+// temporary, as well as any HTTP status code >= 500).
+func HandleRetries(wrap Handler) Handler {
+ return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 {
+ return ErrRetryRequest
+ }
+ return wrap.Handle(c, u, depth, resp, nil)
+ })
}
// Must will abort the program with a message when we encounter an
diff --git a/crawler_test.go b/crawler_test.go
index 66acbe4..fecc850 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
return nil
})
- crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+ crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), FollowRedirects(h))
if err != nil {
t.Fatal("NewCrawler", err)
}