author    ale <ale@incal.net>  2018-08-31 10:36:49 +0100
committer ale <ale@incal.net>  2018-08-31 10:36:49 +0100
commit    9825334954ec555a9798e8e9be1ac04093595793 (patch)
tree      af898b7f30294c5edc784de591f083ab21a3ebef /cmd
parent    70c12b7a5de3fe635f4f49aa7e249f5d6141d2af (diff)
Explicitly delegate retry logic to handlers
Makes it possible to retry requests that fail with temporary HTTP errors (429, 500, etc.).
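Previously each handler swallowed fetch errors itself; this commit moves that guard into composable wrappers (FilterErrors, FollowRedirects, HandleRetries) so a retry policy can sit in front of any handler. A minimal sketch of what an error-filtering wrapper could look like, using stand-in types based only on the Handler signature visible in this diff (the real crawl package implementation may differ):

package crawlx

import "net/http"

// Crawler is a stand-in for crawl.Crawler; only its identity matters here.
type Crawler struct{}

// Handler mirrors the signature used by Handle and extractLinks in the diff.
type Handler interface {
	Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error
}

// HandlerFunc adapts a plain function to the Handler interface,
// like crawl.HandlerFunc in the diff.
type HandlerFunc func(*Crawler, string, int, *http.Response, error) error

func (f HandlerFunc) Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error {
	return f(c, u, depth, resp, err)
}

// FilterErrors wraps a handler so it only ever sees successful fetches,
// replacing the `if err != nil { return nil }` guard that this commit
// removes from each individual handler.
func FilterErrors(wrap Handler) Handler {
	return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
		if err != nil {
			return nil // drop failed fetches; retry policy is decided elsewhere
		}
		return wrap.Handle(c, u, depth, resp, nil)
	})
}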
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/crawl/crawl.go  16
-rw-r--r--  cmd/links/links.go  14

2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 587b64a..cf2af5d 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -81,11 +81,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 	return w.Close()
 }
 
-func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
-	if err != nil {
-		return nil
-	}
-
+func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	// Read the response body (so we can save it to the WARC
 	// output) and replace it with a buffer.
 	data, derr := ioutil.ReadAll(resp.Body)
@@ -113,7 +109,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 		return werr
 	}
 
-	return extractLinks(c, u, depth, resp, err)
+	return extractLinks(c, u, depth, resp, nil)
 }
 
 func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {
@@ -240,7 +236,13 @@ func main() {
 		log.Fatal(err)
 	}
 
-	crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+	crawler, err := crawl.NewCrawler(
+		*dbPath,
+		seeds,
+		scope,
+		crawl.FetcherFunc(fetch),
+		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+	)
 	if err != nil {
 		log.Fatal(err)
 	}
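The wrappers compose inside out: FilterErrors guards the innermost handler, FollowRedirects deals with 3xx responses, and HandleRetries sits outermost so it sees every response first and can re-enqueue temporary failures. A hedged sketch of such a retry layer, continuing the stand-in types from the sketch above (the Enqueue hook is hypothetical; the real crawl.HandleRetries may work differently):

// RetryTemporary re-queues URLs that returned a temporary HTTP error
// (429, 500, 503) and passes everything else to the wrapped handler.
func RetryTemporary(wrap Handler) Handler {
	return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
		if err == nil && resp != nil {
			switch resp.StatusCode {
			case http.StatusTooManyRequests, // 429
				http.StatusInternalServerError, // 500
				http.StatusServiceUnavailable: // 503
				// Hypothetical hook: ask the crawler to schedule this URL again.
				// c.Enqueue(u, depth)
				return nil
			}
		}
		return wrap.Handle(c, u, depth, resp, err)
	})
}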
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 5f76a6a..bf91f3f 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -20,11 +20,7 @@ var (
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 )
 
-func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
-	if err != nil {
-		return nil
-	}
-
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
 		// Not a fatal error, just a bad web page.
@@ -50,7 +46,13 @@ func main() {
 		crawl.NewSeedScope(seeds),
 	)
 
-	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(crawl.HandlerFunc(extractLinks)))
+	crawler, err := crawl.NewCrawler(
+		"crawldb",
+		seeds,
+		scope,
+		crawl.FetcherFunc(http.Get),
+		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(crawl.HandlerFunc(extractLinks)))),
+	)
 	if err != nil {
 		log.Fatal(err)
 	}
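With FilterErrors guaranteeing that handlers only run on successful fetches, both Handle and extractLinks can safely ignore their error argument, which is why their signatures change to take `_ error` in the hunks above.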