diff options
author | ale <ale@incal.net> | 2019-01-20 08:15:22 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2019-01-20 08:15:22 +0000 |
commit | 64eb5fb23f64f209e3d813e017097044a111151f (patch) | |
tree | 03288c4cfee6f8f59d4e2dabbe057292273fe653 | |
parent | cce28f44e7ad88900e6c53394a8e496f2955b784 (diff) | |
download | crawl-64eb5fb23f64f209e3d813e017097044a111151f.tar.gz crawl-64eb5fb23f64f209e3d813e017097044a111151f.zip |
Refactor Handlers in terms of a Publisher interface
Introduce a Publisher interface to decouple the Enqueue functionality from the
Crawler implementation: handlers now receive a Publisher instead of a *Crawler.
-rw-r--r-- | cmd/crawl/crawl.go | 8 | ||||
-rw-r--r-- | cmd/links/links.go | 4 | ||||
-rw-r--r-- | crawler.go | 28 | ||||
-rw-r--r-- | crawler_test.go | 4 |
4 files changed, 25 insertions, 19 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 2ebba98..54bb505 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -82,7 +82,7 @@ func (f *excludesFileFlag) Set(s string) error { return nil } -func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { +func extractLinks(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error { links, err := analysis.GetLinks(resp) if err != nil { // This is not a fatal error, just a bad web page. @@ -90,7 +90,7 @@ func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ } for _, link := range links { - if err := c.Enqueue(link, depth+1); err != nil { + if err := p.Enqueue(link, depth+1); err != nil { return err } } @@ -127,7 +127,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error { return w.Close() } -func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { +func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error { // Read the response body (so we can save it to the WARC // output) and replace it with a buffer. 
data, derr := ioutil.ReadAll(resp.Body) @@ -157,7 +157,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht h.numWritten++ - return extractLinks(c, u, depth, resp, nil) + return extractLinks(p, u, depth, resp, nil) } func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) { diff --git a/cmd/links/links.go b/cmd/links/links.go index bf91f3f..2263414 100644 --- a/cmd/links/links.go +++ b/cmd/links/links.go @@ -20,7 +20,7 @@ var ( validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") ) -func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { +func extractLinks(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error { links, err := analysis.GetLinks(resp) if err != nil { // Not a fatal error, just a bad web page. @@ -28,7 +28,7 @@ func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ } for _, link := range links { - if err := c.Enqueue(link, depth+1); err != nil { + if err := p.Enqueue(link, depth+1); err != nil { return err } } @@ -112,21 +112,27 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) { // unless the handler returns the special error ErrRetryRequest. type Handler interface { // Handle the response from a URL. - Handle(*Crawler, string, int, *http.Response, error) error + Handle(Publisher, string, int, *http.Response, error) error } // HandlerFunc wraps a function into the Handler interface. -type HandlerFunc func(*Crawler, string, int, *http.Response, error) error +type HandlerFunc func(Publisher, string, int, *http.Response, error) error // Handle the response from a URL. 
-func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Response, err error) error { - return f(db, u, depth, resp, err) +func (f HandlerFunc) Handle(p Publisher, u string, depth int, resp *http.Response, err error) error { + return f(p, u, depth, resp, err) } // ErrRetryRequest is returned by a Handler when the request should be // retried after some time. var ErrRetryRequest = errors.New("retry_request") +// Publisher is an interface to something with an Enqueue() method to +// add new potential URLs to crawl. +type Publisher interface { + Enqueue(Outlink, int) error +} + // The Crawler object contains the crawler state. type Crawler struct { db *gobDB @@ -341,8 +347,8 @@ func (c *Crawler) Close() { // and adds them to the queue for crawling. It will call the wrapped // handler on all requests regardless. func FollowRedirects(wrap Handler) Handler { - return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { - if herr := wrap.Handle(c, u, depth, resp, err); herr != nil { + return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error { + if herr := wrap.Handle(p, u, depth, resp, err); herr != nil { return herr } @@ -356,7 +362,7 @@ func FollowRedirects(wrap Handler) Handler { if uerr != nil { log.Printf("error parsing Location header: %v", uerr) } else { - return c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1) + return p.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1) } } return nil @@ -367,14 +373,14 @@ func FollowRedirects(wrap Handler) Handler { // "successful" HTTP status code (anything < 400). When using this // wrapper, subsequent Handle calls will always have err set to nil. 
func FilterErrors(wrap Handler) Handler { - return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error { if err != nil { return nil } if resp.StatusCode >= 400 { return nil } - return wrap.Handle(c, u, depth, resp, nil) + return wrap.Handle(p, u, depth, resp, nil) }) } @@ -382,11 +388,11 @@ func FilterErrors(wrap Handler) Handler { // temporary errors (all transport-level errors are considered // temporary, as well as any HTTP status code >= 500). func HandleRetries(wrap Handler) Handler { - return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error { if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 { return ErrRetryRequest } - return wrap.Handle(c, u, depth, resp, nil) + return wrap.Handle(p, u, depth, resp, nil) }) } diff --git a/crawler_test.go b/crawler_test.go index 7b5c92c..0ad469b 100644 --- a/crawler_test.go +++ b/crawler_test.go @@ -33,11 +33,11 @@ func TestCrawler(t *testing.T) { ) var crawledPages int - h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error { + h := HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error { crawledPages++ next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages) log.Printf("%s -> %s", u, next) - c.Enqueue(Outlink{ + p.Enqueue(Outlink{ URL: mustParseURL(next), Tag: TagPrimary, }, depth+1) |