diff options
Diffstat (limited to 'crawler.go')
-rw-r--r-- | crawler.go | 37 |
1 file changed, 21 insertions, 16 deletions
@@ -5,10 +5,11 @@ import ( "encoding/gob" "errors" "fmt" - "io" + "io/ioutil" "log" "net/http" "net/url" + "os" "sync" "sync/atomic" "time" @@ -112,15 +113,15 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) { // unless the handler returns the special error ErrRetryRequest. type Handler interface { // Handle the response from a URL. - Handle(Publisher, string, int, int, *http.Response, error) error + Handle(Publisher, string, int, int, *http.Response, *os.File, error) error } // HandlerFunc wraps a function into the Handler interface. -type HandlerFunc func(Publisher, string, int, int, *http.Response, error) error +type HandlerFunc func(Publisher, string, int, int, *http.Response, *os.File, error) error // Handle the response from a URL. -func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { - return f(p, u, tag, depth, resp, err) +func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { + return f(p, u, tag, depth, resp, body, err) } // ErrRetryRequest is returned by a Handler when the request should be @@ -249,17 +250,21 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { // Response object). fmt.Printf("%s\n", p.URL) httpResp, httpErr := c.fetcher.Fetch(p.URL) - var respBody io.ReadCloser - if httpErr == nil { - respBody = httpResp.Body + + // Keep temporary file to store request/response data + tmpFile, err := ioutil.TempFile("temp", "crawl") + if err != nil { + log.Fatal(err) } + defer tmpFile.Close() + defer os.Remove(tmpFile.Name()) // Invoke the handler (even if the fetcher errored // out). Errors in handling requests are fatal, crawl // will be aborted. 
- err := c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, httpErr) + err = c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, tmpFile, httpErr) if httpErr == nil { - respBody.Close() // nolint + httpResp.Body.Close() // nolint } wb := new(leveldb.Batch) @@ -354,8 +359,8 @@ func (c *Crawler) Close() { // and adds them to the queue for crawling. It will call the wrapped // handler on all requests regardless. func FollowRedirects(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { - if herr := wrap.Handle(p, u, tag, depth, resp, err); herr != nil { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { + if herr := wrap.Handle(p, u, tag, depth, resp, body, err); herr != nil { return herr } @@ -380,14 +385,14 @@ func FollowRedirects(wrap Handler) Handler { // "successful" HTTP status code (anything < 400). When using this // wrapper, subsequent Handle calls will always have err set to nil. func FilterErrors(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { if err != nil { return nil } if resp.StatusCode >= 400 { return nil } - return wrap.Handle(p, u, tag, depth, resp, nil) + return wrap.Handle(p, u, tag, depth, resp, body, nil) }) } @@ -395,11 +400,11 @@ func FilterErrors(wrap Handler) Handler { // temporary errors (all transport-level errors are considered // temporary, as well as any HTTP status code >= 500). 
func HandleRetries(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 { return ErrRetryRequest } - return wrap.Handle(p, u, tag, depth, resp, nil) + return wrap.Handle(p, u, tag, depth, resp, body, nil) }) } |