about | summary | refs | log | tree | commit | diff
path: root/crawler.go
diff options
context:
space:
mode:
Diffstat (limited to 'crawler.go')
-rw-r--r-- crawler.go 37
1 file changed, 21 insertions, 16 deletions
diff --git a/crawler.go b/crawler.go
index b2ad3d9..24b9eae 100644
--- a/crawler.go
+++ b/crawler.go
@@ -5,10 +5,11 @@ import (
"encoding/gob"
"errors"
"fmt"
- "io"
+ "io/ioutil"
"log"
"net/http"
"net/url"
+ "os"
"sync"
"sync/atomic"
"time"
@@ -112,15 +113,15 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) {
// unless the handler returns the special error ErrRetryRequest.
type Handler interface {
// Handle the response from a URL.
- Handle(Publisher, string, int, int, *http.Response, error) error
+ Handle(Publisher, string, int, int, *http.Response, *os.File, error) error
}
// HandlerFunc wraps a function into the Handler interface.
-type HandlerFunc func(Publisher, string, int, int, *http.Response, error) error
+type HandlerFunc func(Publisher, string, int, int, *http.Response, *os.File, error) error
// Handle the response from a URL.
-func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
- return f(p, u, tag, depth, resp, err)
+func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
+ return f(p, u, tag, depth, resp, body, err)
}
// ErrRetryRequest is returned by a Handler when the request should be
@@ -249,17 +250,21 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
// Response object).
fmt.Printf("%s\n", p.URL)
httpResp, httpErr := c.fetcher.Fetch(p.URL)
- var respBody io.ReadCloser
- if httpErr == nil {
- respBody = httpResp.Body
+
+ // Keep temporary file to store request/response data
+ tmpFile, err := ioutil.TempFile("temp", "crawl")
+ if err != nil {
+ log.Fatal(err)
}
+ defer tmpFile.Close()
+ defer os.Remove(tmpFile.Name())
// Invoke the handler (even if the fetcher errored
// out). Errors in handling requests are fatal, crawl
// will be aborted.
- err := c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, httpErr)
+ err = c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, tmpFile, httpErr)
if httpErr == nil {
- respBody.Close() // nolint
+ httpResp.Body.Close() // nolint
}
wb := new(leveldb.Batch)
@@ -354,8 +359,8 @@ func (c *Crawler) Close() {
// and adds them to the queue for crawling. It will call the wrapped
// handler on all requests regardless.
func FollowRedirects(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
- if herr := wrap.Handle(p, u, tag, depth, resp, err); herr != nil {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
+ if herr := wrap.Handle(p, u, tag, depth, resp, body,err); herr != nil {
return herr
}
@@ -380,14 +385,14 @@ func FollowRedirects(wrap Handler) Handler {
// "successful" HTTP status code (anything < 400). When using this
// wrapper, subsequent Handle calls will always have err set to nil.
func FilterErrors(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
if err != nil {
return nil
}
if resp.StatusCode >= 400 {
return nil
}
- return wrap.Handle(p, u, tag, depth, resp, nil)
+ return wrap.Handle(p, u, tag, depth, resp, body, nil)
})
}
@@ -395,11 +400,11 @@ func FilterErrors(wrap Handler) Handler {
// temporary errors (all transport-level errors are considered
// temporary, as well as any HTTP status code >= 500).
func HandleRetries(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 {
return ErrRetryRequest
}
- return wrap.Handle(p, u, tag, depth, resp, nil)
+ return wrap.Handle(p, u, tag, depth, resp, body, nil)
})
}