diff options
-rw-r--r-- | analysis/links_test.go | 21 | ||||
-rw-r--r-- | cmd/crawl/crawl.go | 45 | ||||
-rw-r--r-- | cmd/crawl/crawl_test.go | 17 | ||||
-rw-r--r-- | cmd/links/links.go | 5 | ||||
-rw-r--r-- | crawler.go | 37 | ||||
-rw-r--r-- | crawler_test.go | 17 |
6 files changed, 86 insertions, 56 deletions
diff --git a/analysis/links_test.go b/analysis/links_test.go index 1bd906b..e108c7a 100644 --- a/analysis/links_test.go +++ b/analysis/links_test.go @@ -2,9 +2,11 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "net/url" + "os" "strings" "testing" @@ -31,7 +33,24 @@ type testdata struct { } func (td *testdata) runTestCase() error { - links, err := GetLinks(makeResponse(td.ctype, td.body)) + r, err := ioutil.TempFile("temp", "crawl") + if err != nil { + return err + } + defer os.Remove(r.Name()) + + w, err := os.OpenFile(r.Name(), os.O_RDWR, 0777) + if err != nil { + return err + } + resp := makeResponse(td.ctype, td.body) + if _, err := io.Copy(w, resp.Body); err != nil { + return err + } + if _, err := r.Seek(0, io.SeekStart); err != nil { + return err + } + links, err := GetLinks(resp, r) if err != nil { return fmt.Errorf("GetLinks() error: %v", err) } diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 8f28bc4..7082d14 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -9,7 +9,6 @@ import ( "flag" "fmt" "io" - "io/ioutil" "log" "net" "net/http" @@ -181,47 +180,29 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, header []byte, body * } -func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error { - // Read the response body (so we can save it to the WARC - // output) and replace it with a buffer. - - /* - data, derr := ioutil.ReadAll(resp.Body) - if derr != nil { - // Errors at this stage are usually transport-level errors, - // and as such, retriable. - return crawl.ErrRetryRequest - } - resp.Body = ioutil.NopCloser(bytes.NewReader(data)) - */ - - // Keep temporary file to store request/response data - r, _ := ioutil.TempFile("temp", "crawl") - defer r.Close() - - w, _ := os.OpenFile(r.Name(), os.O_RDWR, 0777) - defer w.Close() - defer os.Remove(r.Name()) +func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, rBody *os.File, _ error) error { + wBody, _ := os.OpenFile(rBody.Name(), os.O_RDWR, 0777) + defer wBody.Close() // Dump the request to the WARC output. - if werr := resp.Request.Write(w); werr != nil { + if werr := resp.Request.Write(wBody); werr != nil { return werr } - if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, r); werr != nil { + if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, rBody); werr != nil { return werr } // Seek to start; we've written since last read - if _, err := r.Seek(0, io.SeekStart); err != nil { + if _, err := rBody.Seek(0, io.SeekStart); err != nil { return err } - w.Close() - w, _ = os.OpenFile(r.Name(), os.O_RDWR, 0777) - defer w.Close() + wBody.Close() + wBody, _ = os.OpenFile(rBody.Name(), os.O_RDWR, 0777) + defer wBody.Close() // Write response body to tmp file - if _, err := io.Copy(w, resp.Body); err != nil { + if _, err := io.Copy(wBody, resp.Body); err != nil { return err } @@ -231,18 +212,18 @@ func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, re [][]byte{[]byte(statusLine), hdr2str(resp.Header), []byte("")}, []byte{'\r', '\n'}, ) - if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, r); werr != nil { + if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, rBody); werr != nil { return werr } // Seek to start; we've written since last read - if _, err := r.Seek(0, io.SeekStart); err != nil { + if _, err := rBody.Seek(0, io.SeekStart); err != nil { return err } h.numWritten++ - return extractLinks(p, u, depth, resp, r, nil) + return extractLinks(p, u, depth, resp, rBody, nil) } func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) { diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go index 57acffc..1646e40 100644 --- a/cmd/crawl/crawl_test.go +++ b/cmd/crawl/crawl_test.go @@ -6,7 +6,6 @@ import ( "net/http" "net/http/httptest" "os" - "path/filepath" "testing" "git.jordan.im/crawl" @@ -25,6 +24,18 @@ func TestCrawl(t *testing.T) { } defer os.RemoveAll(tmpdir) + if err := os.Chdir(tmpdir); err != nil { + t.Fatal(err) + } + + // Create directory to (temporarily) store response bodies + if _, err := os.Stat("temp"); os.IsNotExist(err) { + err := os.Mkdir("temp", 0700) + if err != nil { + t.Fatal(err) + } + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/": @@ -46,7 +57,7 @@ func TestCrawl(t *testing.T) { crawl.NewSeedScope(seeds), ) - outf, err := os.Create(filepath.Join(tmpdir, "warc.gz")) + outf, err := os.Create("warc.gz") if err != nil { t.Fatal(err) } @@ -58,7 +69,7 @@ func TestCrawl(t *testing.T) { } crawler, err := crawl.NewCrawler( - filepath.Join(tmpdir, "db"), + "db", seeds, scope, crawl.FetcherFunc(fetch), diff --git a/cmd/links/links.go b/cmd/links/links.go index 95f48d9..847f80f 100644 --- a/cmd/links/links.go +++ b/cmd/links/links.go @@ -8,6 +8,7 @@ import ( "flag" "log" "net/http" + "os" "strings" "git.jordan.im/crawl" @@ -20,8 +21,8 @@ var ( validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") ) -func extractLinks(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error { - links, err := analysis.GetLinks(resp) +func extractLinks(p crawl.Publisher, u string, tag, depth int, resp *http.Response, body *os.File, _ error) error { + links, err := analysis.GetLinks(resp, body) if err != nil { // Not a fatal error, just a bad web page. return nil @@ -5,10 +5,11 @@ import ( "encoding/gob" "errors" "fmt" - "io" + "io/ioutil" "log" "net/http" "net/url" + "os" "sync" "sync/atomic" "time" @@ -112,15 +113,15 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) { // unless the handler returns the special error ErrRetryRequest. type Handler interface { // Handle the response from a URL. - Handle(Publisher, string, int, int, *http.Response, error) error + Handle(Publisher, string, int, int, *http.Response, *os.File, error) error } // HandlerFunc wraps a function into the Handler interface. -type HandlerFunc func(Publisher, string, int, int, *http.Response, error) error +type HandlerFunc func(Publisher, string, int, int, *http.Response, *os.File, error) error // Handle the response from a URL. -func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { - return f(p, u, tag, depth, resp, err) +func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { + return f(p, u, tag, depth, resp, body, err) } // ErrRetryRequest is returned by a Handler when the request should be @@ -249,17 +250,21 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { // Response object). fmt.Printf("%s\n", p.URL) httpResp, httpErr := c.fetcher.Fetch(p.URL) - var respBody io.ReadCloser - if httpErr == nil { - respBody = httpResp.Body + + // Keep temporary file to store request/response data + tmpFile, err := ioutil.TempFile("temp", "crawl") + if err != nil { + log.Fatal(err) } + defer tmpFile.Close() + defer os.Remove(tmpFile.Name()) // Invoke the handler (even if the fetcher errored // out). Errors in handling requests are fatal, crawl // will be aborted. - err := c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, httpErr) + err = c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, tmpFile, httpErr) if httpErr == nil { - respBody.Close() // nolint + httpResp.Body.Close() // nolint } wb := new(leveldb.Batch) @@ -354,8 +359,8 @@ func (c *Crawler) Close() { // and adds them to the queue for crawling. It will call the wrapped // handler on all requests regardless. func FollowRedirects(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { - if herr := wrap.Handle(p, u, tag, depth, resp, err); herr != nil { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { + if herr := wrap.Handle(p, u, tag, depth, resp, body,err); herr != nil { return herr } @@ -380,14 +385,14 @@ func FollowRedirects(wrap Handler) Handler { // "successful" HTTP status code (anything < 400). When using this // wrapper, subsequent Handle calls will always have err set to nil. func FilterErrors(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { if err != nil { return nil } if resp.StatusCode >= 400 { return nil } - return wrap.Handle(p, u, tag, depth, resp, nil) + return wrap.Handle(p, u, tag, depth, resp, body, nil) }) } @@ -395,11 +400,11 @@ func FilterErrors(wrap Handler) Handler { // temporary errors (all transport-level errors are considered // temporary, as well as any HTTP status code >= 500). func HandleRetries(wrap Handler) Handler { - return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { + return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 { return ErrRetryRequest } - return wrap.Handle(p, u, tag, depth, resp, nil) + return wrap.Handle(p, u, tag, depth, resp, body, nil) }) } diff --git a/crawler_test.go b/crawler_test.go index fa81c2f..d395b75 100644 --- a/crawler_test.go +++ b/crawler_test.go @@ -12,12 +12,25 @@ import ( ) func TestCrawler(t *testing.T) { + dir, err := ioutil.TempDir("", "") if err != nil { t.Fatal(err) } defer os.RemoveAll(dir) + if err := os.Chdir(dir); err != nil { + t.Fatal(err) + } + + // Create directory to (temporarily) store response bodies + if _, err := os.Stat("temp"); os.IsNotExist(err) { + err := os.Mkdir("temp", 0700) + if err != nil { + t.Fatal(err) + } + } + // Run a trivial test http server just so our test Fetcher can // return a real http.Response object. srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -33,7 +46,7 @@ func TestCrawler(t *testing.T) { ) var crawledPages int - h := HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error { + h := HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error { crawledPages++ next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages) log.Printf("%s -> %s", u, next) @@ -44,7 +57,7 @@ func TestCrawler(t *testing.T) { return nil }) - crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h)))) + crawler, err := NewCrawler("crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h)))) if err != nil { t.Fatal("NewCrawler", err) } |