aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--analysis/links_test.go21
-rw-r--r--cmd/crawl/crawl.go45
-rw-r--r--cmd/crawl/crawl_test.go17
-rw-r--r--cmd/links/links.go5
-rw-r--r--crawler.go37
-rw-r--r--crawler_test.go17
6 files changed, 86 insertions, 56 deletions
diff --git a/analysis/links_test.go b/analysis/links_test.go
index 1bd906b..e108c7a 100644
--- a/analysis/links_test.go
+++ b/analysis/links_test.go
@@ -2,9 +2,11 @@ package analysis
import (
"fmt"
+ "io"
"io/ioutil"
"net/http"
"net/url"
+ "os"
"strings"
"testing"
@@ -31,7 +33,24 @@ type testdata struct {
}
func (td *testdata) runTestCase() error {
- links, err := GetLinks(makeResponse(td.ctype, td.body))
+ r, err := ioutil.TempFile("temp", "crawl")
+ if err != nil {
+ return err
+ }
+ defer os.Remove(r.Name())
+
+ w, err := os.OpenFile(r.Name(), os.O_RDWR, 0777)
+ if err != nil {
+ return err
+ }
+ resp := makeResponse(td.ctype, td.body)
+ if _, err := io.Copy(w, resp.Body); err != nil {
+ return err
+ }
+ if _, err := r.Seek(0, io.SeekStart); err != nil {
+ return err
+ }
+ links, err := GetLinks(resp, r)
if err != nil {
return fmt.Errorf("GetLinks() error: %v", err)
}
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8f28bc4..7082d14 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -9,7 +9,6 @@ import (
"flag"
"fmt"
"io"
- "io/ioutil"
"log"
"net"
"net/http"
@@ -181,47 +180,29 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, header []byte, body *
}
-func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error {
- // Read the response body (so we can save it to the WARC
- // output) and replace it with a buffer.
-
- /*
- data, derr := ioutil.ReadAll(resp.Body)
- if derr != nil {
- // Errors at this stage are usually transport-level errors,
- // and as such, retriable.
- return crawl.ErrRetryRequest
- }
- resp.Body = ioutil.NopCloser(bytes.NewReader(data))
- */
-
- // Keep temporary file to store request/response data
- r, _ := ioutil.TempFile("temp", "crawl")
- defer r.Close()
-
- w, _ := os.OpenFile(r.Name(), os.O_RDWR, 0777)
- defer w.Close()
- defer os.Remove(r.Name())
+func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, rBody *os.File, _ error) error {
+ wBody, _ := os.OpenFile(rBody.Name(), os.O_RDWR, 0777)
+ defer wBody.Close()
// Dump the request to the WARC output.
- if werr := resp.Request.Write(w); werr != nil {
+ if werr := resp.Request.Write(wBody); werr != nil {
return werr
}
- if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, r); werr != nil {
+ if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, rBody); werr != nil {
return werr
}
// Seek to start; we've written since last read
- if _, err := r.Seek(0, io.SeekStart); err != nil {
+ if _, err := rBody.Seek(0, io.SeekStart); err != nil {
return err
}
- w.Close()
- w, _ = os.OpenFile(r.Name(), os.O_RDWR, 0777)
- defer w.Close()
+ wBody.Close()
+ wBody, _ = os.OpenFile(rBody.Name(), os.O_RDWR, 0777)
+ defer wBody.Close()
// Write response body to tmp file
- if _, err := io.Copy(w, resp.Body); err != nil {
+ if _, err := io.Copy(wBody, resp.Body); err != nil {
return err
}
@@ -231,18 +212,18 @@ func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, re
[][]byte{[]byte(statusLine), hdr2str(resp.Header), []byte("")},
[]byte{'\r', '\n'},
)
- if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, r); werr != nil {
+ if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, rBody); werr != nil {
return werr
}
// Seek to start; we've written since last read
- if _, err := r.Seek(0, io.SeekStart); err != nil {
+ if _, err := rBody.Seek(0, io.SeekStart); err != nil {
return err
}
h.numWritten++
- return extractLinks(p, u, depth, resp, r, nil)
+ return extractLinks(p, u, depth, resp, rBody, nil)
}
func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {
diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go
index 57acffc..1646e40 100644
--- a/cmd/crawl/crawl_test.go
+++ b/cmd/crawl/crawl_test.go
@@ -6,7 +6,6 @@ import (
"net/http"
"net/http/httptest"
"os"
- "path/filepath"
"testing"
"git.jordan.im/crawl"
@@ -25,6 +24,18 @@ func TestCrawl(t *testing.T) {
}
defer os.RemoveAll(tmpdir)
+ if err := os.Chdir(tmpdir); err != nil {
+ t.Fatal(err)
+ }
+
+ // Create directory to (temporarily) store response bodies
+ if _, err := os.Stat("temp"); os.IsNotExist(err) {
+ err := os.Mkdir("temp", 0700)
+ if err != nil {
+ t.Fatal(err)
+ }
+ }
+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/":
@@ -46,7 +57,7 @@ func TestCrawl(t *testing.T) {
crawl.NewSeedScope(seeds),
)
- outf, err := os.Create(filepath.Join(tmpdir, "warc.gz"))
+ outf, err := os.Create("warc.gz")
if err != nil {
t.Fatal(err)
}
@@ -58,7 +69,7 @@ func TestCrawl(t *testing.T) {
}
crawler, err := crawl.NewCrawler(
- filepath.Join(tmpdir, "db"),
+ "db",
seeds,
scope,
crawl.FetcherFunc(fetch),
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 95f48d9..847f80f 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -8,6 +8,7 @@ import (
"flag"
"log"
"net/http"
+ "os"
"strings"
"git.jordan.im/crawl"
@@ -20,8 +21,8 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
)
-func extractLinks(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error {
- links, err := analysis.GetLinks(resp)
+func extractLinks(p crawl.Publisher, u string, tag, depth int, resp *http.Response, body *os.File, _ error) error {
+ links, err := analysis.GetLinks(resp, body)
if err != nil {
// Not a fatal error, just a bad web page.
return nil
diff --git a/crawler.go b/crawler.go
index b2ad3d9..24b9eae 100644
--- a/crawler.go
+++ b/crawler.go
@@ -5,10 +5,11 @@ import (
"encoding/gob"
"errors"
"fmt"
- "io"
+ "io/ioutil"
"log"
"net/http"
"net/url"
+ "os"
"sync"
"sync/atomic"
"time"
@@ -112,15 +113,15 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) {
// unless the handler returns the special error ErrRetryRequest.
type Handler interface {
// Handle the response from a URL.
- Handle(Publisher, string, int, int, *http.Response, error) error
+ Handle(Publisher, string, int, int, *http.Response, *os.File, error) error
}
// HandlerFunc wraps a function into the Handler interface.
-type HandlerFunc func(Publisher, string, int, int, *http.Response, error) error
+type HandlerFunc func(Publisher, string, int, int, *http.Response, *os.File, error) error
// Handle the response from a URL.
-func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
- return f(p, u, tag, depth, resp, err)
+func (f HandlerFunc) Handle(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
+ return f(p, u, tag, depth, resp, body, err)
}
// ErrRetryRequest is returned by a Handler when the request should be
@@ -249,17 +250,21 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
// Response object).
fmt.Printf("%s\n", p.URL)
httpResp, httpErr := c.fetcher.Fetch(p.URL)
- var respBody io.ReadCloser
- if httpErr == nil {
- respBody = httpResp.Body
+
+ // Keep temporary file to store request/response data
+ tmpFile, err := ioutil.TempFile("temp", "crawl")
+ if err != nil {
+ log.Fatal(err)
}
+ defer tmpFile.Close()
+ defer os.Remove(tmpFile.Name())
// Invoke the handler (even if the fetcher errored
// out). Errors in handling requests are fatal, crawl
// will be aborted.
- err := c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, httpErr)
+ err = c.handler.Handle(c, p.URL, p.Tag, p.Depth, httpResp, tmpFile, httpErr)
if httpErr == nil {
- respBody.Close() // nolint
+ httpResp.Body.Close() // nolint
}
wb := new(leveldb.Batch)
@@ -354,8 +359,8 @@ func (c *Crawler) Close() {
// and adds them to the queue for crawling. It will call the wrapped
// handler on all requests regardless.
func FollowRedirects(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
- if herr := wrap.Handle(p, u, tag, depth, resp, err); herr != nil {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
+ if herr := wrap.Handle(p, u, tag, depth, resp, body,err); herr != nil {
return herr
}
@@ -380,14 +385,14 @@ func FollowRedirects(wrap Handler) Handler {
// "successful" HTTP status code (anything < 400). When using this
// wrapper, subsequent Handle calls will always have err set to nil.
func FilterErrors(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
if err != nil {
return nil
}
if resp.StatusCode >= 400 {
return nil
}
- return wrap.Handle(p, u, tag, depth, resp, nil)
+ return wrap.Handle(p, u, tag, depth, resp, body, nil)
})
}
@@ -395,11 +400,11 @@ func FilterErrors(wrap Handler) Handler {
// temporary errors (all transport-level errors are considered
// temporary, as well as any HTTP status code >= 500).
func HandleRetries(wrap Handler) Handler {
- return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
+ return HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 {
return ErrRetryRequest
}
- return wrap.Handle(p, u, tag, depth, resp, nil)
+ return wrap.Handle(p, u, tag, depth, resp, body, nil)
})
}
diff --git a/crawler_test.go b/crawler_test.go
index fa81c2f..d395b75 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -12,12 +12,25 @@ import (
)
func TestCrawler(t *testing.T) {
+
dir, err := ioutil.TempDir("", "")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(dir)
+ if err := os.Chdir(dir); err != nil {
+ t.Fatal(err)
+ }
+
+ // Create directory to (temporarily) store response bodies
+ if _, err := os.Stat("temp"); os.IsNotExist(err) {
+ err := os.Mkdir("temp", 0700)
+ if err != nil {
+ t.Fatal(err)
+ }
+ }
+
// Run a trivial test http server just so our test Fetcher can
// return a real http.Response object.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -33,7 +46,7 @@ func TestCrawler(t *testing.T) {
)
var crawledPages int
- h := HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
+ h := HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, body *os.File, err error) error {
crawledPages++
next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
log.Printf("%s -> %s", u, next)
@@ -44,7 +57,7 @@ func TestCrawler(t *testing.T) {
return nil
})
- crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
+ crawler, err := NewCrawler("crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
if err != nil {
t.Fatal("NewCrawler", err)
}