about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author     ale <ale@incal.net>  2018-08-31 11:08:50 +0100
committer  ale <ale@incal.net>  2018-08-31 11:08:50 +0100
commit    23a80bd68c5c51967eaf4e6a857c5d59fe58daf5 (patch)
tree      55fc0a8222b5b282a693124b6f81f00ed91ab6d4
parent    9825334954ec555a9798e8e9be1ac04093595793 (diff)
download  crawl-23a80bd68c5c51967eaf4e6a857c5d59fe58daf5.tar.gz
          crawl-23a80bd68c5c51967eaf4e6a857c5d59fe58daf5.zip
Add a simple test for the full WARC crawler
-rw-r--r--  cmd/crawl/crawl.go       |  3 +
-rw-r--r--  cmd/crawl/crawl_test.go  | 77 ++++++++++++++++++
-rw-r--r--  crawler_test.go          |  2 +-
3 files changed, 81 insertions(+), 1 deletion(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index cf2af5d..bbbd65b 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -62,6 +62,7 @@ func hdr2str(h http.Header) []byte {
type warcSaveHandler struct {
warc *warc.Writer
warcInfoID string
+ numWritten int
}
func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
@@ -109,6 +110,8 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
return werr
}
+ h.numWritten++
+
return extractLinks(c, u, depth, resp, nil)
}
diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go
new file mode 100644
index 0000000..46bb2ad
--- /dev/null
+++ b/cmd/crawl/crawl_test.go
@@ -0,0 +1,77 @@
+package main
+
+import (
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "git.autistici.org/ale/crawl"
+ "git.autistici.org/ale/crawl/warc"
+)
+
+func linkTo(w http.ResponseWriter, uri string) {
+ w.Header().Set("Content-Type", "text/html")
+ fmt.Fprintf(w, "<html><body><a href=\"%s\">link!</a></body></html>", uri)
+}
+
+func TestCrawl(t *testing.T) {
+ tmpdir, err := ioutil.TempDir("", "")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(tmpdir)
+
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/":
+ linkTo(w, "/redir")
+ case "/b":
+ linkTo(w, "/")
+ case "/redir":
+ http.Redirect(w, r, "/b", http.StatusFound)
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer srv.Close()
+
+ seeds := crawl.MustParseURLs([]string{srv.URL + "/"})
+ scope := crawl.AND(
+ crawl.NewSchemeScope([]string{"http"}),
+ crawl.NewDepthScope(10),
+ crawl.NewSeedScope(seeds),
+ )
+
+ outf, err := os.Create(filepath.Join(tmpdir, "warc.gz"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ w := warc.NewWriter(outf)
+ defer w.Close()
+ saver, err := newWarcSaveHandler(w)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ crawler, err := crawl.NewCrawler(
+ filepath.Join(tmpdir, "db"),
+ seeds,
+ scope,
+ crawl.FetcherFunc(fetch),
+ crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+ )
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ crawler.Run(1)
+ crawler.Close()
+
+ if n := saver.(*warcSaveHandler).numWritten; n != 3 {
+ t.Fatalf("warc handler wrote %d records, expected 3", n)
+ }
+}
diff --git a/crawler_test.go b/crawler_test.go
index fecc850..7b5c92c 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
return nil
})
- crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), FollowRedirects(h))
+ crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
if err != nil {
t.Fatal("NewCrawler", err)
}