From 23a80bd68c5c51967eaf4e6a857c5d59fe58daf5 Mon Sep 17 00:00:00 2001
From: ale
Date: Fri, 31 Aug 2018 11:08:50 +0100
Subject: Add a simple test for the full WARC crawler

---
 cmd/crawl/crawl.go      |  3 ++
 cmd/crawl/crawl_test.go | 77 +++++++++++++++++++++++++++++++++++++++++++++++++
 crawler_test.go         |  2 +-
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 cmd/crawl/crawl_test.go

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index cf2af5d..bbbd65b 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -62,6 +62,7 @@ func hdr2str(h http.Header) []byte {
 type warcSaveHandler struct {
 	warc       *warc.Writer
 	warcInfoID string
+	numWritten int
 }
 
 func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
@@ -109,6 +110,8 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 		return werr
 	}
 
+	h.numWritten++
+
 	return extractLinks(c, u, depth, resp, nil)
 }
 
diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go
new file mode 100644
index 0000000..46bb2ad
--- /dev/null
+++ b/cmd/crawl/crawl_test.go
@@ -0,0 +1,77 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"git.autistici.org/ale/crawl"
+	"git.autistici.org/ale/crawl/warc"
+)
+
+func linkTo(w http.ResponseWriter, uri string) {
+	w.Header().Set("Content-Type", "text/html")
+	fmt.Fprintf(w, "<a href=\"%s\">link!</a>", uri)
+}
+
+func TestCrawl(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(tmpdir)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/":
+			linkTo(w, "/redir")
+		case "/b":
+			linkTo(w, "/")
+		case "/redir":
+			http.Redirect(w, r, "/b", http.StatusFound)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer srv.Close()
+
+	seeds := crawl.MustParseURLs([]string{srv.URL + "/"})
+	scope := crawl.AND(
+		crawl.NewSchemeScope([]string{"http"}),
+		crawl.NewDepthScope(10),
+		crawl.NewSeedScope(seeds),
+	)
+
+	outf, err := os.Create(filepath.Join(tmpdir, "warc.gz"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	w := warc.NewWriter(outf)
+	defer w.Close()
+	saver, err := newWarcSaveHandler(w)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	crawler, err := crawl.NewCrawler(
+		filepath.Join(tmpdir, "db"),
+		seeds,
+		scope,
+		crawl.FetcherFunc(fetch),
+		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	crawler.Run(1)
+	crawler.Close()
+
+	if n := saver.(*warcSaveHandler).numWritten; n != 3 {
+		t.Fatalf("warc handler wrote %d records, expected 3", n)
+	}
+}
diff --git a/crawler_test.go b/crawler_test.go
index fecc850..7b5c92c 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
 		return nil
 	})
 
-	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), FollowRedirects(h))
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
 	if err != nil {
 		t.Fatal("NewCrawler", err)
 	}
-- 
cgit v1.2.3-54-g00ecf