aboutsummaryrefslogtreecommitdiff
path: root/cmd/crawl/crawl_test.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/crawl/crawl_test.go')
-rw-r--r--cmd/crawl/crawl_test.go77
1 files changed, 77 insertions, 0 deletions
diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go
new file mode 100644
index 0000000..46bb2ad
--- /dev/null
+++ b/cmd/crawl/crawl_test.go
@@ -0,0 +1,77 @@
+package main
+
+import (
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "git.autistici.org/ale/crawl"
+ "git.autistici.org/ale/crawl/warc"
+)
+
+func linkTo(w http.ResponseWriter, uri string) {
+ w.Header().Set("Content-Type", "text/html")
+ fmt.Fprintf(w, "<html><body><a href=\"%s\">link!</a></body></html>", uri)
+}
+
+func TestCrawl(t *testing.T) {
+ tmpdir, err := ioutil.TempDir("", "")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(tmpdir)
+
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/":
+ linkTo(w, "/redir")
+ case "/b":
+ linkTo(w, "/")
+ case "/redir":
+ http.Redirect(w, r, "/b", http.StatusFound)
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer srv.Close()
+
+ seeds := crawl.MustParseURLs([]string{srv.URL + "/"})
+ scope := crawl.AND(
+ crawl.NewSchemeScope([]string{"http"}),
+ crawl.NewDepthScope(10),
+ crawl.NewSeedScope(seeds),
+ )
+
+ outf, err := os.Create(filepath.Join(tmpdir, "warc.gz"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ w := warc.NewWriter(outf)
+ defer w.Close()
+ saver, err := newWarcSaveHandler(w)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ crawler, err := crawl.NewCrawler(
+ filepath.Join(tmpdir, "db"),
+ seeds,
+ scope,
+ crawl.FetcherFunc(fetch),
+ crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+ )
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ crawler.Run(1)
+ crawler.Close()
+
+ if n := saver.(*warcSaveHandler).numWritten; n != 3 {
+ t.Fatalf("warc handler wrote %d records, expected 3", n)
+ }
+}