From 4cd67e7234943baf31b2e122f8ee3c70c21fb489 Mon Sep 17 00:00:00 2001
From: ale
Date: Tue, 19 Dec 2017 00:12:11 +0000
Subject: Add tags (primary/related) to links

This change allows more complex scope boundaries, including loosening
edges a bit to include related resources of HTML pages (which makes
for more complete archives if desired).
---
 crawler_test.go | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 crawler_test.go

(limited to 'crawler_test.go')

diff --git a/crawler_test.go b/crawler_test.go
new file mode 100644
index 0000000..66acbe4
--- /dev/null
+++ b/crawler_test.go
@@ -0,0 +1,58 @@
+package crawl
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+)
+
+func TestCrawler(t *testing.T) {
+	dir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(dir)
+
+	// Run a trivial test http server just so our test Fetcher can
+	// return a real http.Response object.
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		io.WriteString(w, "hello")
+	}))
+	defer srv.Close()
+
+	seeds := MustParseURLs([]string{srv.URL})
+	scope := AND(
+		NewSchemeScope([]string{"http"}),
+		NewSeedScope(seeds),
+		NewDepthScope(2),
+	)
+
+	var crawledPages int
+	h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+		crawledPages++
+		next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
+		log.Printf("%s -> %s", u, next)
+		c.Enqueue(Outlink{
+			URL: mustParseURL(next),
+			Tag: TagPrimary,
+		}, depth+1)
+		return nil
+	})
+
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+	if err != nil {
+		t.Fatal("NewCrawler", err)
+	}
+
+	crawler.Run(1)
+	crawler.Close()
+
+	if crawledPages != 2 {
+		t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
+	}
+}
-- 
cgit v1.2.3-54-g00ecf