author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /crawler_test.go
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, such as loosening the edge of the crawl to pull in the related resources of HTML pages (which makes for more complete archives when desired).
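For illustration only (not part of this commit): with tags on links, a scope can loosen the boundary for related resources while keeping primary links constrained. The sketch below is a hypothetical example; it assumes the Scope interface exposes a Check(link Outlink, depth int) bool method and that a TagRelated constant exists alongside the TagPrimary used in the test.

// Hypothetical sketch: a Scope wrapper that admits related
// resources of in-scope pages. Assumes Scope exposes
// Check(Outlink, int) bool and that TagRelated exists.
type includeRelatedScope struct {
	inner Scope
}

func (s *includeRelatedScope) Check(link Outlink, depth int) bool {
	// Links tagged as related resources (images, stylesheets,
	// scripts) are accepted even if the inner scope would reject
	// them, so archived pages keep their dependencies.
	if link.Tag == TagRelated {
		return true
	}
	// Primary links still pass through the normal checks
	// (scheme, seed prefix, depth).
	return s.inner.Check(link, depth)
}

Wrapped around the AND(...) scope built in the test below, a wrapper along these lines would let an archive fetch a page's images and stylesheets even when they fall outside the seed's URL prefix.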
Diffstat (limited to 'crawler_test.go')
-rw-r--r--  crawler_test.go  58
1 file changed, 58 insertions(+), 0 deletions(-)
diff --git a/crawler_test.go b/crawler_test.go
new file mode 100644
index 0000000..66acbe4
--- /dev/null
+++ b/crawler_test.go
@@ -0,0 +1,58 @@
+package crawl
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+)
+
+func TestCrawler(t *testing.T) {
+ dir, err := ioutil.TempDir("", "")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(dir)
+
+ // Run a trivial test http server just so our test Fetcher can
+ // return a real http.Response object.
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ io.WriteString(w, "hello")
+ }))
+ defer srv.Close()
+
+ seeds := MustParseURLs([]string{srv.URL})
+ scope := AND(
+ NewSchemeScope([]string{"http"}),
+ NewSeedScope(seeds),
+ NewDepthScope(2),
+ )
+
+ var crawledPages int
+ h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ crawledPages++
+ next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
+ log.Printf("%s -> %s", u, next)
+ c.Enqueue(Outlink{
+ URL: mustParseURL(next),
+ Tag: TagPrimary,
+ }, depth+1)
+ return nil
+ })
+
+ crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+ if err != nil {
+ t.Fatal("NewCrawler", err)
+ }
+
+ crawler.Run(1)
+ crawler.Close()
+
+ if crawledPages != 2 {
+ t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
+ }
+}