package crawl

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"testing"
)
func TestCrawler(t *testing.T) {
	dir, err := ioutil.TempDir("", "")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dir)

	// Run a trivial test HTTP server just so our test Fetcher can
	// return a real http.Response object.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		io.WriteString(w, "hello")
	}))
	defer srv.Close()

	// Restrict the crawl to http:// URLs within the seed's scope,
	// with a depth limit.
	seeds := MustParseURLs([]string{srv.URL})
	scope := AND(
		NewSchemeScope([]string{"http"}),
		NewSeedScope(seeds),
		NewDepthScope(2),
	)

	// The handler counts crawled pages and enqueues a fresh link on
	// every call, so the crawl is bounded only by the depth scope.
	var crawledPages int
	h := HandlerFunc(func(p Publisher, u string, tag, depth int, resp *http.Response, err error) error {
		crawledPages++
		next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
		log.Printf("%s -> %s", u, next)
		p.Enqueue(Outlink{
			URL: mustParseURL(next),
			Tag: TagPrimary,
		}, depth+1)
		return nil
	})

	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
	if err != nil {
		t.Fatal("NewCrawler", err)
	}
	crawler.Run(1)
	crawler.Close()
	if crawledPages != 2 {
		t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
	}
}