Diffstat (limited to 'cmd/crawl/crawl.go')
-rw-r--r--  cmd/crawl/crawl.go  46
1 file changed, 32 insertions(+), 14 deletions(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 93506ac..b54b999 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -5,6 +5,7 @@ package main
import (
"bufio"
"bytes"
+ "errors"
"flag"
"fmt"
"io"
@@ -38,12 +39,27 @@ var (
warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
+ dnsMap = dnsMapFlag(make(map[string]string))
excludes []*regexp.Regexp
+
+ httpClient *http.Client
)
func init() {
flag.Var(&excludesFlag{}, "exclude", "exclude regex URL patterns")
flag.Var(&excludesFileFlag{}, "exclude-from-file", "load exclude regex URL patterns from a file")
+ flag.Var(dnsMap, "resolve", "set DNS overrides (in hostname=addr format)")
+
+ stats = &crawlStats{
+ states: make(map[int]int),
+ start: time.Now(),
+ }
+
+ go func() {
+ for range time.Tick(10 * time.Second) {
+ stats.Dump()
+ }
+ }()
}
type excludesFlag struct{}
@@ -82,6 +98,19 @@ func (f *excludesFileFlag) Set(s string) error {
return nil
}
+type dnsMapFlag map[string]string
+
+func (f dnsMapFlag) String() string { return "" }
+
+func (f dnsMapFlag) Set(s string) error {
+ parts := strings.Split(s, "=")
+ if len(parts) != 2 {
+ return errors.New("value not in host=addr format")
+ }
+ f[parts[0]] = parts[1]
+ return nil
+}
+
func extractLinks(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
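
Note: the new dnsMapFlag type above follows the standard flag.Value pattern, so repeated -resolve flags accumulate into a single map. A minimal standalone sketch of that same pattern follows; the file, type, and variable names here are illustrative only and are not part of the patch.

// resolve_flag_demo.go - standalone sketch of the repeated-flag pattern used
// by dnsMapFlag; names are illustrative, not taken from the crawl package.
package main

import (
	"errors"
	"flag"
	"fmt"
	"strings"
)

// hostMap collects host=addr pairs across repeated -resolve flags.
type hostMap map[string]string

func (m hostMap) String() string { return "" }

func (m hostMap) Set(s string) error {
	parts := strings.Split(s, "=")
	if len(parts) != 2 {
		return errors.New("value not in host=addr format")
	}
	m[parts[0]] = parts[1]
	return nil
}

func main() {
	overrides := hostMap(make(map[string]string))
	flag.Var(overrides, "resolve", "set DNS overrides (in hostname=addr format)")
	flag.Parse()

	// e.g. go run . -resolve example.com=127.0.0.1 -resolve cdn.example.com=10.0.0.5
	for host, addr := range overrides {
		fmt.Printf("%s -> %s\n", host, addr)
	}
}
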
@@ -217,26 +246,13 @@ func (c *crawlStats) Dump() {
var stats *crawlStats
func fetch(urlstr string) (*http.Response, error) {
- resp, err := crawl.DefaultClient.Get(urlstr)
+ resp, err := httpClient.Get(urlstr)
if err == nil {
stats.Update(resp)
}
return resp, err
}
-func init() {
- stats = &crawlStats{
- states: make(map[int]int),
- start: time.Now(),
- }
-
- go func() {
- for range time.Tick(10 * time.Second) {
- stats.Dump()
- }
- }()
-}
-
type byteCounter struct {
io.ReadCloser
}
@@ -298,6 +314,8 @@ func main() {
log.Fatal(err)
}
+ httpClient = crawl.NewHTTPClientWithDNSOverride(dnsMap)
+
crawler, err := crawl.NewCrawler(
*dbPath,
seeds,
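
The patch does not show crawl.NewHTTPClientWithDNSOverride itself. One common way to build such a client is to rewrite the dial address before the standard dialer runs; the sketch below shows that general technique and is an assumption about the approach, not the library's actual implementation.

// dns_override_sketch.go - one possible shape for an HTTP client with DNS
// overrides; a sketch of the technique, not the code behind
// crawl.NewHTTPClientWithDNSOverride.
package main

import (
	"context"
	"net"
	"net/http"
	"time"
)

// newHTTPClientWithDNSOverride returns a client whose transport rewrites the
// host portion of each dial target according to the overrides map.
func newHTTPClientWithDNSOverride(overrides map[string]string) *http.Client {
	dialer := &net.Dialer{Timeout: 30 * time.Second}
	transport := &http.Transport{
		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
			host, port, err := net.SplitHostPort(addr)
			if err != nil {
				return nil, err
			}
			if override, ok := overrides[host]; ok {
				addr = net.JoinHostPort(override, port)
			}
			return dialer.DialContext(ctx, network, addr)
		},
	}
	return &http.Client{Transport: transport}
}

func main() {
	client := newHTTPClientWithDNSOverride(map[string]string{"example.com": "127.0.0.1"})
	_ = client // requests to example.com would now dial 127.0.0.1 instead
}

Because only the dial address is rewritten, the Host header and TLS SNI still carry the original hostname, which is the same behaviour curl's --resolve option provides.
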