From dc39be70c4b9dbb4cf330ddc7776340551a4764c Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 10 Feb 2022 14:54:24 -0700 Subject: client, crawl: --bind, support making outbound requests from a particular address --- client.go | 29 +++++++++++++++++++++++++++++ cmd/crawl/crawl.go | 10 ++++++++++ 2 files changed, 39 insertions(+) diff --git a/client.go b/client.go index c028e42..2284e9e 100644 --- a/client.go +++ b/client.go @@ -66,3 +66,32 @@ func NewHTTPClientWithDNSOverride(dnsMap map[string]string) *http.Client { Jar: jar, } } + +// NewHTTPClientWithLocalAddrOverride returns an http.Client suitable for +// crawling, with a LocalAddr override for making outbound connections using +// an explicit interface +func NewHTTPClientWithLocalAddrOverride(addr *net.IPAddr) *http.Client { + jar, _ := cookiejar.New(nil) // nolint + localTCPAddr := net.TCPAddr{ + IP: addr.IP, + } + transport := &http.Transport{ + DialContext: (&net.Dialer{ + LocalAddr: &localTCPAddr, + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + DualStack: false, + }).DialContext, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, // nolint + }, + } + return &http.Client{ + Timeout: defaultClientTimeout, + Transport: transport, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse + }, + Jar: jar, + } +} diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 0be1572..0fc6af1 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -11,6 +11,7 @@ import ( "io" "io/ioutil" "log" + "net" "net/http" "os" "os/signal" @@ -38,6 +39,7 @@ var ( resumeDir = flag.String("resume", "", "path to directory of previous crawl to resume") warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns") cpuprofile = flag.String("cpuprofile", "", "create cpu profile") + bindIP = flag.String("bind", "", "IP address from which to make outbound connections") dnsMap = dnsMapFlag(make(map[string]string)) excludes []*regexp.Regexp @@ -338,6 +340,14 @@ func main() { httpClient = crawl.NewHTTPClientWithDNSOverride(dnsMap) + if *bindIP != "" { + addr, err := net.ResolveIPAddr("ip", *bindIP) + if err != nil { + log.Fatal(err) + } + httpClient = crawl.NewHTTPClientWithLocalAddrOverride(addr) + } + crawler, err := crawl.NewCrawler( *dbPath, seeds, -- cgit v1.2.3-54-g00ecf