aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2022-02-10 14:54:24 -0700
committerJordan <me@jordan.im>2022-02-10 14:54:24 -0700
commitdc39be70c4b9dbb4cf330ddc7776340551a4764c (patch)
tree98d5ab21f92a8e1db130a3020cbc8803ff21f724
parent2dfdfd864285c5a12cd47e446fc8bdf244fd58f8 (diff)
downloadcrawl-dc39be70c4b9dbb4cf330ddc7776340551a4764c.tar.gz
crawl-dc39be70c4b9dbb4cf330ddc7776340551a4764c.zip
client, crawl: --bind, support making outbound requests from a particular address
-rw-r--r--client.go29
-rw-r--r--cmd/crawl/crawl.go10
2 files changed, 39 insertions, 0 deletions
diff --git a/client.go b/client.go
index c028e42..2284e9e 100644
--- a/client.go
+++ b/client.go
@@ -66,3 +66,32 @@ func NewHTTPClientWithDNSOverride(dnsMap map[string]string) *http.Client {
Jar: jar,
}
}
+
+// NewHTTPClientWithLocalAddrOverride returns an http.Client suitable for
+// crawling whose outbound TCP connections are made from the local address
+// addr.
+//
+// Both the IP and the zone of addr are used, so scoped IPv6 addresses (for
+// example a link-local address such as "fe80::1%eth0") keep their scope. If
+// addr is nil, no local address is pinned and the choice is left to the
+// dialer.
+//
+// The returned client stores cookies in a jar, does not follow redirects
+// (the redirect response itself is returned to the caller), and skips TLS
+// certificate verification.
+func NewHTTPClientWithLocalAddrOverride(addr *net.IPAddr) *http.Client {
+	jar, _ := cookiejar.New(nil) // nolint
+
+	dialer := &net.Dialer{
+		Timeout:   30 * time.Second,
+		KeepAlive: 30 * time.Second,
+	}
+	// Only assign LocalAddr when there is a real address: a nil *net.TCPAddr
+	// stored in the net.Addr interface would yield a non-nil interface value.
+	if addr != nil {
+		dialer.LocalAddr = &net.TCPAddr{
+			IP:   addr.IP,
+			Zone: addr.Zone,
+		}
+	}
+
+	transport := &http.Transport{
+		DialContext: dialer.DialContext,
+		TLSClientConfig: &tls.Config{
+			InsecureSkipVerify: true, // nolint
+		},
+	}
+	return &http.Client{
+		Timeout:   defaultClientTimeout,
+		Transport: transport,
+		// Hand every redirect response back to the caller instead of
+		// following it.
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+		Jar: jar,
+	}
+}
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 0be1572..0fc6af1 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -11,6 +11,7 @@ import (
"io"
"io/ioutil"
"log"
+ "net"
"net/http"
"os"
"os/signal"
@@ -38,6 +39,7 @@ var (
resumeDir = flag.String("resume", "", "path to directory of previous crawl to resume")
warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
+ bindIP = flag.String("bind", "", "IP address from which to make outbound connections")
dnsMap = dnsMapFlag(make(map[string]string))
excludes []*regexp.Regexp
@@ -338,6 +340,14 @@ func main() {
httpClient = crawl.NewHTTPClientWithDNSOverride(dnsMap)
+ if *bindIP != "" {
+ addr, err := net.ResolveIPAddr("ip", *bindIP)
+ if err != nil {
+ log.Fatal(err)
+ }
+ httpClient = crawl.NewHTTPClientWithLocalAddrOverride(addr)
+ }
+
crawler, err := crawl.NewCrawler(
*dbPath,
seeds,