From 6f5bef5ffb58aab818cb46ad14310d2874cb1492 Mon Sep 17 00:00:00 2001
From: ale
Date: Tue, 19 Dec 2017 08:36:02 +0000
Subject: Use a global http.Client with sane settings

---
 client.go          | 31 +++++++++++++++++++++++++++++++
 cmd/crawl/crawl.go | 24 ++++++++++++++++--------
 2 files changed, 47 insertions(+), 8 deletions(-)
 create mode 100644 client.go

diff --git a/client.go b/client.go
new file mode 100644
index 0000000..c0c2626
--- /dev/null
+++ b/client.go
@@ -0,0 +1,31 @@
+package crawl
+
+import (
+	"crypto/tls"
+	"net/http"
+	"net/http/cookiejar"
+	"time"
+)
+
+var defaultClientTimeout = 60 * time.Second
+
+// DefaultClient is an http.Client suitable for crawling: it does not
+// follow redirects, accepts invalid TLS certificates, and sets a
+// reasonable timeout on requests.
+var DefaultClient *http.Client
+
+func init() {
+	jar, _ := cookiejar.New(nil)
+	DefaultClient = &http.Client{
+		Timeout: defaultClientTimeout,
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
+		},
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+		Jar: jar,
+	}
+}
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index e31f63e..abf2b42 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -11,6 +11,7 @@ import (
 	"log"
 	"net/http"
 	"os"
+	"runtime/pprof"
 	"strconv"
 	"strings"
 	"sync"
@@ -30,6 +31,8 @@ var (
 	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
 	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
+
+	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
 )
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -147,14 +150,10 @@ func (c *crawlStats) Dump() {
 	fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states)
 }
 
-var (
-	stats *crawlStats
-
-	client *http.Client
-)
+var stats *crawlStats
 
 func fetch(urlstr string) (*http.Response, error) {
-	resp, err := client.Get(urlstr)
+	resp, err := crawl.DefaultClient.Get(urlstr)
 	if err == nil {
 		stats.Update(resp)
 	}
@@ -162,8 +161,6 @@ func fetch(urlstr string) (*http.Response, error) {
 }
 
 func init() {
-	client = &http.Client{}
-
 	stats = &crawlStats{
 		states: make(map[int]int),
 		start:  time.Now(),
@@ -191,6 +188,17 @@ func (b *byteCounter) Read(buf []byte) (int, error) {
 func main() {
 	flag.Parse()
 
+	if *cpuprofile != "" {
+		f, err := os.Create(*cpuprofile)
+		if err != nil {
+			log.Fatal("could not create CPU profile: ", err)
+		}
+		if err := pprof.StartCPUProfile(f); err != nil {
+			log.Fatal("could not start CPU profile: ", err)
+		}
+		defer pprof.StopCPUProfile()
+	}
+
 	outf, err := os.Create(*outputFile)
 	if err != nil {
 		log.Fatal(err)
-- 
cgit v1.2.3-54-g00ecf
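
Because CheckRedirect returns http.ErrUseLastResponse, a Get() on the new
global client hands any 3xx response back to the caller instead of following
it, so a crawler can archive the redirect itself. A minimal sketch of a
caller, assuming the import path and URL shown (both are illustrative and not
part of this patch):

package main

import (
	"fmt"
	"io"
	"log"

	// Assumed import path for the crawl package; adjust to the
	// actual module path.
	"git.autistici.org/ale/crawl"
)

func main() {
	// Placeholder URL; DefaultClient will not follow a redirect here.
	resp, err := crawl.DefaultClient.Get("https://example.com/old-page")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 && resp.StatusCode < 400 {
		// The 3xx response is returned as-is, so the Location
		// header is still available for the caller to record.
		fmt.Println("redirect to:", resp.Header.Get("Location"))
		return
	}
	n, _ := io.Copy(io.Discard, resp.Body)
	fmt.Printf("fetched %d bytes with status %d\n", n, resp.StatusCode)
}

Note that the 60-second Timeout on the client covers the whole exchange,
from dialing through reading the response body, so a fetch that stalls
mid-body is also cut off rather than hanging a crawler worker.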