author    ale <ale@incal.net>    2017-12-19 08:36:02 +0000
committer ale <ale@incal.net>    2017-12-19 08:36:02 +0000
commit    6f5bef5ffb58aab818cb46ad14310d2874cb1492 (patch)
tree      9b0928f0b48496cd55b39d431a510379f07ba9f2
parent    979f2e8d216629f8beea4bb224686c73ddd0c4b6 (diff)
download  crawl-6f5bef5ffb58aab818cb46ad14310d2874cb1492.tar.gz
          crawl-6f5bef5ffb58aab818cb46ad14310d2874cb1492.zip
Use a global http.Client with sane settings
-rw-r--r--    client.go          | 31
-rw-r--r--    cmd/crawl/crawl.go | 24
2 files changed, 47 insertions(+), 8 deletions(-)
diff --git a/client.go b/client.go
new file mode 100644
index 0000000..c0c2626
--- /dev/null
+++ b/client.go
@@ -0,0 +1,31 @@
+package crawl
+
+import (
+ "crypto/tls"
+ "net/http"
+ "net/http/cookiejar"
+ "time"
+)
+
+var defaultClientTimeout = 60 * time.Second
+
+// DefaultClient is an http.Client suitable for crawling: it does
+// not follow redirects, accepts invalid TLS certificates, and sets
+// a reasonable timeout on requests.
+var DefaultClient *http.Client
+
+func init() {
+	jar, _ := cookiejar.New(nil) // New never fails with nil options
+ DefaultClient = &http.Client{
+ Timeout: defaultClientTimeout,
+ Transport: &http.Transport{
+ TLSClientConfig: &tls.Config{
+ InsecureSkipVerify: true,
+ },
+ },
+ CheckRedirect: func(req *http.Request, via []*http.Request) error {
+ return http.ErrUseLastResponse
+ },
+ Jar: jar,
+ }
+}
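For a sense of how the new client behaves from a caller's point of view, here is a minimal sketch (the import path and URL are assumptions, not part of this commit). Because CheckRedirect returns http.ErrUseLastResponse, a 3xx response is handed back to the caller with a nil error instead of being followed:

    package main

    import (
        "fmt"
        "log"

        "git.autistici.org/ale/crawl" // assumed import path
    )

    func main() {
        resp, err := crawl.DefaultClient.Get("https://example.com/old")
        if err != nil {
            log.Fatal(err) // transport-level failure (DNS, TLS, timeout)
        }
        defer resp.Body.Close()

        // Redirects are not followed, so a 301/302 arrives here as an
        // ordinary response and the crawler decides what to do with it.
        if resp.StatusCode >= 300 && resp.StatusCode < 400 {
            fmt.Println("redirects to:", resp.Header.Get("Location"))
            return
        }
        fmt.Println("status:", resp.Status)
    }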
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index e31f63e..abf2b42 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -11,6 +11,7 @@ import (
"log"
"net/http"
"os"
+ "runtime/pprof"
"strconv"
"strings"
"sync"
@@ -30,6 +31,8 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
+
+ cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
)
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -147,14 +150,10 @@ func (c *crawlStats) Dump() {
fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states)
}
-var (
- stats *crawlStats
-
- client *http.Client
-)
+var stats *crawlStats
func fetch(urlstr string) (*http.Response, error) {
- resp, err := client.Get(urlstr)
+ resp, err := crawl.DefaultClient.Get(urlstr)
if err == nil {
stats.Update(resp)
}
@@ -162,8 +161,6 @@ func fetch(urlstr string) (*http.Response, error) {
}
func init() {
- client = &http.Client{}
-
stats = &crawlStats{
states: make(map[int]int),
start: time.Now(),
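One property of the shared client that is easy to miss: http.Client.Timeout bounds the whole exchange, headers and body included, not just connection setup. A self-contained sketch of the effect, using a short timeout and a deliberately slow test server (both made up for illustration; DefaultClient uses 60 seconds):

    package main

    import (
        "fmt"
        "io"
        "net/http"
        "net/http/httptest"
        "time"
    )

    func main() {
        // A test server that stalls for longer than the client timeout.
        srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            time.Sleep(2 * time.Second)
            io.WriteString(w, "too late")
        }))
        defer srv.Close()

        c := &http.Client{Timeout: 1 * time.Second}
        _, err := c.Get(srv.URL)
        fmt.Println(err) // context deadline exceeded (Client.Timeout exceeded while awaiting headers)
    }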
@@ -191,6 +188,17 @@ func (b *byteCounter) Read(buf []byte) (int, error) {
func main() {
flag.Parse()
+ if *cpuprofile != "" {
+ f, err := os.Create(*cpuprofile)
+ if err != nil {
+ log.Fatal("could not create CPU profile: ", err)
+ }
+ if err := pprof.StartCPUProfile(f); err != nil {
+ log.Fatal("could not start CPU profile: ", err)
+ }
+ defer pprof.StopCPUProfile()
+ }
+
outf, err := os.Create(*outputFile)
if err != nil {
log.Fatal(err)
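With the new flag, a profiling run could look like this (the binary name and seed URL are assumptions; go tool pprof is the standard analysis tool):

    ./crawl -cpuprofile=cpu.prof http://example.com/
    go tool pprof cpu.prof

Note that pprof.StopCPUProfile is only reached through the deferred call, and log.Fatal exits via os.Exit without running defers, so the profile is flushed cleanly only when main returns normally.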