author    ale <ale@incal.net>  2018-08-31 08:29:14 +0100
committer ale <ale@incal.net>  2018-08-31 08:29:14 +0100
commit    ee1a3d8e5278a4a4e8435f9129852b95a9c22afb (patch)
tree      fd7a42cfff4aed5bd2379feb35f7172287430ba2 /cmd
parent    b3d419486a87c9193c2fd6c16168f600876e0f73 (diff)
Improve error checking
Detect write errors (both on the database and to the WARC output) and abort with an error message. Also fix a bunch of harmless lint warnings.
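
The pattern applied throughout is uniform: calls whose errors were silently discarded now either propagate the failure up to main(), which aborts via log.Fatal, or carry an explicit // nolint marker where a failure is genuinely harmless. Below is a minimal self-contained sketch of the propagation side; the sink type and save helper are hypothetical stand-ins for the WARC record writer, for illustration only:

package main

import (
	"errors"
	"log"
)

// sink is a hypothetical writer whose Write can fail, standing in
// for the WARC record writer used in cmd/crawl/crawl.go.
type sink struct{ full bool }

func (s *sink) Write(p []byte) (int, error) {
	if s.full {
		return 0, errors.New("disk full")
	}
	return len(p), nil
}

func (s *sink) Close() error { return nil }

// save mirrors the shape of the writeWARCRecord helper in the diff
// below: every Write is checked, and the Close error is returned
// rather than dropped.
func save(s *sink, data []byte) error {
	if _, err := s.Write(data); err != nil {
		return err
	}
	return s.Close()
}

func main() {
	if err := save(&sink{full: true}, []byte("payload")); err != nil {
		log.Fatalf("write error, aborting: %v", err)
	}
}
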
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/crawl/crawl.go  107
-rw-r--r--  cmd/links/links.go   10
2 files changed, 69 insertions(+), 48 deletions(-)
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 0e5fc15..3d1120c 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -37,30 +37,24 @@ var (
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
)
-func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
return err
}
for _, link := range links {
- c.Enqueue(link, depth+1)
+ if err := c.Enqueue(link, depth+1); err != nil {
+ return err
+ }
}
return nil
}
-type fakeCloser struct {
- io.Reader
-}
-
-func (f *fakeCloser) Close() error {
- return nil
-}
-
func hdr2str(h http.Header) []byte {
var b bytes.Buffer
- h.Write(&b)
+ h.Write(&b) // nolint
return b.Bytes()
}
@@ -69,43 +63,58 @@ type warcSaveHandler struct {
warcInfoID string
}
+func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
+ hdr := warc.NewHeader()
+ hdr.Set("WARC-Type", typ)
+ hdr.Set("WARC-Target-URI", uri)
+ hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
+ hdr.Set("Content-Length", strconv.Itoa(len(data)))
+
+ w, err := h.warc.NewRecord(hdr)
+ if err != nil {
+ return err
+ }
+ if _, err := w.Write(data); err != nil {
+ return err
+ }
+ return w.Close()
+}
+
func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+ if err != nil {
+ return err
+ }
+
+ // Read the response body (so we can save it to the WARC
+ // output) and replace it with a buffer.
data, derr := ioutil.ReadAll(resp.Body)
if derr != nil {
- return err
+ return derr
}
- resp.Body = &fakeCloser{bytes.NewReader(data)}
+ resp.Body = ioutil.NopCloser(bytes.NewReader(data))
- // Dump the request.
+ // Dump the request to the WARC output.
var b bytes.Buffer
- resp.Request.Write(&b)
- hdr := warc.NewHeader()
- hdr.Set("WARC-Type", "request")
- hdr.Set("WARC-Target-URI", resp.Request.URL.String())
- hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
- hdr.Set("Content-Length", strconv.Itoa(b.Len()))
- w := h.warc.NewRecord(hdr)
- w.Write(b.Bytes())
- w.Close()
+ if werr := resp.Request.Write(&b); werr != nil {
+ return werr
+ }
+ if werr := h.writeWARCRecord("request", resp.Request.URL.String(), b.Bytes()); werr != nil {
+ return werr
+ }
// Dump the response.
statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
respPayload := bytes.Join([][]byte{
[]byte(statusLine), hdr2str(resp.Header), data},
[]byte{'\r', '\n'})
- hdr = warc.NewHeader()
- hdr.Set("WARC-Type", "response")
- hdr.Set("WARC-Target-URI", resp.Request.URL.String())
- hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
- hdr.Set("Content-Length", strconv.Itoa(len(respPayload)))
- w = h.warc.NewRecord(hdr)
- w.Write(respPayload)
- w.Close()
+ if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
+ return werr
+ }
return extractLinks(c, u, depth, resp, err)
}
-func newWarcSaveHandler(w *warc.Writer) crawl.Handler {
+func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {
info := strings.Join([]string{
"Software: crawl/1.0\r\n",
"Format: WARC File Format 1.0\r\n",
@@ -116,13 +125,18 @@ func newWarcSaveHandler(w *warc.Writer) crawl.Handler {
hdr.Set("WARC-Type", "warcinfo")
hdr.Set("WARC-Warcinfo-ID", hdr.Get("WARC-Record-ID"))
hdr.Set("Content-Length", strconv.Itoa(len(info)))
- hdrw := w.NewRecord(hdr)
- io.WriteString(hdrw, info)
- hdrw.Close()
+ hdrw, err := w.NewRecord(hdr)
+ if err != nil {
+ return nil, err
+ }
+ if _, err := io.WriteString(hdrw, info); err != nil {
+ return nil, err
+ }
+ hdrw.Close() // nolint
return &warcSaveHandler{
warc: w,
warcInfoID: hdr.Get("WARC-Record-ID"),
- }
+ }, nil
}
type crawlStats struct {
@@ -149,7 +163,7 @@ func (c *crawlStats) Dump() {
c.lock.Lock()
defer c.lock.Unlock()
rate := float64(c.bytes) / time.Since(c.start).Seconds() / 1000
- fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states)
+ fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states) // nolint
}
var stats *crawlStats
@@ -201,11 +215,6 @@ func main() {
defer pprof.StopCPUProfile()
}
- outf, err := os.Create(*outputFile)
- if err != nil {
- log.Fatal(err)
- }
-
seeds := crawl.MustParseURLs(flag.Args())
scope := crawl.AND(
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
@@ -217,10 +226,17 @@ func main() {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}
+ outf, err := os.Create(*outputFile)
+ if err != nil {
+ log.Fatal(err)
+ }
w := warc.NewWriter(outf)
- defer w.Close()
+ defer w.Close() // nolint
- saver := newWarcSaveHandler(w)
+ saver, err := newWarcSaveHandler(w)
+ if err != nil {
+ log.Fatal(err)
+ }
crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
if err != nil {
@@ -240,13 +256,12 @@ func main() {
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
crawler.Run(*concurrency)
-
crawler.Close()
if signaled.Load().(bool) {
os.Exit(1)
}
if !*keepDb {
- os.RemoveAll(*dbPath)
+ os.RemoveAll(*dbPath) // nolint
}
}
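
A side note on the fakeCloser removal above: the hand-rolled type duplicated a standard-library helper. ioutil.NopCloser wraps an io.Reader in an io.ReadCloser whose Close is a no-op, which is exactly what the handler needs after draining resp.Body. A short standalone sketch of the drain-and-restore idiom (illustrative only, not taken from the commit):

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	resp := &http.Response{
		Body: ioutil.NopCloser(bytes.NewReader([]byte("hello"))),
	}

	// Drain the body once (e.g. to write it to a WARC record)...
	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// ...then restore it so downstream handlers (link extraction
	// in this crawler) can read it again.
	resp.Body = ioutil.NopCloser(bytes.NewReader(data))

	again, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(string(again)) // prints "hello"
}
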
diff --git a/cmd/links/links.go b/cmd/links/links.go
index e8b0fcb..9cd741f 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -15,20 +15,25 @@ import (
)
var (
- dbPath = flag.String("state", "crawldb", "crawl state database path")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
)
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+ if err != nil {
+ return err
+ }
+
links, err := analysis.GetLinks(resp)
if err != nil {
return err
}
for _, link := range links {
- c.Enqueue(link, depth+1)
+ if err := c.Enqueue(link, depth+1); err != nil {
+ return err
+ }
}
return nil
@@ -49,4 +54,5 @@ func main() {
log.Fatal(err)
}
crawler.Run(*concurrency)
+ crawler.Close()
}