diff options
Diffstat (limited to 'cmd/crawl/crawl.go')
-rw-r--r-- | cmd/crawl/crawl.go | 45 |
1 files changed, 13 insertions, 32 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 8f28bc4..7082d14 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -9,7 +9,6 @@ import ( "flag" "fmt" "io" - "io/ioutil" "log" "net" "net/http" @@ -181,47 +180,29 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, header []byte, body * } -func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error { - // Read the response body (so we can save it to the WARC - // output) and replace it with a buffer. - - /* - data, derr := ioutil.ReadAll(resp.Body) - if derr != nil { - // Errors at this stage are usually transport-level errors, - // and as such, retriable. - return crawl.ErrRetryRequest - } - resp.Body = ioutil.NopCloser(bytes.NewReader(data)) - */ - - // Keep temporary file to store request/response data - r, _ := ioutil.TempFile("temp", "crawl") - defer r.Close() - - w, _ := os.OpenFile(r.Name(), os.O_RDWR, 0777) - defer w.Close() - defer os.Remove(r.Name()) +func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, rBody *os.File, _ error) error { + wBody, _ := os.OpenFile(rBody.Name(), os.O_RDWR, 0777) + defer wBody.Close() // Dump the request to the WARC output. - if werr := resp.Request.Write(w); werr != nil { + if werr := resp.Request.Write(wBody); werr != nil { return werr } - if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, r); werr != nil { + if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, rBody); werr != nil { return werr } // Seek to start; we've written since last read - if _, err := r.Seek(0, io.SeekStart); err != nil { + if _, err := rBody.Seek(0, io.SeekStart); err != nil { return err } - w.Close() - w, _ = os.OpenFile(r.Name(), os.O_RDWR, 0777) - defer w.Close() + wBody.Close() + wBody, _ = os.OpenFile(rBody.Name(), os.O_RDWR, 0777) + defer wBody.Close() // Write response body to tmp file - if _, err := io.Copy(w, resp.Body); err != nil { + if _, err := io.Copy(wBody, resp.Body); err != nil { return err } @@ -231,18 +212,18 @@ func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, re [][]byte{[]byte(statusLine), hdr2str(resp.Header), []byte("")}, []byte{'\r', '\n'}, ) - if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, r); werr != nil { + if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, rBody); werr != nil { return werr } // Seek to start; we've written since last read - if _, err := r.Seek(0, io.SeekStart); err != nil { + if _, err := rBody.Seek(0, io.SeekStart); err != nil { return err } h.numWritten++ - return extractLinks(p, u, depth, resp, r, nil) + return extractLinks(p, u, depth, resp, rBody, nil) } func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) { |