aboutsummaryrefslogtreecommitdiff
path: root/cmd/crawl/crawl.go
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/crawl/crawl.go')
-rw-r--r--cmd/crawl/crawl.go45
1 files changed, 13 insertions, 32 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 8f28bc4..7082d14 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -9,7 +9,6 @@ import (
"flag"
"fmt"
"io"
- "io/ioutil"
"log"
"net"
"net/http"
@@ -181,47 +180,29 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, header []byte, body *
}
-func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error {
- // Read the response body (so we can save it to the WARC
- // output) and replace it with a buffer.
-
- /*
- data, derr := ioutil.ReadAll(resp.Body)
- if derr != nil {
- // Errors at this stage are usually transport-level errors,
- // and as such, retriable.
- return crawl.ErrRetryRequest
- }
- resp.Body = ioutil.NopCloser(bytes.NewReader(data))
- */
-
- // Keep temporary file to store request/response data
- r, _ := ioutil.TempFile("temp", "crawl")
- defer r.Close()
-
- w, _ := os.OpenFile(r.Name(), os.O_RDWR, 0777)
- defer w.Close()
- defer os.Remove(r.Name())
+func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, rBody *os.File, _ error) error {
+ wBody, _ := os.OpenFile(rBody.Name(), os.O_RDWR, 0777)
+ defer wBody.Close()
// Dump the request to the WARC output.
- if werr := resp.Request.Write(w); werr != nil {
+ if werr := resp.Request.Write(wBody); werr != nil {
return werr
}
- if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, r); werr != nil {
+ if werr := h.writeWARCRecord("request", resp.Request.URL.String(), nil, rBody); werr != nil {
return werr
}
// Seek to start; we've written since last read
- if _, err := r.Seek(0, io.SeekStart); err != nil {
+ if _, err := rBody.Seek(0, io.SeekStart); err != nil {
return err
}
- w.Close()
- w, _ = os.OpenFile(r.Name(), os.O_RDWR, 0777)
- defer w.Close()
+ wBody.Close()
+ wBody, _ = os.OpenFile(rBody.Name(), os.O_RDWR, 0777)
+ defer wBody.Close()
// Write response body to tmp file
- if _, err := io.Copy(w, resp.Body); err != nil {
+ if _, err := io.Copy(wBody, resp.Body); err != nil {
return err
}
@@ -231,18 +212,18 @@ func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, re
[][]byte{[]byte(statusLine), hdr2str(resp.Header), []byte("")},
[]byte{'\r', '\n'},
)
- if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, r); werr != nil {
+ if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respHeader, rBody); werr != nil {
return werr
}
// Seek to start; we've written since last read
- if _, err := r.Seek(0, io.SeekStart); err != nil {
+ if _, err := rBody.Seek(0, io.SeekStart); err != nil {
return err
}
h.numWritten++
- return extractLinks(p, u, depth, resp, r, nil)
+ return extractLinks(p, u, depth, resp, rBody, nil)
}
func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {