From d4c561c23d016cf6a7507840153e835994915cb8 Mon Sep 17 00:00:00 2001
From: ale
Date: Sat, 20 Dec 2014 10:39:53 +0000
Subject: move the WARC code into its own package

Now generates well-formed, indexable WARC files.
---
 cmd/crawl/crawl.go | 15 ++++++++-------
 cmd/links/links.go | 2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'cmd')

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 1e5f952..0979d43 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -17,6 +17,7 @@ import (
 	"strings"
 
 	"git.autistici.org/ale/crawl"
+	"git.autistici.org/ale/crawl/warc"
 	"github.com/PuerkitoBio/goquery"
 )
 
@@ -94,7 +95,7 @@ func hdr2str(h http.Header) []byte {
 }
 
 type warcSaveHandler struct {
-	warc *crawl.WarcWriter
+	warc *warc.Writer
 	warcInfoID string
 }
 
@@ -108,7 +109,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 	// Dump the request.
 	var b bytes.Buffer
 	resp.Request.Write(&b)
-	hdr := crawl.NewWarcHeader()
+	hdr := warc.NewHeader()
 	hdr.Set("WARC-Type", "request")
 	hdr.Set("WARC-Target-URI", resp.Request.URL.String())
 	hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
@@ -122,7 +123,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 	respPayload := bytes.Join([][]byte{
 		[]byte(statusLine), hdr2str(resp.Header), data},
 		[]byte{'\r', '\n'})
-	hdr = crawl.NewWarcHeader()
+	hdr = warc.NewHeader()
 	hdr.Set("WARC-Type", "response")
 	hdr.Set("WARC-Target-URI", resp.Request.URL.String())
 	hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
@@ -134,14 +135,14 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 	return extractLinks(c, u, depth, resp, err)
 }
 
-func NewSaveHandler(w *crawl.WarcWriter) crawl.Handler {
+func NewSaveHandler(w *warc.Writer) crawl.Handler {
 	info := strings.Join([]string{
 		"Software: crawl/1.0\r\n",
 		"Format: WARC File Format 1.0\r\n",
 		"Conformsto: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n",
 	}, "")
 
-	hdr := crawl.NewWarcHeader()
+	hdr := warc.NewHeader()
 	hdr.Set("WARC-Type", "warcinfo")
 	hdr.Set("WARC-Warcinfo-ID", hdr.Get("WARC-Record-ID"))
 	hdr.Set("Content-Length", strconv.Itoa(len(info)))
@@ -165,7 +166,7 @@ func main() {
 	seeds := crawl.MustParseURLs(flag.Args())
 	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
 
-	w := crawl.NewWarcWriter(outf)
+	w := warc.NewWriter(outf)
 	defer w.Close()
 
 	saver := NewSaveHandler(w)
@@ -174,5 +175,5 @@ func main() {
 	if err != nil {
 		log.Fatal(err)
 	}
-	crawler.Run()
+	crawler.Run(*concurrency)
 }
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 3ba63be..9ae2394 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -71,5 +71,5 @@ func main() {
 	if err != nil {
 		log.Fatal(err)
 	}
-	crawler.Run()
+	crawler.Run(*concurrency)
 }
-- 
cgit v1.2.3-54-g00ecf