diff options
Diffstat (limited to 'warc.go')
-rw-r--r-- | warc.go | 96 |
1 files changed, 96 insertions, 0 deletions
@@ -0,0 +1,96 @@ +package crawl + +import ( + "fmt" + "io" + "time" + + "compress/gzip" + + "code.google.com/p/go-uuid/uuid" +) + +var ( + warcTimeFmt = time.RFC3339 + warcVersion = "WARC/1.0" + warcContentTypes = map[string]string{ + "warcinfo": "application/warc-fields", + "response": "application/http; msgtype=response", + "request": "application/http; msgtype=request", + "metadata": "application/warc-fields", + } +) + +// A Warc header. Header field names are case-sensitive. +type WarcHeader map[string]string + +// Set a header to the specified value. Multiple values are not +// supported. +func (h WarcHeader) Set(key, value string) { + h[key] = value + + // Keep Content-Type in sync with WARC-Type. + if key == "WARC-Type" { + if ct, ok := warcContentTypes[value]; ok { + h["Content-Type"] = ct + } else { + h["Content-Type"] = "application/octet-stream" + } + } +} + +// Get the value of a header. If not found, returns an empty string. +func (h WarcHeader) Get(key string) string { + return h[key] +} + +// Encode the header to a Writer. +func (h WarcHeader) Encode(w io.Writer) { + fmt.Fprintf(w, "%s\r\n", warcVersion) + for hdr, value := range h { + fmt.Fprintf(w, "%s: %s\r\n", hdr, value) + } + fmt.Fprintf(w, "\r\n") +} + +// NewWarcHeader returns a WarcHeader with its own unique ID and the +// current timestamp. +func NewWarcHeader() WarcHeader { + h := make(WarcHeader) + h.Set("WARC-Record-ID", fmt.Sprintf("<%s>", uuid.NewUUID().URN())) + h.Set("WARC-Date", time.Now().Format(warcTimeFmt)) + h.Set("Content-Type", "application/octet-stream") + return h +} + +// WarcWriter can write records to a file in WARC format. +type WarcWriter struct { + writer io.WriteCloser +} + +type recordWriter struct { + io.Writer +} + +func (rw *recordWriter) Close() error { + // Add the end-of-record marker. + fmt.Fprintf(rw, "\r\n\r\n") + return nil +} + +// NewRecord starts a new WARC record with the provided header. The +// caller must call Close on the returned writer before creating the +// next record. +func (w *WarcWriter) NewRecord(hdr WarcHeader) io.WriteCloser { + hdr.Encode(w.writer) + return &recordWriter{w.writer} +} + +// Close the WARC writer and flush all buffers. +func (w *WarcWriter) Close() error { + return w.writer.Close() +} + +func NewWarcWriter(w io.WriteCloser) *WarcWriter { + return &WarcWriter{gzip.NewWriter(w)} +} |