diff options
author | ale <ale@incal.net> | 2014-12-20 10:39:53 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2014-12-20 10:39:53 +0000 |
commit | d4c561c23d016cf6a7507840153e835994915cb8 (patch) | |
tree | 53d25b35a90e5c4e475915ee8746ec24312812d9 /warc | |
parent | b09f05f8137e5bbc27a0a306de0529c59d3f2c28 (diff) | |
download | crawl-d4c561c23d016cf6a7507840153e835994915cb8.tar.gz crawl-d4c561c23d016cf6a7507840153e835994915cb8.zip |
move the WARC code into its own package
Now generates well-formed, indexable WARC files.
Diffstat (limited to 'warc')
-rw-r--r-- | warc/warc.go | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/warc/warc.go b/warc/warc.go new file mode 100644 index 0000000..85e53f7 --- /dev/null +++ b/warc/warc.go @@ -0,0 +1,122 @@ +// Package to write WARC files. + +package warc + +import ( + "fmt" + "io" + "time" + + "compress/gzip" + + "code.google.com/p/go-uuid/uuid" +) + +var ( + warcTimeFmt = time.RFC3339 + warcVersion = "WARC/1.0" + warcContentTypes = map[string]string{ + "warcinfo": "application/warc-fields", + "response": "application/http; msgtype=response", + "request": "application/http; msgtype=request", + "metadata": "application/warc-fields", + } +) + +// A WARC header. Header field names are case-sensitive. +type Header map[string]string + +// Set a header to the specified value. Multiple values are not +// supported. +func (h Header) Set(key, value string) { + h[key] = value + + // Keep Content-Type in sync with WARC-Type. + if key == "WARC-Type" { + if ct, ok := warcContentTypes[value]; ok { + h["Content-Type"] = ct + } else { + h["Content-Type"] = "application/octet-stream" + } + } +} + +// Get the value of a header. If not found, returns an empty string. +func (h Header) Get(key string) string { + return h[key] +} + +// Encode the header to a Writer. +func (h Header) Encode(w io.Writer) { + fmt.Fprintf(w, "%s\r\n", warcVersion) + for hdr, value := range h { + fmt.Fprintf(w, "%s: %s\r\n", hdr, value) + } + fmt.Fprintf(w, "\r\n") +} + +// NewHeader returns a Header with its own unique ID and the +// current timestamp. +func NewHeader() Header { + h := make(Header) + h.Set("WARC-Record-ID", fmt.Sprintf("<%s>", uuid.NewUUID().URN())) + h.Set("WARC-Date", time.Now().Format(warcTimeFmt)) + h.Set("Content-Type", "application/octet-stream") + return h +} + +// Writer can write records to a file in WARC format. It is safe +// for concurrent access, since writes are serialized internally. +type Writer struct { + writer io.WriteCloser + gzwriter *gzip.Writer + lockCh chan bool +} + +type recordWriter struct { + io.Writer + lockCh chan bool +} + +func (rw *recordWriter) Close() error { + // Add the end-of-record marker. + fmt.Fprintf(rw, "\r\n\r\n") + + <-rw.lockCh + + return nil +} + +// NewRecord starts a new WARC record with the provided header. The +// caller must call Close on the returned writer before creating the +// next record. Note that this function may block until that condition +// is satisfied. +func (w *Writer) NewRecord(hdr Header) io.WriteCloser { + w.lockCh <- true + if w.gzwriter != nil { + w.gzwriter.Close() + } + w.gzwriter, _ = gzip.NewWriterLevel(w.writer, gzip.BestCompression) + w.gzwriter.Header.Name = hdr.Get("WARC-Record-ID") + hdr.Encode(w.gzwriter) + return &recordWriter{Writer: w.gzwriter, lockCh: w.lockCh} +} + +// Close the WARC writer and flush all buffers. This will also call +// Close on the wrapped io.WriteCloser object. +func (w *Writer) Close() error { + if err := w.gzwriter.Close(); err != nil { + return err + } + return w.writer.Close() +} + +// NewWriter initializes a new Writer and returns it. +func NewWriter(w io.WriteCloser) *Writer { + return &Writer{ + writer: w, + // Buffering is important here since we're using this + // channel as a semaphore. + lockCh: make(chan bool, 1), + } +} |