author     ale <ale@incal.net>  2014-12-19 13:55:05 +0000
committer  ale <ale@incal.net>  2014-12-19 13:55:05 +0000
commit     b09f05f8137e5bbc27a0a306de0529c59d3f2c28 (patch)
tree       37e66968c2eb0e361a1f284804a86fe339e68b78 /cmd
initial commit
Diffstat (limited to 'cmd')
-rw-r--r--  cmd/crawl/crawl.go   178
-rw-r--r--  cmd/links/links.go    75
2 files changed, 253 insertions, 0 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
new file mode 100644
index 0000000..1e5f952
--- /dev/null
+++ b/cmd/crawl/crawl.go
@@ -0,0 +1,178 @@
+// A restartable crawler that dumps everything to a WARC file.
+
+package main
+
+import (
+ "bytes"
+ "flag"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "regexp"
+ "strconv"
+ "strings"
+
+ "git.autistici.org/ale/crawl"
+ "github.com/PuerkitoBio/goquery"
+)
+
+var (
+    dbPath       = flag.String("state", "crawldb", "crawl state database path")
+    concurrency  = flag.Int("c", 10, "concurrent workers")
+    depth        = flag.Int("depth", 10, "maximum link depth")
+    validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+    outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+
+    urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
+)
+
+var linkMatches = []struct {
+    tag  string
+    attr string
+}{
+    {"a", "href"},
+    {"link", "href"},
+    {"img", "src"},
+    {"script", "src"},
+}
+
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+    var outlinks []string
+
+    ctype := resp.Header.Get("Content-Type")
+    if strings.HasPrefix(ctype, "text/html") {
+        doc, err := goquery.NewDocumentFromResponse(resp)
+        if err != nil {
+            return err
+        }
+
+        for _, lm := range linkMatches {
+            doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+                val, _ := s.Attr(lm.attr)
+                outlinks = append(outlinks, val)
+            })
+        }
+    } else if strings.HasPrefix(ctype, "text/css") {
+        if data, err := ioutil.ReadAll(resp.Body); err == nil {
+            for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
+                outlinks = append(outlinks, val[1])
+            }
+        }
+    }
+
+    // Uniquify and parse outbound links.
+    links := make(map[string]*url.URL)
+    for _, val := range outlinks {
+        if linkurl, err := resp.Request.URL.Parse(val); err == nil {
+            links[linkurl.String()] = linkurl
+        }
+    }
+    for _, link := range links {
+        //log.Printf("%s -> %s", u, link.String())
+        c.Enqueue(link, depth+1)
+    }
+
+    return nil
+}
+
+type fakeCloser struct {
+    io.Reader
+}
+
+func (f *fakeCloser) Close() error {
+    return nil
+}
+
+func hdr2str(h http.Header) []byte {
+    var b bytes.Buffer
+    h.Write(&b)
+    return b.Bytes()
+}
+
+type warcSaveHandler struct {
+    warc       *crawl.WarcWriter
+    warcInfoID string
+}
+
+func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+    data, derr := ioutil.ReadAll(resp.Body)
+    if derr != nil {
+        return derr
+    }
+    resp.Body = &fakeCloser{bytes.NewReader(data)}
+
+    // Dump the request.
+    var b bytes.Buffer
+    resp.Request.Write(&b)
+    hdr := crawl.NewWarcHeader()
+    hdr.Set("WARC-Type", "request")
+    hdr.Set("WARC-Target-URI", resp.Request.URL.String())
+    hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
+    hdr.Set("Content-Length", strconv.Itoa(b.Len()))
+    w := h.warc.NewRecord(hdr)
+    w.Write(b.Bytes())
+    w.Close()
+
+    // Dump the response.
+    statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
+    respPayload := bytes.Join([][]byte{
+        []byte(statusLine), hdr2str(resp.Header), data},
+        []byte{'\r', '\n'})
+    hdr = crawl.NewWarcHeader()
+    hdr.Set("WARC-Type", "response")
+    hdr.Set("WARC-Target-URI", resp.Request.URL.String())
+    hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
+    hdr.Set("Content-Length", strconv.Itoa(len(respPayload)))
+    w = h.warc.NewRecord(hdr)
+    w.Write(respPayload)
+    w.Close()
+
+    return extractLinks(c, u, depth, resp, err)
+}
+
+func NewSaveHandler(w *crawl.WarcWriter) crawl.Handler {
+    info := strings.Join([]string{
+        "Software: crawl/1.0\r\n",
+        "Format: WARC File Format 1.0\r\n",
+        "Conformsto: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n",
+    }, "")
+
+    hdr := crawl.NewWarcHeader()
+    hdr.Set("WARC-Type", "warcinfo")
+    hdr.Set("WARC-Warcinfo-ID", hdr.Get("WARC-Record-ID"))
+    hdr.Set("Content-Length", strconv.Itoa(len(info)))
+    hdrw := w.NewRecord(hdr)
+    io.WriteString(hdrw, info)
+    hdrw.Close()
+    return &warcSaveHandler{
+        warc:       w,
+        warcInfoID: hdr.Get("WARC-Record-ID"),
+    }
+}
+
+func main() {
+    flag.Parse()
+
+    outf, err := os.Create(*outputFile)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    seeds := crawl.MustParseURLs(flag.Args())
+    scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+
+    w := crawl.NewWarcWriter(outf)
+    defer w.Close()
+
+    saver := NewSaveHandler(w)
+
+    crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(http.Get), saver)
+    if err != nil {
+        log.Fatal(err)
+    }
+    crawler.Run()
+}
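
A note on the response record built in Handle above: http.Header.Write emits each header line with a trailing CRLF, so joining the status line, the serialized headers, and the body with a single "\r\n" separator yields the blank line that HTTP requires between headers and body. A minimal standalone sketch of that framing, with hypothetical header and body values that are not part of this commit:

package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    // http.Header.Write produces "Key: value\r\n" for every header, so the
    // join separator below supplies the blank line between headers and body.
    h := http.Header{"Content-Type": []string{"text/html"}}
    var hb bytes.Buffer
    h.Write(&hb)

    payload := bytes.Join([][]byte{
        []byte("HTTP/1.1 200 OK"),
        hb.Bytes(),
        []byte("<html>...</html>"),
    }, []byte{'\r', '\n'})

    fmt.Printf("%q\n", payload)
    // "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html>...</html>"
}
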
diff --git a/cmd/links/links.go b/cmd/links/links.go
new file mode 100644
index 0000000..3ba63be
--- /dev/null
+++ b/cmd/links/links.go
@@ -0,0 +1,75 @@
+// A restartable crawler that extracts links from HTML pages and
+// simply prints them.
+//
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "net/http"
+ "net/url"
+ "strings"
+
+ "git.autistici.org/ale/crawl"
+ "github.com/PuerkitoBio/goquery"
+)
+
+var (
+    dbPath       = flag.String("state", "crawldb", "crawl state database path")
+    concurrency  = flag.Int("c", 10, "concurrent workers")
+    depth        = flag.Int("depth", 10, "maximum link depth")
+    validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+)
+
+var linkMatches = []struct {
+    tag  string
+    attr string
+}{
+    {"a", "href"},
+    {"link", "href"},
+    {"img", "src"},
+    {"script", "src"},
+}
+
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+    if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
+        return nil
+    }
+
+    doc, err := goquery.NewDocumentFromResponse(resp)
+    if err != nil {
+        return err
+    }
+
+    links := make(map[string]*url.URL)
+
+    for _, lm := range linkMatches {
+        doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+            val, _ := s.Attr(lm.attr)
+            if linkurl, err := resp.Request.URL.Parse(val); err == nil {
+                links[linkurl.String()] = linkurl
+            }
+        })
+    }
+
+    for _, link := range links {
+        //log.Printf("%s -> %s", u, link.String())
+        c.Enqueue(link, depth+1)
+    }
+    return nil
+}
+
+func main() {
+    flag.Parse()
+
+    seeds := crawl.MustParseURLs(flag.Args())
+    scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+
+    crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
+    if err != nil {
+        log.Fatal(err)
+    }
+    crawler.Run()
+}
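
Between them, the two commands exercise a small slice of the crawl package API: crawl.MustParseURLs, crawl.NewSeedScope, crawl.NewCrawler, crawl.FetcherFunc, crawl.HandlerFunc and Crawler.Enqueue. As a rough sketch of how another tool could plug into the same API, based only on the calls visible in this commit (the logOnly handler and the hard-coded seed URL are hypothetical, not part of the commit):

package main

import (
    "log"
    "net/http"
    "strings"

    "git.autistici.org/ale/crawl"
)

// logOnly is a hypothetical handler that just records what was fetched and
// otherwise leaves the response alone.
func logOnly(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
    if err != nil {
        return err
    }
    log.Printf("fetched %s (depth %d, status %s)", u, depth, resp.Status)
    return nil
}

func main() {
    // Seed list and scope mirror the setup in cmd/links/links.go.
    seeds := crawl.MustParseURLs([]string{"http://example.com/"})
    scope := crawl.NewSeedScope(seeds, 2, strings.Split("http,https", ","))

    crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(logOnly))
    if err != nil {
        log.Fatal(err)
    }
    crawler.Run()
}
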