author    ale <ale@incal.net>    2014-12-19 13:55:05 +0000
committer ale <ale@incal.net>    2014-12-19 13:55:05 +0000
commit    b09f05f8137e5bbc27a0a306de0529c59d3f2c28 (patch)
tree      37e66968c2eb0e361a1f284804a86fe339e68b78 /cmd
initial commit
Diffstat (limited to 'cmd')
-rw-r--r--   cmd/crawl/crawl.go   178
-rw-r--r--   cmd/links/links.go    75
2 files changed, 253 insertions, 0 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
new file mode 100644
index 0000000..1e5f952
--- /dev/null
+++ b/cmd/crawl/crawl.go
@@ -0,0 +1,178 @@
+// A restartable crawler that dumps everything to a WARC file.
+
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"net/url"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"git.autistici.org/ale/crawl"
+	"github.com/PuerkitoBio/goquery"
+)
+
+var (
+	dbPath       = flag.String("state", "crawldb", "crawl state database path")
+	concurrency  = flag.Int("c", 10, "concurrent workers")
+	depth        = flag.Int("depth", 10, "maximum link depth")
+	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+
+	urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
+)
+
+var linkMatches = []struct {
+	tag  string
+	attr string
+}{
+	{"a", "href"},
+	{"link", "href"},
+	{"img", "src"},
+	{"script", "src"},
+}
+
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+	var outlinks []string
+
+	ctype := resp.Header.Get("Content-Type")
+	if strings.HasPrefix(ctype, "text/html") {
+		doc, err := goquery.NewDocumentFromResponse(resp)
+		if err != nil {
+			return err
+		}
+
+		for _, lm := range linkMatches {
+			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+				val, _ := s.Attr(lm.attr)
+				outlinks = append(outlinks, val)
+			})
+		}
+	} else if strings.HasPrefix(ctype, "text/css") {
+		if data, err := ioutil.ReadAll(resp.Body); err == nil {
+			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
+				outlinks = append(outlinks, val[1])
+			}
+		}
+	}
+
+	// Uniquify and parse outbound links.
+	links := make(map[string]*url.URL)
+	for _, val := range outlinks {
+		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
+			links[linkurl.String()] = linkurl
+		}
+	}
+	for _, link := range links {
+		//log.Printf("%s -> %s", u, link.String())
+		c.Enqueue(link, depth+1)
+	}
+
+	return nil
+}
+
+type fakeCloser struct {
+	io.Reader
+}
+
+func (f *fakeCloser) Close() error {
+	return nil
+}
+
+func hdr2str(h http.Header) []byte {
+	var b bytes.Buffer
+	h.Write(&b)
+	return b.Bytes()
+}
+
+type warcSaveHandler struct {
+	warc       *crawl.WarcWriter
+	warcInfoID string
+}
+
+func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+	// Read the body once, then replace it with a re-readable reader so that
+	// both the WARC writer and extractLinks can consume it.
+	data, derr := ioutil.ReadAll(resp.Body)
+	if derr != nil {
+		return derr
+	}
+	resp.Body = &fakeCloser{bytes.NewReader(data)}
+
+	// Dump the request.
+	var b bytes.Buffer
+	resp.Request.Write(&b)
+	hdr := crawl.NewWarcHeader()
+	hdr.Set("WARC-Type", "request")
+	hdr.Set("WARC-Target-URI", resp.Request.URL.String())
+	hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
+	hdr.Set("Content-Length", strconv.Itoa(b.Len()))
+	w := h.warc.NewRecord(hdr)
+	w.Write(b.Bytes())
+	w.Close()
+
+	// Dump the response.
+	statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
+	respPayload := bytes.Join([][]byte{
+		[]byte(statusLine), hdr2str(resp.Header), data},
+		[]byte{'\r', '\n'})
+	hdr = crawl.NewWarcHeader()
+	hdr.Set("WARC-Type", "response")
+	hdr.Set("WARC-Target-URI", resp.Request.URL.String())
+	hdr.Set("WARC-Warcinfo-ID", h.warcInfoID)
+	hdr.Set("Content-Length", strconv.Itoa(len(respPayload)))
+	w = h.warc.NewRecord(hdr)
+	w.Write(respPayload)
+	w.Close()
+
+	return extractLinks(c, u, depth, resp, err)
+}
+
+func NewSaveHandler(w *crawl.WarcWriter) crawl.Handler {
+	info := strings.Join([]string{
+		"Software: crawl/1.0\r\n",
+		"Format: WARC File Format 1.0\r\n",
+		"Conformsto: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n",
+	}, "")
+
+	hdr := crawl.NewWarcHeader()
+	hdr.Set("WARC-Type", "warcinfo")
+	hdr.Set("WARC-Warcinfo-ID", hdr.Get("WARC-Record-ID"))
+	hdr.Set("Content-Length", strconv.Itoa(len(info)))
+	hdrw := w.NewRecord(hdr)
+	io.WriteString(hdrw, info)
+	hdrw.Close()
+	return &warcSaveHandler{
+		warc:       w,
+		warcInfoID: hdr.Get("WARC-Record-ID"),
+	}
+}
+
+func main() {
+	flag.Parse()
+
+	outf, err := os.Create(*outputFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	seeds := crawl.MustParseURLs(flag.Args())
+	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+
+	w := crawl.NewWarcWriter(outf)
+	defer w.Close()
+
+	saver := NewSaveHandler(w)
+
+	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), saver)
+	if err != nil {
+		log.Fatal(err)
+	}
+	crawler.Run()
+}
diff --git a/cmd/links/links.go b/cmd/links/links.go
new file mode 100644
index 0000000..3ba63be
--- /dev/null
+++ b/cmd/links/links.go
@@ -0,0 +1,75 @@
+// A restartable crawler that extracts links from HTML pages and
+// simply prints them.
+//
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"net/http"
+	"net/url"
+	"strings"
+
+	"git.autistici.org/ale/crawl"
+	"github.com/PuerkitoBio/goquery"
+)
+
+var (
+	dbPath       = flag.String("state", "crawldb", "crawl state database path")
+	concurrency  = flag.Int("c", 10, "concurrent workers")
+	depth        = flag.Int("depth", 10, "maximum link depth")
+	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+)
+
+var linkMatches = []struct {
+	tag  string
+	attr string
+}{
+	{"a", "href"},
+	{"link", "href"},
+	{"img", "src"},
+	{"script", "src"},
+}
+
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
+		return nil
+	}
+
+	doc, err := goquery.NewDocumentFromResponse(resp)
+	if err != nil {
+		return err
+	}
+
+	links := make(map[string]*url.URL)
+
+	for _, lm := range linkMatches {
+		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+			val, _ := s.Attr(lm.attr)
+			if linkurl, err := resp.Request.URL.Parse(val); err == nil {
+				links[linkurl.String()] = linkurl
+			}
+		})
+	}
+
+	for _, link := range links {
+		//log.Printf("%s -> %s", u, link.String())
+		c.Enqueue(link, depth+1)
+	}
+	return nil
+}
+
+func main() {
+	flag.Parse()
+
+	seeds := crawl.MustParseURLs(flag.Args())
+	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+
+	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
+	if err != nil {
+		log.Fatal(err)
+	}
+	crawler.Run()
+}
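
Besides HTML, crawl.go also pulls links out of stylesheets: the urlcssRx regular expression captures the target of CSS background url(...) declarations. The snippet below is a small, standalone sketch of what that regex matches; the sample CSS is made up for illustration and is not part of the commit.

	package main

	import (
		"fmt"
		"regexp"
	)

	// Same pattern as urlcssRx in cmd/crawl/crawl.go.
	var urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)

	func main() {
		css := `body { background-image: url("img/bg.png"); }
	.banner { background: #fff url(/sprites.gif) no-repeat; }`

		// Capture group 1 holds the URL with surrounding quotes stripped.
		for _, m := range urlcssRx.FindAllStringSubmatch(css, -1) {
			fmt.Println(m[1]) // img/bg.png, /sprites.gif
		}
	}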
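The response WARC record in warcSaveHandler.Handle stores a full HTTP/1.1 serialization: status line, headers, blank line, body. Because http.Header.Write already terminates each header line with CRLF, joining the three pieces with a single CRLF produces the required blank line before the body. A minimal, stdlib-only sketch of that assembly (the header and body values here are invented for illustration):

	package main

	import (
		"bytes"
		"fmt"
		"net/http"
	)

	func main() {
		h := make(http.Header)
		h.Set("Content-Type", "text/html")

		var hdrBuf bytes.Buffer
		h.Write(&hdrBuf) // each header line already ends in \r\n

		// Same composition as respPayload in warcSaveHandler.Handle.
		payload := bytes.Join([][]byte{
			[]byte("HTTP/1.1 200 OK"),
			hdrBuf.Bytes(),
			[]byte("<html></html>"),
		}, []byte{'\r', '\n'})

		// The CRLF inserted by Join after the headers yields the blank
		// line that separates headers from body.
		fmt.Printf("%q\n", payload)
	}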
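Both commands follow the same wiring: parse seeds, build a seed scope, and hand a fetcher plus a handler to crawl.NewCrawler. The sketch below shows a trivial custom handler plugged in the same way; it assumes the crawl package API exactly as exercised by this commit (MustParseURLs, NewSeedScope, NewCrawler, FetcherFunc, HandlerFunc) and is not part of the commit itself.

	package main

	import (
		"log"
		"net/http"
		"strings"

		"git.autistici.org/ale/crawl"
	)

	// logStatus just reports the URL and HTTP status of every fetched page.
	func logStatus(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
		log.Printf("%s -> %s", u, resp.Status)
		return nil
	}

	func main() {
		// Seed URL, depth, and schemes are hard-coded here for brevity;
		// the real commands take them from command-line flags.
		seeds := crawl.MustParseURLs([]string{"http://example.com/"})
		scope := crawl.NewSeedScope(seeds, 2, strings.Split("http,https", ","))

		crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(logStatus))
		if err != nil {
			log.Fatal(err)
		}
		crawler.Run()
	}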