aboutsummaryrefslogtreecommitdiff
path: root/cmd/links
diff options
context:
space:
mode:
authorale <ale@incal.net>2014-12-19 13:55:05 +0000
committerale <ale@incal.net>2014-12-19 13:55:05 +0000
commitb09f05f8137e5bbc27a0a306de0529c59d3f2c28 (patch)
tree37e66968c2eb0e361a1f284804a86fe339e68b78 /cmd/links
downloadcrawl-b09f05f8137e5bbc27a0a306de0529c59d3f2c28.tar.gz
crawl-b09f05f8137e5bbc27a0a306de0529c59d3f2c28.zip
initial commit
Diffstat (limited to 'cmd/links')
-rw-r--r--cmd/links/links.go75
1 files changed, 75 insertions, 0 deletions
diff --git a/cmd/links/links.go b/cmd/links/links.go
new file mode 100644
index 0000000..3ba63be
--- /dev/null
+++ b/cmd/links/links.go
@@ -0,0 +1,75 @@
+// A restartable crawler that extracts links from HTML pages and
+// feeds them back into the crawl queue.
+//
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "net/http"
+ "net/url"
+ "strings"
+
+ "git.autistici.org/ale/crawl"
+ "github.com/PuerkitoBio/goquery"
+)
+
+// Command-line flags. Each variable holds the pointer returned by the flag
+// package, so the values are only meaningful once flag.Parse has run.
+var (
+	// validSchemes is a comma-separated list of allowed URL schemes.
+	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	// depth is the maximum link depth.
+	depth = flag.Int("depth", 10, "maximum link depth")
+	// concurrency is the requested number of concurrent workers.
+	// NOTE(review): nothing else in this diff references it.
+	concurrency = flag.Int("c", 10, "concurrent workers")
+	// dbPath is the crawl state database path.
+	dbPath = flag.String("state", "crawldb", "crawl state database path")
+)
+
+// linkMatches lists the element/attribute pairs that are searched for
+// outgoing URLs in a parsed HTML document.
+var linkMatches = []struct {
+	tag  string // element name to select
+	attr string // attribute on that element holding the URL
+}{
+	{tag: "a", attr: "href"},
+	{tag: "link", attr: "href"},
+	{tag: "img", attr: "src"},
+	{tag: "script", attr: "src"},
+}
+
+// extractLinks is the crawl handler. For a successfully fetched response
+// whose Content-Type starts with "text/html" it parses the document,
+// collects the URLs found in the tag/attribute pairs listed in linkMatches,
+// resolves them against the request URL, and enqueues each distinct URL at
+// depth+1.
+//
+// A non-nil err (the fetch error handed in by the caller) is returned as-is
+// without touching resp. Non-HTML responses are skipped and yield nil. A
+// document parse failure is returned to the caller.
+func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
+	// A failed fetch leaves no usable response; dereferencing resp here
+	// would panic, so bail out first.
+	if err != nil {
+		return err
+	}
+	if resp == nil {
+		return nil
+	}
+
+	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
+		return nil
+	}
+
+	doc, err := goquery.NewDocumentFromResponse(resp)
+	if err != nil {
+		return err
+	}
+
+	// Keyed by the absolute URL string so each target is enqueued once.
+	links := make(map[string]*url.URL)
+
+	for _, lm := range linkMatches {
+		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+			// The selector already requires the attribute, so the
+			// "exists" result carries no extra information.
+			val, _ := s.Attr(lm.attr)
+			// Resolve relative references against the page URL;
+			// values that fail to parse are skipped.
+			if linkurl, err := resp.Request.URL.Parse(val); err == nil {
+				links[linkurl.String()] = linkurl
+			}
+		})
+	}
+
+	for _, link := range links {
+		//log.Printf("%s -> %s", u, link.String())
+		c.Enqueue(link, depth+1)
+	}
+	return nil
+}
+
+// main parses the command-line flags, builds a crawl scope from the seed
+// URLs given as positional arguments, and runs the crawler with http.Get as
+// the fetcher and extractLinks as the handler.
+func main() {
+	flag.Parse()
+
+	seeds := crawl.MustParseURLs(flag.Args())
+	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+
+	// Honor the -state flag; its default is "crawldb", so behavior without
+	// the flag is unchanged.
+	crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
+	if err != nil {
+		log.Fatal(err)
+	}
+	// NOTE(review): the -c (concurrency) flag is not passed on here; wiring
+	// it up depends on the crawl package API, which is not visible in this diff.
+	crawler.Run()
+}