author     ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer  ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit     4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree       c3bf3e88729291ecf0e371d0dd43977cdd1d08ea
parent     77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, such as loosening the edges a bit to include the resources related to HTML pages (which makes for more complete archives, if desired).
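
As a minimal sketch of how the new pieces compose (the seed and stylesheet URLs below are made-up placeholders; everything else is the API introduced in this diff): primary links still have to pass the usual AND of scheme/seed/depth checks, while -include-related ORs in a scope that accepts anything tagged as a related resource.

	package main

	import (
		"fmt"
		"net/url"

		"git.autistici.org/ale/crawl"
	)

	func main() {
		seeds := crawl.MustParseURLs([]string{"https://example.com/"})

		// The same composition cmd/crawl builds: checks are ANDed...
		scope := crawl.AND(
			crawl.NewSchemeScope([]string{"http", "https"}),
			crawl.NewSeedScope(seeds),
			crawl.NewDepthScope(10),
		)
		// ...and -include-related ORs in a scope that always accepts
		// links tagged as related resources.
		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())

		// An off-site stylesheet (placeholder URL): rejected as a primary
		// link by the seed scope, accepted when tagged as related.
		css, _ := url.Parse("https://cdn.example.net/style.css")
		fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagPrimary}, 1)) // false
		fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagRelated}, 1)) // true
	}
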
-rw-r--r--  analysis/links.go    49
-rw-r--r--  cmd/crawl/crawl.go   18
-rw-r--r--  crawler.go           40
-rw-r--r--  crawler_test.go      58
-rw-r--r--  scope.go             63
-rw-r--r--  scope_test.go         2
6 files changed, 181 insertions(+), 49 deletions(-)
diff --git a/analysis/links.go b/analysis/links.go
index 9fdf8fb..5d61547 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -6,31 +6,39 @@ import (
"fmt"
"io/ioutil"
"net/http"
- "net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
+
+ "git.autistici.org/ale/crawl"
)
var (
- urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+ urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
- tag string
- attr string
+ tag string
+ attr string
+ linkTag int
}{
- {"a", "href"},
- {"link", "href"},
- {"img", "src"},
- {"script", "src"},
+ {"a", "href", crawl.TagPrimary},
+ {"link", "href", crawl.TagRelated},
+ {"img", "src", crawl.TagRelated},
+ {"script", "src", crawl.TagRelated},
}
)
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+ URL string
+ Tag int
+}
+
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
- var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+ var outlinks []rawOutlink
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
- outlinks = append(outlinks, val)
+ outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
// expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
- outlinks = append(outlinks, val[1])
+ outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
}
}
// Parse outbound links relative to the request URI, and
// return unique results.
- var result []*url.URL
- links := make(map[string]*url.URL)
- for _, val := range outlinks {
- if linkurl, err := resp.Request.URL.Parse(val); err == nil {
- links[linkurl.String()] = linkurl
+ var result []crawl.Outlink
+ links := make(map[string]crawl.Outlink)
+ for _, l := range outlinks {
+ if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+ links[linkurl.String()] = crawl.Outlink{
+ URL: linkurl,
+ Tag: l.Tag,
+ }
}
}
- for _, u := range links {
- result = append(result, u)
+ for _, l := range links {
+ result = append(result, l)
}
return result, nil
}
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3954682..e31f63e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -23,12 +23,13 @@ import (
)
var (
- dbPath = flag.String("state", "crawldb", "crawl state database path")
- keepDb = flag.Bool("keep", false, "keep the state database when done")
- concurrency = flag.Int("c", 10, "concurrent workers")
- depth = flag.Int("depth", 10, "maximum link depth")
- validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
- outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
+ dbPath = flag.String("state", "crawldb", "crawl state database path")
+ keepDb = flag.Bool("keep", false, "keep the state database when done")
+ concurrency = flag.Int("c", 10, "concurrent workers")
+ depth = flag.Int("depth", 10, "maximum link depth")
+ validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+ alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+ outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
)
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
}
seeds := crawl.MustParseURLs(flag.Args())
- scope := []crawl.Scope{
+ scope := crawl.AND(
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
+ )
+ if *alwaysIncludeRelated {
+ scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}
w := warc.NewWriter(outf)
diff --git a/crawler.go b/crawler.go
index f1edc2d..9fad2ef 100644
--- a/crawler.go
+++ b/crawler.go
@@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error {
return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj)
}
+// Outlink is a tagged outbound link.
+type Outlink struct {
+ URL *url.URL
+ Tag int
+}
+
+const (
+ // TagPrimary is a primary reference (another web page).
+ TagPrimary = iota
+
+ // TagRelated is a secondary resource, related to a page.
+ TagRelated
+)
+
// URLInfo stores information about a crawled URL.
type URLInfo struct {
URL string
@@ -118,7 +132,7 @@ type Crawler struct {
db *gobDB
queue *queue
seeds []*url.URL
- scopes []Scope
+ scope Scope
fetcher Fetcher
handler Handler
@@ -126,17 +140,15 @@ type Crawler struct {
}
// Enqueue a (possibly new) URL for processing.
-func (c *Crawler) Enqueue(u *url.URL, depth int) {
- // Normalize the URL.
- urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
-
- // See if it's in scope. Checks are ANDed.
- for _, sc := range c.scopes {
- if !sc.Check(u, depth) {
- return
- }
+func (c *Crawler) Enqueue(link Outlink, depth int) {
+ // See if it's in scope.
+ if !c.scope.Check(link, depth) {
+ return
}
+ // Normalize the URL.
+ urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
+
// Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock()
defer c.enqueueMx.Unlock()
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
}
// NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
// Open the crawl database.
db, err := newGobDB(path)
if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
fetcher: f,
handler: h,
seeds: seeds,
- scopes: scopes,
+ scope: scope,
}
// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
func (c *Crawler) Run(concurrency int) {
// Load initial seeds into the queue.
for _, u := range c.seeds {
- c.Enqueue(u, 0)
+ c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
}
// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.
if err != nil {
log.Printf("error parsing Location header: %v", err)
} else {
- c.Enqueue(locationURL, depth+1)
+ c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
}
}
} else {
diff --git a/crawler_test.go b/crawler_test.go
new file mode 100644
index 0000000..66acbe4
--- /dev/null
+++ b/crawler_test.go
@@ -0,0 +1,58 @@
+package crawl
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+)
+
+func TestCrawler(t *testing.T) {
+ dir, err := ioutil.TempDir("", "")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(dir)
+
+ // Run a trivial test http server just so our test Fetcher can
+ // return a real http.Response object.
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ io.WriteString(w, "hello")
+ }))
+ defer srv.Close()
+
+ seeds := MustParseURLs([]string{srv.URL})
+ scope := AND(
+ NewSchemeScope([]string{"http"}),
+ NewSeedScope(seeds),
+ NewDepthScope(2),
+ )
+
+ var crawledPages int
+ h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+ crawledPages++
+ next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
+ log.Printf("%s -> %s", u, next)
+ c.Enqueue(Outlink{
+ URL: mustParseURL(next),
+ Tag: TagPrimary,
+ }, depth+1)
+ return nil
+ })
+
+ crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+ if err != nil {
+ t.Fatal("NewCrawler", err)
+ }
+
+ crawler.Run(1)
+ crawler.Close()
+
+ if crawledPages != 2 {
+ t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
+ }
+}
diff --git a/scope.go b/scope.go
index 6a63018..b2e90ea 100644
--- a/scope.go
+++ b/scope.go
@@ -10,14 +10,14 @@ import (
// Scope defines the crawling scope.
type Scope interface {
// Check a URL to see if it's in scope for crawling.
- Check(*url.URL, int) bool
+ Check(Outlink, int) bool
}
type maxDepthScope struct {
maxDepth int
}
-func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
return depth < s.maxDepth
}
@@ -31,8 +31,8 @@ type schemeScope struct {
allowedSchemes map[string]struct{}
}
-func (s *schemeScope) Check(uri *url.URL, depth int) bool {
- _, ok := s.allowedSchemes[uri.Scheme]
+func (s *schemeScope) Check(link Outlink, depth int) bool {
+ _, ok := s.allowedSchemes[link.URL.Scheme]
return ok
}
@@ -81,8 +81,8 @@ type urlPrefixScope struct {
prefixes URLPrefixMap
}
-func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
- return s.prefixes.Contains(uri)
+func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
+ return s.prefixes.Contains(link.URL)
}
// NewURLPrefixScope returns a Scope that limits the crawl to a set of
@@ -105,8 +105,8 @@ type regexpIgnoreScope struct {
ignores []*regexp.Regexp
}
-func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
- uriStr := uri.String()
+func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
+ uriStr := link.URL.String()
for _, i := range s.ignores {
if i.MatchString(uriStr) {
return false
@@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope {
}
return &r
}
+
+// NewIncludeRelatedScope always includes resources with TagRelated.
+func NewIncludeRelatedScope() Scope {
+ return &includeRelatedScope{}
+}
+
+type includeRelatedScope struct{}
+
+func (s *includeRelatedScope) Check(link Outlink, _ int) bool {
+ return link.Tag == TagRelated
+}
+
+// AND performs a boolean AND.
+func AND(elems ...Scope) Scope {
+ return &andScope{elems: elems}
+}
+
+type andScope struct {
+ elems []Scope
+}
+
+func (s *andScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if !e.Check(link, depth) {
+ return false
+ }
+ }
+ return true
+}
+
+// OR performs a boolean OR.
+func OR(elems ...Scope) Scope {
+ return &orScope{elems: elems}
+}
+
+type orScope struct {
+ elems []Scope
+}
+
+func (s *orScope) Check(link Outlink, depth int) bool {
+ for _, e := range s.elems {
+ if e.Check(link, depth) {
+ return true
+ }
+ }
+ return false
+}
diff --git a/scope_test.go b/scope_test.go
index bccf93c..95366bb 100644
--- a/scope_test.go
+++ b/scope_test.go
@@ -19,7 +19,7 @@ type testScopeEntry struct {
func runScopeTest(t *testing.T, sc Scope, testdata []testScopeEntry) {
for _, td := range testdata {
uri := mustParseURL(td.uri)
- result := sc.Check(uri, td.depth)
+ result := sc.Check(Outlink{URL: uri, Tag: TagPrimary}, td.depth)
if result != td.expected {
t.Errorf("Check(%s, %d) -> got %v, want %v", td.uri, td.depth, result, td.expected)
}