From 4cd67e7234943baf31b2e122f8ee3c70c21fb489 Mon Sep 17 00:00:00 2001 From: ale Date: Tue, 19 Dec 2017 00:12:11 +0000 Subject: Add tags (primary/related) to links This change allows more complex scope boundaries, including loosening edges a bit to include related resources of HTML pages (which makes for more complete archives if desired). --- analysis/links.go | 49 ++++++++++++++++++++++++++---------------- cmd/crawl/crawl.go | 18 ++++++++++------ crawler.go | 40 ++++++++++++++++++++++------------ crawler_test.go | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ scope.go | 63 +++++++++++++++++++++++++++++++++++++++++++++++------- scope_test.go | 2 +- 6 files changed, 181 insertions(+), 49 deletions(-) create mode 100644 crawler_test.go diff --git a/analysis/links.go b/analysis/links.go index 9fdf8fb..5d61547 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -6,31 +6,39 @@ import ( "fmt" "io/ioutil" "net/http" - "net/url" "regexp" "strings" "github.com/PuerkitoBio/goquery" + + "git.autistici.org/ale/crawl" ) var ( - urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`) + urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`) linkMatches = []struct { - tag string - attr string + tag string + attr string + linkTag int }{ - {"a", "href"}, - {"link", "href"}, - {"img", "src"}, - {"script", "src"}, + {"a", "href", crawl.TagPrimary}, + {"link", "href", crawl.TagRelated}, + {"img", "src", crawl.TagRelated}, + {"script", "src", crawl.TagRelated}, } ) +// The unparsed version of an Outlink. +type rawOutlink struct { + URL string + Tag int +} + // GetLinks returns all the links found in a document. Currently only // parses HTML pages and CSS stylesheets. -func GetLinks(resp *http.Response) ([]*url.URL, error) { - var outlinks []string +func GetLinks(resp *http.Response) ([]crawl.Outlink, error) { + var outlinks []rawOutlink ctype := resp.Header.Get("Content-Type") if strings.HasPrefix(ctype, "text/html") { @@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) { for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) - outlinks = append(outlinks, val) + outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } } else if strings.HasPrefix(ctype, "text/css") { @@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) { // expression to extract "url()" links from CSS. if data, err := ioutil.ReadAll(resp.Body); err == nil { for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) { - outlinks = append(outlinks, val[1]) + outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated}) } } } // Parse outbound links relative to the request URI, and // return unique results. 
- var result []*url.URL - links := make(map[string]*url.URL) - for _, val := range outlinks { - if linkurl, err := resp.Request.URL.Parse(val); err == nil { - links[linkurl.String()] = linkurl + var result []crawl.Outlink + links := make(map[string]crawl.Outlink) + for _, l := range outlinks { + if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil { + links[linkurl.String()] = crawl.Outlink{ + URL: linkurl, + Tag: l.Tag, + } } } - for _, u := range links { - result = append(result, u) + for _, l := range links { + result = append(result, l) } return result, nil } diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 3954682..e31f63e 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -23,12 +23,13 @@ import ( ) var ( - dbPath = flag.String("state", "crawldb", "crawl state database path") - keepDb = flag.Bool("keep", false, "keep the state database when done") - concurrency = flag.Int("c", 10, "concurrent workers") - depth = flag.Int("depth", 10, "maximum link depth") - validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") - outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") + dbPath = flag.String("state", "crawldb", "crawl state database path") + keepDb = flag.Bool("keep", false, "keep the state database when done") + concurrency = flag.Int("c", 10, "concurrent workers") + depth = flag.Int("depth", 10, "maximum link depth") + validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") + alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)") + outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") ) func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { @@ -196,11 +197,14 @@ func main() { } seeds := crawl.MustParseURLs(flag.Args()) - scope := []crawl.Scope{ + scope := crawl.AND( crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewDepthScope(*depth), crawl.NewSeedScope(seeds), crawl.NewRegexpIgnoreScope(nil), + ) + if *alwaysIncludeRelated { + scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) } w := warc.NewWriter(outf) diff --git a/crawler.go b/crawler.go index f1edc2d..9fad2ef 100644 --- a/crawler.go +++ b/crawler.go @@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error { return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj) } +// Outlink is a tagged outbound link. +type Outlink struct { + URL *url.URL + Tag int +} + +const ( + // TagPrimary is a primary reference (another web page). + TagPrimary = iota + + // TagRelated is a secondary resource, related to a page. + TagRelated +) + // URLInfo stores information about a crawled URL. type URLInfo struct { URL string @@ -118,7 +132,7 @@ type Crawler struct { db *gobDB queue *queue seeds []*url.URL - scopes []Scope + scope Scope fetcher Fetcher handler Handler @@ -126,17 +140,15 @@ type Crawler struct { } // Enqueue a (possibly new) URL for processing. -func (c *Crawler) Enqueue(u *url.URL, depth int) { - // Normalize the URL. - urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery) - - // See if it's in scope. Checks are ANDed. - for _, sc := range c.scopes { - if !sc.Check(u, depth) { - return - } +func (c *Crawler) Enqueue(link Outlink, depth int) { + // See if it's in scope. 
+	if !c.scope.Check(link, depth) {
+		return
 	}
 
+	// Normalize the URL.
+	urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
+
 	// Protect the read-modify-update below with a mutex.
 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
 		fetcher: f,
 		handler: h,
 		seeds:   seeds,
-		scopes:  scopes,
+		scope:   scope,
 	}
 
 	// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
 func (c *Crawler) Run(concurrency int) {
 	// Load initial seeds into the queue.
 	for _, u := range c.seeds {
-		c.Enqueue(u, 0)
+		c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
 	}
 
 	// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.
 				if err != nil {
 					log.Printf("error parsing Location header: %v", err)
 				} else {
-					c.Enqueue(locationURL, depth+1)
+					c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
 				}
 			}
 		} else {
diff --git a/crawler_test.go b/crawler_test.go
new file mode 100644
index 0000000..66acbe4
--- /dev/null
+++ b/crawler_test.go
@@ -0,0 +1,58 @@
+package crawl
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+)
+
+func TestCrawler(t *testing.T) {
+	dir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(dir)
+
+	// Run a trivial test http server just so our test Fetcher can
+	// return a real http.Response object.
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		io.WriteString(w, "hello")
+	}))
+	defer srv.Close()
+
+	seeds := MustParseURLs([]string{srv.URL})
+	scope := AND(
+		NewSchemeScope([]string{"http"}),
+		NewSeedScope(seeds),
+		NewDepthScope(2),
+	)
+
+	var crawledPages int
+	h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+		crawledPages++
+		next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
+		log.Printf("%s -> %s", u, next)
+		c.Enqueue(Outlink{
+			URL: mustParseURL(next),
+			Tag: TagPrimary,
+		}, depth+1)
+		return nil
+	})
+
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
+	if err != nil {
+		t.Fatal("NewCrawler", err)
+	}
+
+	crawler.Run(1)
+	crawler.Close()
+
+	if crawledPages != 2 {
+		t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
+	}
+}
diff --git a/scope.go b/scope.go
index 6a63018..b2e90ea 100644
--- a/scope.go
+++ b/scope.go
@@ -10,14 +10,14 @@ import (
 // Scope defines the crawling scope.
 type Scope interface {
 	// Check a URL to see if it's in scope for crawling.
- Check(*url.URL, int) bool + Check(Outlink, int) bool } type maxDepthScope struct { maxDepth int } -func (s *maxDepthScope) Check(uri *url.URL, depth int) bool { +func (s *maxDepthScope) Check(_ Outlink, depth int) bool { return depth < s.maxDepth } @@ -31,8 +31,8 @@ type schemeScope struct { allowedSchemes map[string]struct{} } -func (s *schemeScope) Check(uri *url.URL, depth int) bool { - _, ok := s.allowedSchemes[uri.Scheme] +func (s *schemeScope) Check(link Outlink, depth int) bool { + _, ok := s.allowedSchemes[link.URL.Scheme] return ok } @@ -81,8 +81,8 @@ type urlPrefixScope struct { prefixes URLPrefixMap } -func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool { - return s.prefixes.Contains(uri) +func (s *urlPrefixScope) Check(link Outlink, depth int) bool { + return s.prefixes.Contains(link.URL) } // NewURLPrefixScope returns a Scope that limits the crawl to a set of @@ -105,8 +105,8 @@ type regexpIgnoreScope struct { ignores []*regexp.Regexp } -func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool { - uriStr := uri.String() +func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool { + uriStr := link.URL.String() for _, i := range s.ignores { if i.MatchString(uriStr) { return false @@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope { } return &r } + +// NewIncludeRelatedScope always includes resources with TagRelated. +func NewIncludeRelatedScope() Scope { + return &includeRelatedScope{} +} + +type includeRelatedScope struct{} + +func (s *includeRelatedScope) Check(link Outlink, _ int) bool { + return link.Tag == TagRelated +} + +// AND performs a boolean AND. +func AND(elems ...Scope) Scope { + return &andScope{elems: elems} +} + +type andScope struct { + elems []Scope +} + +func (s *andScope) Check(link Outlink, depth int) bool { + for _, e := range s.elems { + if !e.Check(link, depth) { + return false + } + } + return true +} + +// OR performs a boolean OR. +func OR(elems ...Scope) Scope { + return &orScope{elems: elems} +} + +type orScope struct { + elems []Scope +} + +func (s *orScope) Check(link Outlink, depth int) bool { + for _, e := range s.elems { + if e.Check(link, depth) { + return true + } + } + return false +} diff --git a/scope_test.go b/scope_test.go index bccf93c..95366bb 100644 --- a/scope_test.go +++ b/scope_test.go @@ -19,7 +19,7 @@ type testScopeEntry struct { func runScopeTest(t *testing.T, sc Scope, testdata []testScopeEntry) { for _, td := range testdata { uri := mustParseURL(td.uri) - result := sc.Check(uri, td.depth) + result := sc.Check(Outlink{URL: uri, Tag: TagPrimary}, td.depth) if result != td.expected { t.Errorf("Check(%s, %d) -> got %v, want %v", td.uri, td.depth, result, td.expected) } -- cgit v1.2.3-54-g00ecf
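
For context, a minimal sketch of how the reworked API composes after this change, following the usage in cmd/crawl/crawl.go and crawler_test.go above. The seed URL, state database path, depth limit, and the "git.autistici.org/ale/crawl/analysis" import path are illustrative assumptions rather than values taken from this patch:

package main

import (
	"log"
	"net/http"

	"git.autistici.org/ale/crawl"
	"git.autistici.org/ale/crawl/analysis" // assumed import path for the analysis package
)

func main() {
	seeds := crawl.MustParseURLs([]string{"http://example.com/"})

	// Scopes are now composed explicitly with AND/OR instead of being
	// passed as a list that is implicitly ANDed.
	scope := crawl.AND(
		crawl.NewSchemeScope([]string{"http", "https"}),
		crawl.NewDepthScope(2),
		crawl.NewSeedScope(seeds),
	)
	// Optionally loosen the boundary for related resources (css, images,
	// scripts), as the new -include-related flag does.
	scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())

	handler := crawl.HandlerFunc(func(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
		// GetLinks now returns tagged Outlinks; the tag travels with the
		// link so the scope can inspect it in Check().
		links, err := analysis.GetLinks(resp)
		if err != nil {
			return err
		}
		for _, link := range links {
			c.Enqueue(link, depth+1)
		}
		return nil
	})

	c, err := crawl.NewCrawler("crawl.db", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(handler))
	if err != nil {
		log.Fatal(err)
	}
	c.Run(1)
	c.Close()
}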