From 4c82422d2e75b9b8f4d034b1f43fda566416d6af Mon Sep 17 00:00:00 2001
From: ale
Date: Sat, 20 Dec 2014 11:41:24 +0000
Subject: make Scope checking more modular

---
 cmd/crawl/crawl.go |  6 +++-
 cmd/links/links.go |  6 +++-
 crawler.go         | 61 +++++------------------------------
 scope.go           | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+), 55 deletions(-)
 create mode 100644 scope.go

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 1abeca6..8c02089 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -124,7 +124,11 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	w := warc.NewWriter(outf)
 	defer w.Close()
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 95388ce..e89e22d 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -38,7 +38,11 @@ func main() {
 	flag.Parse()
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
 	if err != nil {
diff --git a/crawler.go b/crawler.go
index 709ff3c..fad8b9e 100644
--- a/crawler.go
+++ b/crawler.go
@@ -8,7 +8,6 @@ import (
 	"log"
 	"net/http"
 	"net/url"
-	"strings"
 	"sync"
 	"time"
 
@@ -58,10 +57,6 @@ type URLInfo struct {
 	Error error
 }
 
-type Scope interface {
-	Check(*url.URL, int) bool
-}
-
 type Fetcher interface {
 	Fetch(string) (*http.Response, error)
 }
@@ -86,7 +81,7 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons
 type Crawler struct {
 	db      *gobDB
 	seeds   []*url.URL
-	scope   Scope
+	scopes  []Scope
 	fetcher Fetcher
 	handler Handler
 
@@ -111,9 +106,11 @@ func (c *Crawler) Enqueue(u *url.URL, depth int) {
 	// Normalize the URL.
 	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
 
-	// See if it's in scope.
-	if !c.scope.Check(u, depth) {
-		return
+	// See if it's in scope. Checks are ANDed.
+	for _, sc := range c.scopes {
+		if !sc.Check(u, depth) {
+			return
+		}
 	}
 
 	c.enqueueMx.Lock()
@@ -202,46 +199,6 @@ func (c *Crawler) urlHandler(queue <-chan QueuePair) {
 	}
 }
 
-type seedScope struct {
-	seeds    []*url.URL
-	schemes  map[string]struct{}
-	maxDepth int
-}
-
-func (s *seedScope) Check(u *url.URL, depth int) bool {
-	// Ignore non-allowed schemes.
-	if _, ok := s.schemes[u.Scheme]; !ok {
-		return false
-	}
-
-	// Do not crawl beyond maxDepth.
-	if depth > s.maxDepth {
-		return false
-	}
-
-	// Check each seed prefix.
-	for _, seed := range s.seeds {
-		if u.Host == seed.Host && strings.HasPrefix(u.Path, seed.Path) {
-			return true
-		}
-	}
-	return false
-}
-
-// NewSeedScope returns a Scope that will only allow crawling the seed
-// domains, and not beyond the specified maximum link depth.
-func NewSeedScope(seeds []*url.URL, maxDepth int, allowedSchemes []string) Scope {
-	scope := &seedScope{
-		seeds:    seeds,
-		maxDepth: maxDepth,
-		schemes:  make(map[string]struct{}),
-	}
-	for _, s := range allowedSchemes {
-		scope.schemes[s] = struct{}{}
-	}
-	return scope
-}
-
 func MustParseURLs(urls []string) []*url.URL {
 	// Parse the seed URLs.
 	var parsed []*url.URL
@@ -256,7 +213,7 @@
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -267,7 +224,7 @@
 		fetcher: f,
 		handler: &standardPageHandler{h},
 		seeds:   seeds,
-		scope:   scope,
+		scopes:  scopes,
 	}
 	return c, nil
 }
@@ -321,8 +278,6 @@ func (wrap *standardPageHandler) Handle(c *Crawler, u string, depth int, resp *h
 	}
 	info.Error = err
 
-	//log.Printf("[CRAWL] %+v", info)
-
 	c.UpdateURL(info)
 	return nil
 }
diff --git a/scope.go b/scope.go
new file mode 100644
index 0000000..de909f4
--- /dev/null
+++ b/scope.go
@@ -0,0 +1,94 @@
+package crawl
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+type Scope interface {
+	Check(*url.URL, int) bool
+}
+
+type maxDepthScope struct {
+	maxDepth int
+}
+
+func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+	return depth < s.maxDepth
+}
+
+// NewDepthScope returns a Scope that will limit crawls to a
+// maximum link depth with respect to the crawl seeds.
+func NewDepthScope(maxDepth int) Scope {
+	return &maxDepthScope{maxDepth}
+}
+
+type schemeScope struct {
+	allowedSchemes map[string]struct{}
+}
+
+func (s *schemeScope) Check(uri *url.URL, depth int) bool {
+	_, ok := s.allowedSchemes[uri.Scheme]
+	return ok
+}
+
+// NewSchemeScope limits the crawl to the specified URL schemes.
+func NewSchemeScope(schemes []string) Scope {
+	m := make(map[string]struct{})
+	for _, s := range schemes {
+		m[s] = struct{}{}
+	}
+	return &schemeScope{m}
+}
+
+// A URLPrefixMap makes it easy to check for URL prefixes (even for
+// very large lists). The URL scheme is ignored, along with an
+// eventual "www." prefix.
+type URLPrefixMap map[string]struct{}
+
+func normalizeUrlPrefix(uri *url.URL) string {
+	return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
+}
+
+func (m URLPrefixMap) Add(uri *url.URL) {
+	m[normalizeUrlPrefix(uri)] = struct{}{}
+}
+
+func (m URLPrefixMap) Contains(uri *url.URL) bool {
+	s := strings.TrimPrefix(uri.Host, "www.")
+	for _, p := range strings.Split(uri.Path, "/") {
+		if p == "" {
+			continue
+		}
+		s = fmt.Sprintf("%s/%s", s, p)
+		if _, ok := m[s]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+type urlPrefixScope struct {
+	prefixes URLPrefixMap
+}
+
+func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
+	return s.prefixes.Contains(uri)
+}
+
+// NewURLPrefixScope returns a Scope that limits the crawl to a set of
+// allowed URL prefixes.
+func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
+	return &urlPrefixScope{prefixes}
+}
+
+// NewSeedScope returns a Scope that will only allow crawling the seed
+// prefixes.
+func NewSeedScope(seeds []*url.URL) Scope {
+	pfx := make(URLPrefixMap)
+	for _, s := range seeds {
+		pfx.Add(s)
+	}
+	return NewURLPrefixScope(pfx)
+}
-- 
cgit v1.2.3-54-g00ecf
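
A note on the new design: a scope is now any value implementing the single-method Scope interface, and Crawler.Enqueue simply ANDs together every Check in the slice, so callers can mix the built-in scopes with their own. As a sketch of what that extensibility buys, here is a hypothetical deny-list scope, written as if it lived in the crawl package next to scope.go; the regexScope type, the NewRegexpIgnoreScope name and the example pattern are illustrative only, not part of this commit:

package crawl

import (
	"net/url"
	"regexp"
)

// regexScope drops any URL whose string form matches a caller-supplied
// pattern (e.g. logout links, calendars, binary assets).
type regexScope struct {
	deny *regexp.Regexp
}

// Check implements Scope; returning false stops the URL from being enqueued.
func (s *regexScope) Check(uri *url.URL, depth int) bool {
	return !s.deny.MatchString(uri.String())
}

// NewRegexpIgnoreScope returns a Scope that rejects URLs matching pattern.
func NewRegexpIgnoreScope(pattern string) Scope {
	return &regexScope{deny: regexp.MustCompile(pattern)}
}

A caller would then append something like crawl.NewRegexpIgnoreScope(`\.(css|js)$`) to the []crawl.Scope slice built in cmd/crawl/crawl.go; because Enqueue ANDs all checks, the built-in scopes keep working unchanged alongside it.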
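
The URLPrefixMap semantics deserve a worked example: Add stores a seed with the scheme dropped, a leading "www." stripped from the host and a trailing "/" stripped from the path, while Contains rebuilds the candidate host+path one segment at a time, so any URL at or below a stored prefix matches regardless of scheme or "www.". The test-style sketch below is not part of the commit and uses made-up hosts; it could sit in a scope_test.go next to scope.go:

package crawl

import (
	"net/url"
	"testing"
)

func TestURLPrefixMap(t *testing.T) {
	m := make(URLPrefixMap)
	seed, _ := url.Parse("http://www.example.com/docs/")
	m.Add(seed) // stored under the key "example.com/docs"

	cases := map[string]bool{
		"https://example.com/docs/intro.html": true,  // scheme and "www." are ignored
		"http://www.example.com/docs":         true,  // the prefix itself
		"http://example.com/blog/":            false, // sibling path
		"http://example.org/docs/":            false, // different host
	}
	for uriStr, want := range cases {
		uri, _ := url.Parse(uriStr)
		if got := m.Contains(uri); got != want {
			t.Errorf("Contains(%q) = %v, want %v", uriStr, got, want)
		}
	}
}

One caveat falls out of reading Contains: it only looks up keys after appending at least one path segment, so a seed whose path is empty or just "/" (stored as a bare "example.com") will never match any URL; as committed, seeds are expected to carry a path.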