aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorale <ale@incal.net>2018-09-02 11:16:49 +0100
committerale <ale@incal.net>2018-09-02 11:16:49 +0100
commit66ce654d5be9c26ba69cc75ac12ff6662410c69d (patch)
tree484ce9f4d6444dbfe0a9595e8cfae5e6bf4e337c
parenta5d20a9a30397cf2ddc900fc58f66ce8f515f769 (diff)
downloadcrawl-66ce654d5be9c26ba69cc75ac12ff6662410c69d.tar.gz
crawl-66ce654d5be9c26ba69cc75ac12ff6662410c69d.zip
Add --exclude and --exclude-from-file options
Allow users to add to the exclude regexp lists easily.
-rw-r--r--cmd/crawl/crawl.go48
-rw-r--r--scope.go22
2 files changed, 58 insertions, 12 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index bbbd65b..d0ff268 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -3,6 +3,7 @@
package main
import (
+ "bufio"
"bytes"
"flag"
"fmt"
@@ -12,6 +13,7 @@ import (
"net/http"
"os"
"os/signal"
+ "regexp"
"runtime/pprof"
"strconv"
"strings"
@@ -33,10 +35,52 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
+ cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
- cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
+ excludes []*regexp.Regexp
)
+func init() {
+ flag.Var(&excludesFlag{}, "exclude", "exclude regex URL patterns")
+ flag.Var(&excludesFileFlag{}, "exclude-from-file", "load exclude regex URL patterns from a file")
+}
+
+type excludesFlag struct{}
+
+func (f *excludesFlag) String() string { return "" }
+
+func (f *excludesFlag) Set(s string) error {
+ rx, err := regexp.Compile(s)
+ if err != nil {
+ return err
+ }
+ excludes = append(excludes, rx)
+ return nil
+}
+
+type excludesFileFlag struct{}
+
+func (f *excludesFileFlag) String() string { return "" }
+
+func (f *excludesFileFlag) Set(s string) error {
+ ff, err := os.Open(s)
+ if err != nil {
+ return err
+ }
+ defer ff.Close() // nolint
+ var lineNum int
+ scanner := bufio.NewScanner(ff)
+ for scanner.Scan() {
+ lineNum++
+ rx, err := regexp.Compile(scanner.Text())
+ if err != nil {
+ return fmt.Errorf("%s, line %d: %v", s, lineNum, err)
+ }
+ excludes = append(excludes, rx)
+ }
+ return nil
+}
+
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
@@ -221,7 +265,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
- crawl.NewRegexpIgnoreScope(nil),
+ crawl.NewRegexpIgnoreScope(excludes),
)
if !*excludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
diff --git a/scope.go b/scope.go
index b2e90ea..bda1035 100644
--- a/scope.go
+++ b/scope.go
@@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
return true
}
+func compileDefaultIgnorePatterns() []*regexp.Regexp {
+ out := make([]*regexp.Regexp, 0, len(defaultIgnorePatterns))
+ for _, p := range defaultIgnorePatterns {
+ out = append(out, regexp.MustCompile(p))
+ }
+ return out
+}
+
// NewRegexpIgnoreScope returns a Scope that filters out URLs
// according to a list of regular expressions.
-func NewRegexpIgnoreScope(ignores []string) Scope {
- if ignores == nil {
- ignores = defaultIgnorePatterns
- }
- r := regexpIgnoreScope{
- ignores: make([]*regexp.Regexp, 0, len(ignores)),
- }
- for _, i := range ignores {
- r.ignores = append(r.ignores, regexp.MustCompile(i))
+func NewRegexpIgnoreScope(ignores []*regexp.Regexp) Scope {
+ ignores = append(compileDefaultIgnorePatterns(), ignores...)
+ return &regexpIgnoreScope{
+ ignores: ignores,
}
- return &r
}
// NewIncludeRelatedScope always includes resources with TagRelated.