diff options
-rw-r--r-- | cmd/crawl/crawl.go | 48 | ||||
-rw-r--r-- | scope.go | 22 |
2 files changed, 58 insertions, 12 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index bbbd65b..d0ff268 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -3,6 +3,7 @@ package main import ( + "bufio" "bytes" "flag" "fmt" @@ -12,6 +13,7 @@ import ( "net/http" "os" "os/signal" + "regexp" "runtime/pprof" "strconv" "strings" @@ -33,10 +35,52 @@ var ( validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope") outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") + cpuprofile = flag.String("cpuprofile", "", "create cpu profile") - cpuprofile = flag.String("cpuprofile", "", "create cpu profile") + excludes []*regexp.Regexp ) +func init() { + flag.Var(&excludesFlag{}, "exclude", "exclude regex URL patterns") + flag.Var(&excludesFileFlag{}, "exclude-from-file", "load exclude regex URL patterns from a file") +} + +type excludesFlag struct{} + +func (f *excludesFlag) String() string { return "" } + +func (f *excludesFlag) Set(s string) error { + rx, err := regexp.Compile(s) + if err != nil { + return err + } + excludes = append(excludes, rx) + return nil +} + +type excludesFileFlag struct{} + +func (f *excludesFileFlag) String() string { return "" } + +func (f *excludesFileFlag) Set(s string) error { + ff, err := os.Open(s) + if err != nil { + return err + } + defer ff.Close() // nolint + var lineNum int + scanner := bufio.NewScanner(ff) + for scanner.Scan() { + lineNum++ + rx, err := regexp.Compile(scanner.Text()) + if err != nil { + return fmt.Errorf("%s, line %d: %v", s, lineNum, err) + } + excludes = append(excludes, rx) + } + return nil +} + func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { links, err := analysis.GetLinks(resp) if err != nil { @@ -221,7 +265,7 @@ func main() { crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewDepthScope(*depth), 
crawl.NewSeedScope(seeds), - crawl.NewRegexpIgnoreScope(nil), + crawl.NewRegexpIgnoreScope(excludes), ) if !*excludeRelated { scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) @@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool { return true } +func compileDefaultIgnorePatterns() []*regexp.Regexp { + out := make([]*regexp.Regexp, 0, len(defaultIgnorePatterns)) + for _, p := range defaultIgnorePatterns { + out = append(out, regexp.MustCompile(p)) + } + return out +} + // NewRegexpIgnoreScope returns a Scope that filters out URLs // according to a list of regular expressions. -func NewRegexpIgnoreScope(ignores []string) Scope { - if ignores == nil { - ignores = defaultIgnorePatterns - } - r := regexpIgnoreScope{ - ignores: make([]*regexp.Regexp, 0, len(ignores)), - } - for _, i := range ignores { - r.ignores = append(r.ignores, regexp.MustCompile(i)) +func NewRegexpIgnoreScope(ignores []*regexp.Regexp) Scope { + ignores = append(compileDefaultIgnorePatterns(), ignores...) + return &regexpIgnoreScope{ + ignores: ignores, } - return &r } // NewIncludeRelatedScope always includes resources with TagRelated. |