aboutsummaryrefslogtreecommitdiff
path: root/crawler.go
diff options
context:
space:
mode:
author ale <ale@incal.net> 2014-12-20 10:39:53 +0000
committer ale <ale@incal.net> 2014-12-20 10:39:53 +0000
commit d4c561c23d016cf6a7507840153e835994915cb8 (patch)
tree 53d25b35a90e5c4e475915ee8746ec24312812d9 /crawler.go
parent b09f05f8137e5bbc27a0a306de0529c59d3f2c28 (diff)
download crawl-d4c561c23d016cf6a7507840153e835994915cb8.tar.gz
crawl-d4c561c23d016cf6a7507840153e835994915cb8.zip
move the WARC code into its own package
Now generates well-formed, indexable WARC files.
Diffstat (limited to 'crawler.go')
-rw-r--r-- crawler.go 7
1 files changed, 4 insertions, 3 deletions
diff --git a/crawler.go b/crawler.go
index ed43b1f..709ff3c 100644
--- a/crawler.go
+++ b/crawler.go
@@ -272,8 +272,9 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
return c, nil
}
-// Run the crawl, does not exit until it is done.
-func (c *Crawler) Run() {
+// Run the crawl with the specified number of workers. This function
+// does not exit until all work is done (no URLs left in the queue).
+func (c *Crawler) Run(concurrency int) {
// Load initial seeds into the queue.
for _, u := range c.seeds {
c.Enqueue(u, 0)
@@ -282,7 +283,7 @@ func (c *Crawler) Run() {
// Start some runners and wait until they're done.
var wg sync.WaitGroup
ch := c.process()
- for i := 0; i < 3; i++ {
+ for i := 0; i < concurrency; i++ {
wg.Add(1)
go func() {
c.urlHandler(ch)