diff options
author | ale <ale@incal.net> | 2014-12-20 10:39:53 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2014-12-20 10:39:53 +0000 |
commit | d4c561c23d016cf6a7507840153e835994915cb8 (patch) | |
tree | 53d25b35a90e5c4e475915ee8746ec24312812d9 /crawler.go | |
parent | b09f05f8137e5bbc27a0a306de0529c59d3f2c28 (diff) | |
download | crawl-d4c561c23d016cf6a7507840153e835994915cb8.tar.gz crawl-d4c561c23d016cf6a7507840153e835994915cb8.zip |
move the WARC code into its own package
Now generates well-formed, indexable WARC files.
Diffstat (limited to 'crawler.go')
-rw-r--r-- | crawler.go | 7 |
1 file changed, 4 insertions, 3 deletions
```diff
@@ -272,8 +272,9 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
 	return c, nil
 }
 
-// Run the crawl, does not exit until it is done.
-func (c *Crawler) Run() {
+// Run the crawl with the specified number of workers. This function
+// does not exit until all work is done (no URLs left in the queue).
+func (c *Crawler) Run(concurrency int) {
 	// Load initial seeds into the queue.
 	for _, u := range c.seeds {
 		c.Enqueue(u, 0)
@@ -282,7 +283,7 @@ func (c *Crawler) Run() {
 	// Start some runners and wait until they're done.
 	var wg sync.WaitGroup
 	ch := c.process()
-	for i := 0; i < 3; i++ {
+	for i := 0; i < concurrency; i++ {
 		wg.Add(1)
 		go func() {
 			c.urlHandler(ch)
```