diff options
author | ale <ale@incal.net> | 2019-01-19 14:57:40 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2019-01-19 14:57:40 +0000 |
commit | cce28f44e7ad88900e6c53394a8e496f2955b784 (patch) | |
tree | 5a8955c9254c095dd37956def96161342fd71a9d | |
parent | c5ec7eb826bfd08aa6e8dd880efa15930f78ba19 (diff) | |
download | crawl-cce28f44e7ad88900e6c53394a8e496f2955b784.tar.gz crawl-cce28f44e7ad88900e6c53394a8e496f2955b784.zip |
Replace URLInfo with a simple URL presence check
The whole URLInfo structure, while neat, is unused except for the
purpose of verifying if we have already seen a specific URL.
The presence check is also now limited to Enqueue().
-rw-r--r-- | crawler.go | 45 |
1 files changed, 18 insertions, 27 deletions
@@ -92,14 +92,6 @@ const ( TagRelated ) -// URLInfo stores information about a crawled URL. -type URLInfo struct { - URL string - StatusCode int - CrawledAt time.Time - Error string -} - // A Fetcher retrieves contents from remote URLs. type Fetcher interface { // Fetch retrieves a URL and returns the response. @@ -162,6 +154,19 @@ func normalizeURL(u *url.URL) *url.URL { return u2 } +func seenKey(u *url.URL) []byte { + return []byte(fmt.Sprintf("_seen/%s", u.String())) +} + +func (c *Crawler) hasSeen(u *url.URL) bool { + _, err := c.db.Get(seenKey(u), nil) + return err == nil +} + +func (c *Crawler) setSeen(wb *leveldb.Batch, u *url.URL) { + wb.Put(seenKey(u), []byte{}) +} + // Enqueue a (possibly new) URL for processing. func (c *Crawler) Enqueue(link Outlink, depth int) error { // Normalize the URL. We are going to replace link.URL in-place, to @@ -178,22 +183,18 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error { defer c.enqueueMx.Unlock() // Check if we've already seen it. - var info URLInfo - ukey := []byte(fmt.Sprintf("url/%s", link.URL.String())) - if err := c.db.GetObj(ukey, &info); err == nil { + if c.hasSeen(link.URL) { return nil } - // Store the URL in the queue, and store an empty URLInfo to - // make sure that subsequent calls to Enqueue with the same - // URL will fail. + // Store the URL in the queue, and mark it as seen to make + // sure that subsequent calls to Enqueue with the same URL + // will fail. wb := new(leveldb.Batch) if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil { return err } - if err := c.db.PutObjBatch(wb, ukey, &info); err != nil { - return err - } + c.setSeen(wb, link.URL) return c.db.Write(wb, nil) } @@ -230,14 +231,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { return } - // Retrieve the URLInfo object from the crawl db. - // Ignore errors, we can work with an empty object. - urlkey := []byte(fmt.Sprintf("url/%s", p.URL)) - var info URLInfo - c.db.GetObj(urlkey, &info) // nolint - info.CrawledAt = time.Now() - info.URL = p.URL - // Fetch the URL and handle it. Make sure to Close the // response body (even if it gets replaced in the // Response object). @@ -246,7 +239,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { var respBody io.ReadCloser if httpErr == nil { respBody = httpResp.Body - info.StatusCode = httpResp.StatusCode } // Invoke the handler (even if the fetcher errored @@ -268,7 +260,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { } // Write the result in our database. - Must(c.db.PutObjBatch(wb, urlkey, &info)) Must(c.db.Write(wb, nil)) } } |