author     ale <ale@incal.net>  2019-01-19 14:57:40 +0000
committer  ale <ale@incal.net>  2019-01-19 14:57:40 +0000
commit     cce28f44e7ad88900e6c53394a8e496f2955b784
tree       5a8955c9254c095dd37956def96161342fd71a9d
parent     c5ec7eb826bfd08aa6e8dd880efa15930f78ba19
Replace URLInfo with a simple URL presence check
The whole URLInfo structure, while neat, is unused except to verify whether we have already seen a specific URL. The presence check is now also limited to Enqueue().
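
For context, the presence-check idiom that Enqueue() now uses looks roughly like the standalone sketch below, written against goleveldb directly. The crawler actually goes through its own db wrapper, so the OpenFile path and the main() scaffolding here are illustrative only, not the project's API; seenKey matches the helper added by this commit.

package main

import (
	"fmt"
	"net/url"

	"github.com/syndtr/goleveldb/leveldb"
)

// seenKey mirrors the key layout introduced by this commit:
// a "_seen/" prefix followed by the URL string.
func seenKey(u *url.URL) []byte {
	return []byte(fmt.Sprintf("_seen/%s", u.String()))
}

func main() {
	// Open a throwaway database; the real crawler reuses its crawl db.
	db, err := leveldb.OpenFile("/tmp/seen-demo", nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	u, _ := url.Parse("https://example.com/page")

	// Presence check: a successful Get means the URL was seen before.
	if _, err := db.Get(seenKey(u), nil); err == nil {
		fmt.Println("already seen, skipping")
		return
	}

	// Mark the URL as seen with an empty value. Enqueue() puts this
	// marker in the same batch as the queue entry, so the two updates
	// are committed atomically.
	wb := new(leveldb.Batch)
	wb.Put(seenKey(u), []byte{})
	if err := db.Write(wb, nil); err != nil {
		panic(err)
	}
	fmt.Println("marked as seen")
}

Writing the marker and the queue entry through one leveldb.Batch means a crash cannot leave a URL marked as seen without its queue entry, which is why setSeen() takes the batch as an argument instead of writing directly.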
-rw-r--r--  crawler.go  45
1 file changed, 18 insertions, 27 deletions
diff --git a/crawler.go b/crawler.go
index aa1504c..e7bbf3c 100644
--- a/crawler.go
+++ b/crawler.go
@@ -92,14 +92,6 @@ const (
TagRelated
)
-// URLInfo stores information about a crawled URL.
-type URLInfo struct {
- URL string
- StatusCode int
- CrawledAt time.Time
- Error string
-}
-
// A Fetcher retrieves contents from remote URLs.
type Fetcher interface {
// Fetch retrieves a URL and returns the response.
@@ -162,6 +154,19 @@ func normalizeURL(u *url.URL) *url.URL {
return u2
}
+func seenKey(u *url.URL) []byte {
+ return []byte(fmt.Sprintf("_seen/%s", u.String()))
+}
+
+func (c *Crawler) hasSeen(u *url.URL) bool {
+ _, err := c.db.Get(seenKey(u), nil)
+ return err == nil
+}
+
+func (c *Crawler) setSeen(wb *leveldb.Batch, u *url.URL) {
+ wb.Put(seenKey(u), []byte{})
+}
+
// Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
@@ -178,22 +183,18 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
defer c.enqueueMx.Unlock()
// Check if we've already seen it.
- var info URLInfo
- ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
- if err := c.db.GetObj(ukey, &info); err == nil {
+ if c.hasSeen(link.URL) {
return nil
}
- // Store the URL in the queue, and store an empty URLInfo to
- // make sure that subsequent calls to Enqueue with the same
- // URL will fail.
+ // Store the URL in the queue, and mark it as seen, so that
+ // subsequent calls to Enqueue() with the same URL become
+ // no-ops rather than duplicates.
wb := new(leveldb.Batch)
if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
return err
}
- if err := c.db.PutObjBatch(wb, ukey, &info); err != nil {
- return err
- }
+ c.setSeen(wb, link.URL)
return c.db.Write(wb, nil)
}
@@ -230,14 +231,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
return
}
- // Retrieve the URLInfo object from the crawl db.
- // Ignore errors, we can work with an empty object.
- urlkey := []byte(fmt.Sprintf("url/%s", p.URL))
- var info URLInfo
- c.db.GetObj(urlkey, &info) // nolint
- info.CrawledAt = time.Now()
- info.URL = p.URL
-
// Fetch the URL and handle it. Make sure to Close the
// response body (even if it gets replaced in the
// Response object).
@@ -246,7 +239,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
var respBody io.ReadCloser
if httpErr == nil {
respBody = httpResp.Body
- info.StatusCode = httpResp.StatusCode
}
// Invoke the handler (even if the fetcher errored
@@ -268,7 +260,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
}
// Write the result in our database.
- Must(c.db.PutObjBatch(wb, urlkey, &info))
Must(c.db.Write(wb, nil))
}
}