diff options
author | ale <ale@incal.net> | 2018-12-27 23:16:06 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2018-12-27 23:16:06 +0000 |
commit | 2f3ca2f8677651a417d9e6819e8da3ed965a83f0 (patch) | |
tree | d8ce0357da8aec61d37dae873d18c8282d9e4893 | |
parent | 52eba2bb523961e5a46c751f9eb33ac5ec1a87c0 (diff) | |
download | crawl-2f3ca2f8677651a417d9e6819e8da3ed965a83f0.tar.gz crawl-2f3ca2f8677651a417d9e6819e8da3ed965a83f0.zip |
Normalize URLs before checking if they are in scope
-rw-r--r-- | crawler.go | 23 |
1 file changed, 18 insertions, 5 deletions
@@ -150,23 +150,36 @@ type Crawler struct {
 	enqueueMx sync.Mutex
 }
 
+func normalizeURL(u *url.URL) *url.URL {
+	urlStr := purell.NormalizeURL(u,
+		purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|
+			purell.FlagRemoveFragment|purell.FlagSortQuery)
+	u2, err := url.Parse(urlStr)
+	if err != nil {
+		// We *really* do not expect an error here.
+		panic(err)
+	}
+	return u2
+}
+
 // Enqueue a (possibly new) URL for processing.
 func (c *Crawler) Enqueue(link Outlink, depth int) error {
+	// Normalize the URL. We are going to replace link.URL in-place, to
+	// ensure that scope checks are applied to the normalized URL.
+	link.URL = normalizeURL(link.URL)
+
 	// See if it's in scope.
 	if !c.scope.Check(link, depth) {
 		return nil
 	}
 
-	// Normalize the URL.
-	urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagSortQuery)
-
 	// Protect the read-modify-update below with a mutex.
 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
 
 	// Check if we've already seen it.
 	var info URLInfo
-	ukey := []byte(fmt.Sprintf("url/%s", urlStr))
+	ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
 	if err := c.db.GetObj(ukey, &info); err == nil {
 		return nil
 	}
@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
 	// make sure that subsequent calls to Enqueue with the same
 	// URL will fail.
 	wb := new(leveldb.Batch)
-	if err := c.queue.Add(wb, urlStr, depth, time.Now()); err != nil {
+	if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
 		return err
 	}
 	if err := c.db.PutObjBatch(wb, ukey, &info); err != nil {