aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: ale <ale@incal.net>, 2018-12-27 23:16:06 +0000
committer: ale <ale@incal.net>, 2018-12-27 23:16:06 +0000
commit: 2f3ca2f8677651a417d9e6819e8da3ed965a83f0 (patch)
tree: d8ce0357da8aec61d37dae873d18c8282d9e4893
parent: 52eba2bb523961e5a46c751f9eb33ac5ec1a87c0 (diff)
download: crawl-2f3ca2f8677651a417d9e6819e8da3ed965a83f0.tar.gz
crawl-2f3ca2f8677651a417d9e6819e8da3ed965a83f0.zip
Normalize URLs before checking if they are in scope
-rw-r--r-- crawler.go 23
1 files changed, 18 insertions, 5 deletions
diff --git a/crawler.go b/crawler.go
index d5c2e88..aa1504c 100644
--- a/crawler.go
+++ b/crawler.go
@@ -150,23 +150,36 @@ type Crawler struct {
enqueueMx sync.Mutex
}
+// normalizeURL returns the normalized form of u as a freshly parsed URL.
+//
+// Normalization is delegated to purell.NormalizeURL, called with the safe
+// flag set plus dot-segment removal, duplicate-slash removal, fragment
+// removal and query sorting. The string it returns is parsed again with
+// url.Parse and the result of that parse is what the caller receives.
+//
+// It panics if the normalized string cannot be parsed.
+//
+// NOTE(review): confirm whether purell.NormalizeURL also modifies u in
+// place; until then, do not assume u is left untouched by this call.
+func normalizeURL(u *url.URL) *url.URL {
+ urlStr := purell.NormalizeURL(u,
+ purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|
+ purell.FlagRemoveFragment|purell.FlagSortQuery)
+ u2, err := url.Parse(urlStr)
+ if err != nil {
+ // urlStr was produced by purell from an already-parsed URL, so
+ // we *really* do not expect an error here.
+ panic(err)
+ }
+ return u2
+}
+
// Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(link Outlink, depth int) error {
+ // Normalize the URL. We are going to replace link.URL in-place, to
+ // ensure that scope checks are applied to the normalized URL.
+ link.URL = normalizeURL(link.URL)
+
// See if it's in scope.
if !c.scope.Check(link, depth) {
return nil
}
- // Normalize the URL.
- urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagSortQuery)
-
// Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock()
defer c.enqueueMx.Unlock()
// Check if we've already seen it.
var info URLInfo
- ukey := []byte(fmt.Sprintf("url/%s", urlStr))
+ ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
if err := c.db.GetObj(ukey, &info); err == nil {
return nil
}
@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// make sure that subsequent calls to Enqueue with the same
// URL will fail.
wb := new(leveldb.Batch)
- if err := c.queue.Add(wb, urlStr, depth, time.Now()); err != nil {
+ if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
return err
}
if err := c.db.PutObjBatch(wb, ukey, &info); err != nil {