about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ale <ale@incal.net> 2021-06-19 16:47:14 +0100
committer ale <ale@incal.net> 2021-06-19 16:47:14 +0100
commit c7ab870184449929fc69f9a5da36de38cae1b5ba (patch)
tree 801ceb6e39df656a1de7a50ba24e58e732d7213a
parent 833da3f33a5c727a8d7ccd034c43497c66793026 (diff)
download crawl-c7ab870184449929fc69f9a5da36de38cae1b5ba.tar.gz
crawl-c7ab870184449929fc69f9a5da36de38cae1b5ba.zip
Ignore URL decode errors
This is an internal inconsistency that should be investigated.
-rw-r--r-- crawler.go 11
-rw-r--r-- go.mod 2
2 files changed, 11 insertions, 2 deletions
diff --git a/crawler.go b/crawler.go
index 49d124b..b2ad3d9 100644
--- a/crawler.go
+++ b/crawler.go
@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
purell.FlagRemoveFragment|purell.FlagSortQuery)
u2, err := url.Parse(urlStr)
if err != nil {
- // We *really* do not expect an error here.
- panic(err)
+ // Ignore errors here.
+ return nil
}
return u2
}
@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
// ensure that scope checks are applied to the normalized URL.
link.URL = normalizeURL(link.URL)
+ if link.URL == nil {
+ // We couldn't parse a URL that we have extracted
+ // ourselves from the documents. This is an internal
+ // inconsistency, but by ignoring the error we avoid
+ // failing the entire crawl.
+ return nil
+ }
// See if it's in scope.
if !c.scope.Check(link, depth) {
diff --git a/go.mod b/go.mod
index 488dbdb..5ca4ba4 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,7 @@
module git.autistici.org/ale/crawl
+go 1.15
+
require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597