diff options
author | ale <ale@incal.net> | 2021-06-19 16:47:14 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2021-06-19 16:47:14 +0100 |
commit | c7ab870184449929fc69f9a5da36de38cae1b5ba (patch) | |
tree | 801ceb6e39df656a1de7a50ba24e58e732d7213a | |
parent | 833da3f33a5c727a8d7ccd034c43497c66793026 (diff) | |
download | crawl-c7ab870184449929fc69f9a5da36de38cae1b5ba.tar.gz crawl-c7ab870184449929fc69f9a5da36de38cae1b5ba.zip |
Ignore URL decode errors
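A URL that fails to parse again after normalization is an internal inconsistency that should be investigated; until then, such URLs are skipped instead of panicking, so that one bad link does not fail the entire crawl.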
-rw-r--r-- | crawler.go | 11 | ||||
-rw-r--r-- | go.mod | 2 |
2 files changed, 11 insertions, 2 deletions
--- a/crawler.go
+++ b/crawler.go
@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
 		purell.FlagRemoveFragment|purell.FlagSortQuery)
 	u2, err := url.Parse(urlStr)
 	if err != nil {
-		// We *really* do not expect an error here.
-		panic(err)
+		// Ignore errors here.
+		return nil
 	}
 	return u2
 }
@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
 	// Normalize the URL. We are going to replace link.URL in-place, to
 	// ensure that scope checks are applied to the normalized URL.
 	link.URL = normalizeURL(link.URL)
+	if link.URL == nil {
+		// We couldn't parse a URL that we have extracted
+		// ourselves from the documents. This is an internal
+		// inconsistency, but by ignoring the error we avoid
+		// failing the entire crawl.
+		return nil
+	}
 
 	// See if it's in scope.
 	if !c.scope.Check(link, depth) {
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,7 @@
 module git.autistici.org/ale/crawl
 
+go 1.15
+
 require (
 	github.com/PuerkitoBio/goquery v1.5.0
 	github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597