From c7ab870184449929fc69f9a5da36de38cae1b5ba Mon Sep 17 00:00:00 2001
From: ale
Date: Sat, 19 Jun 2021 16:47:14 +0100
Subject: Ignore URL decode errors

This is an internal inconsistency that should be investigated.
---
 crawler.go | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'crawler.go')

diff --git a/crawler.go b/crawler.go
index 49d124b..b2ad3d9 100644
--- a/crawler.go
+++ b/crawler.go
@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
 		purell.FlagRemoveFragment|purell.FlagSortQuery)
 	u2, err := url.Parse(urlStr)
 	if err != nil {
-		// We *really* do not expect an error here.
-		panic(err)
+		// Ignore errors here.
+		return nil
 	}
 	return u2
 }
@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
 	// Normalize the URL. We are going to replace link.URL in-place, to
 	// ensure that scope checks are applied to the normalized URL.
 	link.URL = normalizeURL(link.URL)
+	if link.URL == nil {
+		// We couldn't parse a URL that we have extracted
+		// ourselves from the documents. This is an internal
+		// inconsistency, but by ignoring the error we avoid
+		// failing the entire crawl.
+		return nil
+	}
 
 	// See if it's in scope.
 	if !c.scope.Check(link, depth) {
-- 
cgit v1.2.3-54-g00ecf