From c7ab870184449929fc69f9a5da36de38cae1b5ba Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 19 Jun 2021 16:47:14 +0100
Subject: Ignore URL decode errors

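A failure to re-parse a normalized URL is an internal inconsistency that should be investigated; until then, skip the affected link instead of panicking, so that a single bad URL does not fail the entire crawl.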
---
 crawler.go | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'crawler.go')

diff --git a/crawler.go b/crawler.go
index 49d124b..b2ad3d9 100644
--- a/crawler.go
+++ b/crawler.go
@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
 			purell.FlagRemoveFragment|purell.FlagSortQuery)
 	u2, err := url.Parse(urlStr)
 	if err != nil {
-		// We *really* do not expect an error here.
-		panic(err)
+		// Ignore errors here.
+		return nil
 	}
 	return u2
 }
@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
 	// Normalize the URL. We are going to replace link.URL in-place, to
 	// ensure that scope checks are applied to the normalized URL.
 	link.URL = normalizeURL(link.URL)
+	if link.URL == nil {
+		// We couldn't parse a URL that we have extracted
+		// ourselves from the documents. This is an internal
+		// inconsistency, but by ignoring the error we avoid
+		// failing the entire crawl.
+		return nil
+	}
 
 	// See if it's in scope.
 	if !c.scope.Check(link, depth) {
-- 
cgit v1.2.3-54-g00ecf