diff options
author | Alexandre Flament <alex@al-f.net> | 2020-09-19 15:59:03 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-19 15:59:03 +0200 |
commit | 530fc4bda7ef03b0b80e1587699c8ff29fd79aea (patch) | |
tree | a167d983a0006b1dddea5b7e6025d18b772dfc10 /searx/utils.py | |
parent | ae07f4a211ecba0331bcab5903e3263c646f8bdb (diff) | |
parent | 6deb85072ad00b85d2b3c1981c37aeb75ef68cc7 (diff) | |
download | searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.tar.gz searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.zip |
Merge pull request #2190 from dalf/fix-htmltextextractor
[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 11 |
1 file changed, 9 insertions, 2 deletions
diff --git a/searx/utils.py b/searx/utils.py
index d8842c65f..0eb9f6a34 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
     return content
 
 
+class HTMLTextExtractorException(Exception):
+    pass
+
+
 class HTMLTextExtractor(HTMLParser):
 
     def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise Exception("invalid html")
+            raise HTMLTextExtractorException()
 
         self.tags.pop()
 
@@ -128,7 +132,10 @@ def html_to_text(html):
     html = html.replace('\n', ' ')
     html = ' '.join(html.split())
     s = HTMLTextExtractor()
-    s.feed(html)
+    try:
+        s.feed(html)
+    except HTMLTextExtractorException:
+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
     return s.get_text()
 
 