diff options
author | Adam Tauber <asciimoo@gmail.com> | 2015-01-01 14:13:56 +0100 |
---|---|---|
committer | Adam Tauber <asciimoo@gmail.com> | 2015-01-01 14:13:56 +0100 |
commit | 1408859b4b0ca9efc590ca0e112c6bc0cb984e2c (patch) | |
tree | 74a997b29eccf6c815ac8d107bd639f9b789eee8 /searx/utils.py | |
parent | 469e08881ee17d8a180d0c0741c1552a29108f0e (diff) | |
download | searxng-1408859b4b0ca9efc590ca0e112c6bc0cb984e2c.tar.gz searxng-1408859b4b0ca9efc590ca0e112c6bc0cb984e2c.zip |
[fix] ignore scripts/styles in html_to_text
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 23 |
1 files changed, 23 insertions, 0 deletions
```diff
diff --git a/searx/utils.py b/searx/utils.py
index b725a8b95..8a3f35531 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
 
 ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
 
+blocked_tags = ('script',
+                'style')
+
 
 def gen_useragent():
     # TODO
@@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
     def __init__(self):
         HTMLParser.__init__(self)
         self.result = []
+        self.tags = []
+
+    def handle_starttag(self, tag, attrs):
+        print tag
+        self.tags.append(tag)
+
+    def handle_endtag(self, tag):
+        print tag,tag
+        if tag != self.tags[-1]:
+            raise Exception("invalid html")
+        self.tags.pop()
+
+    def is_valid_tag(self):
+        return not self.tags or self.tags[-1] not in blocked_tags
 
     def handle_data(self, d):
+        if not self.is_valid_tag():
+            return
         self.result.append(d)
 
     def handle_charref(self, number):
+        if not self.is_valid_tag():
+            return
         if number[0] in (u'x', u'X'):
             codepoint = int(number[1:], 16)
         else:
@@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
         self.result.append(unichr(codepoint))
 
     def handle_entityref(self, name):
+        if not self.is_valid_tag():
+            return
         # codepoint = htmlentitydefs.name2codepoint[name]
         # self.result.append(unichr(codepoint))
         self.result.append(name)
```