summary | refs | log | tree | commit | diff
path: root/searx/utils.py
diff options
context:
space:
mode:
author: Adam Tauber <asciimoo@gmail.com> 2015-01-01 14:13:56 +0100
committer: Adam Tauber <asciimoo@gmail.com> 2015-01-01 14:13:56 +0100
commit: 1408859b4b0ca9efc590ca0e112c6bc0cb984e2c (patch)
tree: 74a997b29eccf6c815ac8d107bd639f9b789eee8 /searx/utils.py
parent: 469e08881ee17d8a180d0c0741c1552a29108f0e (diff)
download: searxng-1408859b4b0ca9efc590ca0e112c6bc0cb984e2c.tar.gz
searxng-1408859b4b0ca9efc590ca0e112c6bc0cb984e2c.zip
[fix] ignore scripts/styles in html_to_text
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 23
1 files changed, 23 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py
index b725a8b95..8a3f35531 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
# Tags whose enclosed text is skipped by HTMLTextExtractor (see is_valid_tag).
blocked_tags = ('script', 'style')
+
def gen_useragent():
# TODO
@@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
def __init__(self):
    """Set up the parser with an empty text buffer and an empty tag stack."""
    HTMLParser.__init__(self)
    # result: collected text fragments; tags: stack of currently open tags.
    self.result, self.tags = [], []
+
def handle_starttag(self, tag, attrs):
    """Push an opening tag onto the stack of currently open tags.

    tag   -- name of the tag that was opened
    attrs -- attributes of the tag (unused)
    """
    # The leftover debug `print tag` was removed: it wrote every tag name
    # to stdout while parsing.
    self.tags.append(tag)
+
def handle_endtag(self, tag):
    """Pop the matching opening tag from the stack of open tags.

    tag -- name of the tag that was closed

    A closing tag seen while no tag is open is ignored. Raises
    Exception("invalid html") when the closing tag does not match the
    innermost open (non-void) tag.
    """
    # Void elements never get a closing tag, so an unclosed one on top of
    # the stack must not be mistaken for a nesting error.
    void_tags = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img',
                 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr')
    while (self.tags
           and self.tags[-1] != tag
           and self.tags[-1] in void_tags):
        self.tags.pop()

    # Stray closing tag with nothing open: nothing to pop (previously this
    # crashed with IndexError on self.tags[-1]).
    if not self.tags:
        return

    if tag != self.tags[-1]:
        raise Exception("invalid html")
    self.tags.pop()
+
def is_valid_tag(self):
    """Tell whether text at the current position should be collected.

    Returns True when no tag is open, or when the innermost open tag is
    not listed in blocked_tags.
    """
    if not self.tags:
        return True
    innermost = self.tags[-1]
    return innermost not in blocked_tags
def handle_data(self, d):
    """Collect the text chunk d unless is_valid_tag() says to skip it."""
    if self.is_valid_tag():
        self.result.append(d)
def handle_charref(self, number):
+ if not self.is_valid_tag():
+ return
if number[0] in (u'x', u'X'):
codepoint = int(number[1:], 16)
else:
@@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
self.result.append(unichr(codepoint))
def handle_entityref(self, name):
+ if not self.is_valid_tag():
+ return
# codepoint = htmlentitydefs.name2codepoint[name]
# self.result.append(unichr(codepoint))
self.result.append(name)