summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--searx/utils.py26
1 files changed, 26 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py
new file mode 100644
index 000000000..b6c448566
--- /dev/null
+++ b/searx/utils.py
@@ -0,0 +1,26 @@
try:  # Python 2 module names
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:  # Python 3 moved both modules under the ``html`` package
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs
+
try:
    _unichr = unichr  # Python 2: chr() only produces byte strings there
except NameError:
    _unichr = chr  # Python 3: unichr() is gone, chr() returns text


class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, dropping the markup.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_text()``. Character data is kept as-is; numeric and named
    character references are replaced by the character they denote.
    References that cannot be resolved are kept as literal text instead of
    raising, so a single bad reference does not abort the extraction.

    Note: on Python 3 the base parser's ``convert_charrefs`` option decodes
    references itself before ``handle_data`` is called; the two reference
    handlers below only run when that option is off (always on Python 2).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.result = []

    def handle_data(self, d):
        """Record a run of character data."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record the character for a numeric reference.

        ``number`` is the reference without its ``&#`` prefix and ``;``
        suffix: decimal digits, or ``x``/``X`` followed by hex digits.
        If it does not parse, or names a code point the interpreter cannot
        represent, the reference is recorded literally as ``&#number;``.
        """
        try:
            # Slice rather than index so an empty string reaches int() and
            # is handled by the ValueError branch below.
            if number[:1] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            text = _unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'&#%s;' % number
        self.result.append(text)

    def handle_entityref(self, name):
        """Record the character for a named reference such as ``amp``.

        Names missing from ``htmlentitydefs.name2codepoint`` are recorded
        literally as ``&name;``.
        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            text = u'&%s;' % name
        else:
            text = _unichr(codepoint)
        self.result.append(text)

    def get_text(self):
        """Return every fragment recorded so far, joined into one string."""
        return u''.join(self.result)
+
def html_to_text(html):
    """Return the text content of the markup string ``html``.

    A fresh ``HTMLTextExtractor`` is fed the whole string in one call and
    the text it collected is returned.
    """
    # NOTE(review): the parser is fed but never close()d, so anything it is
    # still buffering at end of input would not be included - confirm this
    # is intended.
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()