summary | refs | log | tree | commit | diff
path: root/searx/utils.py
diff options
context:
space:
mode:
author: asciimoo <asciimoo@gmail.com> 2013-11-08 23:44:26 +0100
committer: asciimoo <asciimoo@gmail.com> 2013-11-08 23:44:26 +0100
commit: e9467524741b67ef6ceaabf299931ae5bc32e9f6 (patch)
tree: 364a1f26d03d14a836b920d87e6191f7e64884d8 /searx/utils.py
parent: 432ec664a38e7357e432df09924a55d6426a8a55 (diff)
download: searxng-e9467524741b67ef6ceaabf299931ae5bc32e9f6.tar.gz
download: searxng-e9467524741b67ef6ceaabf299931ae5bc32e9f6.zip
[enh] utils.py added
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py
new file mode 100644
index 000000000..b6c448566
--- /dev/null
+++ b/searx/utils.py
@@ -0,0 +1,26 @@
+from HTMLParser import HTMLParser
+import htmlentitydefs
+
class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, dropping the markup.

    Plain text is stored as-is. Numeric character references and named
    entity references are decoded to the character they stand for. A
    reference that cannot be decoded (unknown entity name, code point
    outside the range ``unichr`` accepts) is kept verbatim in the output
    instead of raising and aborting the whole parse.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text fragments in the order they were reported; joined by
        # get_text().
        self.result = []

    def handle_data(self, d):
        """Store a run of plain text reported by the parser."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric character reference and store the character.

        ``number`` is the reference body: decimal digits, or ``x``/``X``
        followed by hexadecimal digits. If it does not name a valid code
        point, the original ``&#...;`` text is stored instead.
        """
        try:
            if number[:1] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: keep it literally
            # rather than losing the rest of the document.
            text = u'&#%s;' % number
        self.result.append(text)

    def handle_entityref(self, name):
        """Decode a named entity reference and store the character.

        ``name`` is looked up in ``htmlentitydefs.name2codepoint``. An
        unknown name is stored as the literal ``&name;`` text.
        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: a plain [] lookup would raise KeyError here.
            text = u'&%s;' % name
        else:
            text = unichr(codepoint)
        self.result.append(text)

    def get_text(self):
        """Return all text collected so far as a single unicode string."""
        return u''.join(self.result)
+
def html_to_text(html):
    """Return the text content of the markup string *html*.

    The string is fed to a fresh HTMLTextExtractor and whatever text
    that extractor gathered is returned as one string.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()