diff options
author | asciimoo <asciimoo@gmail.com> | 2014-01-10 23:38:08 +0100 |
---|---|---|
committer | asciimoo <asciimoo@gmail.com> | 2014-01-10 23:38:08 +0100 |
commit | 7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da (patch) | |
tree | d7d83df0a8910bea8aae6100749f8009b2c7c740 /searx/utils.py | |
parent | 04c408389d3d1a97a6a4b59502490372d67357cf (diff) | |
download | searxng-7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da.tar.gz searxng-7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da.zip |
[fix] highlighting only html
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 26 |
1 file changed, 26 insertions, 0 deletions
import re


def highlight_content(content, query):
    """Wrap occurrences of *query* inside *content* in ``<b>`` tags.

    Matching is case-insensitive and Unicode-aware.  If the whole query
    occurs in the content, every occurrence of the whole query is
    highlighted.  Otherwise the query is split on whitespace and each
    chunk is highlighted on its own; a one-character chunk is only
    highlighted when it stands alone (no word character directly before
    or after it), so a query such as ``a`` does not light up every ``a``
    inside longer words.

    :param content: text to highlight.
    :param query: search query, either text or UTF-8 encoded bytes.
    :returns: ``None`` when *content* is empty/falsy; *content* unchanged
        when it contains ``<`` (treated as HTML) or when *query* is empty
        or whitespace-only; otherwise *content* with matches wrapped in
        ``<b>...</b>``.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    # Decode only real byte strings: calling .decode() on text fails on
    # Python 3 and triggers an implicit ASCII encode on Python 2.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # An empty pattern matches at every position and would insert
    # "<b></b>" between all characters, so there is nothing to highlight.
    if not query.strip():
        return content

    if query.lower() in content.lower():
        pattern = re.escape(query)
    else:
        parts = []
        for chunk in query.split():
            escaped = re.escape(chunk)
            if len(chunk) == 1:
                # Zero-width lookarounds keep the neighbouring separators
                # out of the <b> tag and also match at the string edges.
                parts.append(r'(?<!\w)' + escaped + r'(?!\w)')
            else:
                parts.append(escaped)
        pattern = '|'.join(parts)

    return re.sub('(' + pattern + ')', r'<b>\1</b>', content,
                  flags=re.I | re.U)