summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: asciimoo <asciimoo@gmail.com> 2014-01-10 23:38:08 +0100
committer: asciimoo <asciimoo@gmail.com> 2014-01-10 23:38:08 +0100
commit: 7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da (patch)
tree: d7d83df0a8910bea8aae6100749f8009b2c7c740
parent: 04c408389d3d1a97a6a4b59502490372d67357cf (diff)
download: searxng-7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da.tar.gz
download: searxng-7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da.zip
[fix] highlighting only html
-rw-r--r-- searx/engines/__init__.py 28
-rw-r--r-- searx/utils.py 26
-rw-r--r-- searx/webapp.py 9
3 files changed, 35 insertions, 28 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index e011737cf..60fb5cf55 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -25,7 +25,6 @@ from urlparse import urlparse
from searx import settings
import ConfigParser
import sys
-import re
from datetime import datetime
engine_dir = dirname(realpath(__file__))
@@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params):
results[engine_name] = cb_res
return process_callback
-def highlight_content(content, query):
-
- if not content:
- return None
- # ignoring html contents
- # TODO better html content detection
- if content.find('<') != -1:
- return content
-
- query = query.decode('utf-8')
- if content.lower().find(query.lower()) > -1:
- query_regex = u'({0})'.format(re.escape(query))
- content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
- else:
- regex_parts = []
- for chunk in query.split():
- if len(chunk) == 1:
- regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
- else:
- regex_parts.append(u'{0}'.format(re.escape(chunk)))
- query_regex = u'({0})'.format('|'.join(regex_parts))
- content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
-
- return content
-
def score_results(results):
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)
@@ -218,8 +192,6 @@ def search(query, request, selected_engines):
results = score_results(results)
for result in results:
- if 'content' in result:
- result['content'] = highlight_content(result['content'], query)
for res_engine in result['engines']:
engines[result['engine']].stats['score_count'] += result['score']
diff --git a/searx/utils.py b/searx/utils.py
index 670499805..53300181f 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
import csv
import codecs
import cStringIO
+import re
+
+def highlight_content(content, query):
+
+ if not content:
+ return None
+ # ignoring html contents
+ # TODO better html content detection
+ if content.find('<') != -1:
+ return content
+
+ query = query.decode('utf-8')
+ if content.lower().find(query.lower()) > -1:
+ query_regex = u'({0})'.format(re.escape(query))
+ content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
+ else:
+ regex_parts = []
+ for chunk in query.split():
+ if len(chunk) == 1:
+ regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
+ else:
+ regex_parts.append(u'{0}'.format(re.escape(chunk)))
+ query_regex = u'({0})'.format('|'.join(regex_parts))
+ content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
+
+ return content
class HTMLTextExtractor(HTMLParser):
def __init__(self):
diff --git a/searx/webapp.py b/searx/webapp.py
index 9905bce37..606e109b9 100644
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -29,6 +29,7 @@ import json
import cStringIO
from searx.utils import UnicodeWriter
from flask import send_from_directory
+from searx.utils import highlight_content, html_to_text
@@ -104,6 +105,14 @@ def index():
results, suggestions = search(query, request, selected_engines)
for result in results:
+ if request_data.get('format', 'html') == 'html':
+ if 'content' in result:
+ result['content'] = highlight_content(result['content'], query)
+ result['title'] = highlight_content(result['title'], query)
+ else:
+ if 'content' in result:
+ result['content'] = html_to_text(result['content']).strip()
+ result['title'] = html_to_text(result['title']).strip()
if len(result['url']) > 74:
result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
else: