diff options
author | Dalf <alex@al-f.net> | 2014-09-22 22:55:51 +0200 |
---|---|---|
committer | Dalf <alex@al-f.net> | 2014-09-22 22:55:51 +0200 |
commit | 6b058962e1f87a17ce2d9c2bcb4faa73df285df3 (patch) | |
tree | c542042e46c9571ef061e3dbafbb9663d02e5b35 /searx/search.py | |
parent | bd2db71fa6921a757ff5df559535092f45010652 (diff) | |
download | searxng-6b058962e1f87a17ce2d9c2bcb4faa73df285df3.tar.gz searxng-6b058962e1f87a17ce2d9c2bcb4faa73df285df3.zip |
[fix] when two results are merged, really use the content with more text
Diffstat (limited to 'searx/search.py')
-rw-r--r-- | searx/search.py | 13 |
1 file changed, 12 insertions, 1 deletion
diff --git a/searx/search.py b/searx/search.py
index c861a795a..10916cc50 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 '''
 
 import grequests
+import re
 from itertools import izip_longest, chain
 from datetime import datetime
 from operator import itemgetter
@@ -76,6 +77,13 @@ def make_callback(engine_name, results, suggestions, callback, params):
     return process_callback
 
 
+# return the meaningful length of the content for a result
+def content_result_len(result):
+    if isinstance(result.get('content'), basestring):
+        content = re.sub('[,;:!?\./\\\\ ()-_]', '', result.get('content'))
+        return len(content)
+    else:
+        return 0
 
 # score results and remove duplications
 def score_results(results):
@@ -110,6 +118,9 @@ def score_results(results):
         duplicated = False
 
         # check for duplicates
+        if 'content' in res:
+            res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
+
         for new_res in results:
             # remove / from the end of the url if required
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
@@ -126,7 +137,7 @@ def score_results(results):
         # merge duplicates together
         if duplicated:
             # using content with more text
-            if res.get('content') > duplicated.get('content'):
+            if content_result_len(res) > content_result_len(duplicated):
                 duplicated['content'] = res['content']
 
             # increase result-score