From 6948689d2a2a29f7ffc5ca9d212e76a3e8e43956 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Thu, 17 Oct 2024 04:57:21 +0000 Subject: [enh] use longest title and test get_ordered_results() --- searx/results.py | 11 ++++++--- tests/unit/test_results.py | 61 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/searx/results.py b/searx/results.py index 7c973ca8f..2b677b105 100644 --- a/searx/results.py +++ b/searx/results.py @@ -12,7 +12,6 @@ from searx import logger from searx.engines import engines from searx.metrics import histogram_observe, counter_add, count_error - CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) @@ -133,7 +132,7 @@ def result_score(result, priority): weight = 1.0 for result_engine in result['engines']: - if hasattr(engines[result_engine], 'weight'): + if hasattr(engines.get(result_engine), 'weight'): weight *= float(engines[result_engine].weight) weight *= len(result['positions']) @@ -332,10 +331,14 @@ class ResultContainer: return None def __merge_duplicated_http_result(self, duplicated, result, position): - # using content with more text + # use content with more text if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): duplicated['content'] = result['content'] + # use title with more text + if result_content_len(result.get('title', '')) > len(duplicated.get('title', '')): + duplicated['title'] = result['title'] + # merge all result's parameters not found in duplicate for key in result.keys(): if not duplicated.get(key): @@ -347,7 +350,7 @@ class ResultContainer: # add engine to list of result-engines duplicated['engines'].add(result['engine']) - # using https if possible + # use https if possible if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': duplicated['url'] = result['parsed_url'].geturl() duplicated['parsed_url'] = result['parsed_url'] diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index 72486bbc7..608d3c8c3 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -2,9 +2,26 @@ # pylint: disable=missing-module-docstring from searx.results import ResultContainer +from searx.engines import load_engines from tests import SearxTestCase +def make_test_engine_dict(**kwargs) -> dict: + test_engine = { + # fmt: off + 'name': None, + 'engine': None, + 'categories': 'general', + 'shortcut': 'dummy', + 'timeout': 3.0, + 'tokens': [], + # fmt: on + } + + test_engine.update(**kwargs) + return test_engine + + def fake_result(url='https://aa.bb/cc?dd=ee#ff', title='aaa', content='bbb', engine='wikipedia', **kwargs): result = { # fmt: off @@ -19,23 +36,41 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff', title='aaa', content='bbb', eng class ResultContainerTestCase(SearxTestCase): # pylint: disable=missing-class-docstring + def setUp(self) -> None: + stract_engine = make_test_engine_dict(name="stract", engine="stract", shortcut="stra") + duckduckgo_engine = make_test_engine_dict(name="duckduckgo", engine="duckduckgo", shortcut="ddg") + mojeek_engine = make_test_engine_dict(name="mojeek", engine="mojeek", shortcut="mjk") + + load_engines([stract_engine, duckduckgo_engine, mojeek_engine]) + + self.container = ResultContainer() + + def tearDown(self): + load_engines([]) + def test_empty(self): - c = ResultContainer() - self.assertEqual(c.get_ordered_results(), []) + self.assertEqual(self.container.get_ordered_results(), []) def test_one_result(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result()]) - self.assertEqual(c.results_length(), 1) + self.container.extend('wikipedia', [fake_result()]) + + self.assertEqual(self.container.results_length(), 1) def test_one_suggestion(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result(suggestion=True)]) - self.assertEqual(len(c.suggestions), 1) - self.assertEqual(c.results_length(), 0) + self.container.extend('wikipedia', [fake_result(suggestion=True)]) + + self.assertEqual(len(self.container.suggestions), 1) + self.assertEqual(self.container.results_length(), 0) def test_result_merge(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result()]) - c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) - self.assertEqual(c.results_length(), 2) + self.container.extend('wikipedia', [fake_result()]) + self.container.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) + + self.assertEqual(self.container.results_length(), 2) + + def test_result_merge_by_title(self): + self.container.extend('stract', [fake_result(engine='stract', title='short title')]) + self.container.extend('duckduckgo', [fake_result(engine='duckduckgo', title='normal title')]) + self.container.extend('mojeek', [fake_result(engine='mojeek', title='this long long title')]) + + self.assertEqual(self.container.get_ordered_results()[0].get('title', ''), 'this long long title') -- cgit v1.2.3-54-g00ecf