summaryrefslogtreecommitdiff
path: root/searx/results.py
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2020-10-26 19:22:19 +0100
committerAlexandre Flament <alex@al-f.net>2020-10-28 08:09:25 +0100
commit382fded6651c16754d91411f806e8fe390f005e5 (patch)
treec14f75128823a186b13013eb922c81940c0ab0c2 /searx/results.py
parent23f4203dfb72d2315d3fbb33402eb978469b59f4 (diff)
downloadsearxng-382fded6651c16754d91411f806e8fe390f005e5.tar.gz
searxng-382fded6651c16754d91411f806e8fe390f005e5.zip
[mod] results.py: merge infobox URLs and attributes when they have the same label or the same entity
Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42 )
Diffstat (limited to 'searx/results.py')
-rw-r--r--searx/results.py34
1 files changed, 26 insertions, 8 deletions
diff --git a/searx/results.py b/searx/results.py
index e4cad2e24..34a94511a 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -20,6 +20,18 @@ def result_content_len(content):
def compare_urls(url_a, url_b):
+ """Lazy compare between two URL.
+ "www.example.com" and "example.com" are equals.
+ "www.example.com/path/" and "www.example.com/path" are equals.
+ "https://www.example.com/" and "http://www.example.com/" are equals.
+
+ Args:
+ url_a (ParseResult): first URL
+ url_b (ParseResult): second URL
+
+ Returns:
+ bool: True if url_a and url_b are equals
+ """
# ignore www. in comparison
if url_a.netloc.startswith('www.'):
host_a = url_a.netloc.replace('www.', '', 1)
@@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []):
unique_url = True
parsed_url2 = urlparse(url2.get('url', ''))
+ entity_url2 = url2.get('entity')
for url1 in urls1:
- if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
+ if (entity_url2 is not None and url1.get('entity') == entity_url2)\
+ or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False
break
if unique_url:
@@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['img_src'] = img2
if 'attributes' in infobox2:
- attributes1 = infobox1.get('attributes', None)
+ attributes1 = infobox1.get('attributes')
if attributes1 is None:
- attributes1 = []
- infobox1['attributes'] = attributes1
+ infobox1['attributes'] = attributes1 = []
attributeSet = set()
- for attribute in infobox1.get('attributes', []):
- if attribute.get('label', None) not in attributeSet:
- attributeSet.add(attribute.get('label', None))
+ for attribute in attributes1:
+ label = attribute.get('label')
+ if label not in attributeSet:
+ attributeSet.add(label)
+ entity = attribute.get('entity')
+ if entity not in attributeSet:
+ attributeSet.add(entity)
for attribute in infobox2.get('attributes', []):
- if attribute.get('label', None) not in attributeSet:
+ if attribute.get('label') not in attributeSet\
+ and attribute.get('entity') not in attributeSet:
attributes1.append(attribute)
if 'content' in infobox2: