diff options
author | Alexandre Flament <alex@al-f.net> | 2020-10-26 19:22:19 +0100 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2020-10-28 08:09:25 +0100 |
commit | 382fded6651c16754d91411f806e8fe390f005e5 (patch) | |
tree | c14f75128823a186b13013eb922c81940c0ab0c2 | |
parent | 23f4203dfb72d2315d3fbb33402eb978469b59f4 (diff) | |
download | searxng-382fded6651c16754d91411f806e8fe390f005e5.tar.gz searxng-382fded6651c16754d91411f806e8fe390f005e5.zip |
[mod] result.py: merge infobox URLs and attributes when they share the same label or the same entity
Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42 ).
-rw-r--r-- | searx/results.py | 34 |
1 files changed, 26 insertions, 8 deletions
diff --git a/searx/results.py b/searx/results.py index e4cad2e24..34a94511a 100644 --- a/searx/results.py +++ b/searx/results.py @@ -20,6 +20,18 @@ def result_content_len(content): def compare_urls(url_a, url_b): + """Lazy compare between two URL. + "www.example.com" and "example.com" are equals. + "www.example.com/path/" and "www.example.com/path" are equals. + "https://www.example.com/" and "http://www.example.com/" are equals. + + Args: + url_a (ParseResult): first URL + url_b (ParseResult): second URL + + Returns: + bool: True if url_a and url_b are equals + """ # ignore www. in comparison if url_a.netloc.startswith('www.'): host_a = url_a.netloc.replace('www.', '', 1) @@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2): for url2 in infobox2.get('urls', []): unique_url = True parsed_url2 = urlparse(url2.get('url', '')) + entity_url2 = url2.get('entity') for url1 in urls1: - if compare_urls(urlparse(url1.get('url', '')), parsed_url2): + if (entity_url2 is not None and url1.get('entity') == entity_url2)\ + or compare_urls(urlparse(url1.get('url', '')), parsed_url2): unique_url = False break if unique_url: @@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2): infobox1['img_src'] = img2 if 'attributes' in infobox2: - attributes1 = infobox1.get('attributes', None) + attributes1 = infobox1.get('attributes') if attributes1 is None: - attributes1 = [] - infobox1['attributes'] = attributes1 + infobox1['attributes'] = attributes1 = [] attributeSet = set() - for attribute in infobox1.get('attributes', []): - if attribute.get('label', None) not in attributeSet: - attributeSet.add(attribute.get('label', None)) + for attribute in attributes1: + label = attribute.get('label') + if label not in attributeSet: + attributeSet.add(label) + entity = attribute.get('entity') + if entity not in attributeSet: + attributeSet.add(entity) for attribute in infobox2.get('attributes', []): - if attribute.get('label', None) not in attributeSet: + if 
attribute.get('label') not in attributeSet\ + and attribute.get('entity') not in attributeSet: attributes1.append(attribute) if 'content' in infobox2: |