diff options
author | Dalf <alex@al-f.net> | 2014-09-28 16:51:41 +0200 |
---|---|---|
committer | Dalf <alex@al-f.net> | 2014-09-28 16:51:41 +0200 |
commit | 6bfd5663539052a64c984f5bdb7135d0d652c923 (patch) | |
tree | acf2049046d62c0b4849d0b36815dfe21117bcec /searx/engines/duckduckgo_definitions.py | |
parent | e39d9fe5423a0fceed1d15dc63c1f8aa30d72e44 (diff) | |
download | searxng-6bfd5663539052a64c984f5bdb7135d0d652c923.tar.gz searxng-6bfd5663539052a64c984f5bdb7135d0d652c923.zip |
[enh] add infoboxes and answers
Diffstat (limited to 'searx/engines/duckduckgo_definitions.py')
-rw-r--r-- | searx/engines/duckduckgo_definitions.py | 121 |
1 files changed, 114 insertions, 7 deletions
"""DuckDuckGo definitions engine.

Queries the JSON endpoint at api.duckduckgo.com and converts the reply into
result dictionaries: an optional answer, plain link results, suggestions and
an infobox.
"""

import json

try:
    from urllib import urlencode  # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3

from lxml import html
from searx.engines.xpath import extract_text

url = 'https://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1&d=1'


def result_to_text(url, text, htmlResult):
    """Return the text of the first link found in ``htmlResult``.

    Falls back to ``text`` when ``htmlResult`` is missing, blank, or contains
    no ``<a>`` element.  ``url`` is accepted for interface compatibility and
    is not used.
    """
    # TODO : remove result ending with "Meaning" or "Category"
    if not htmlResult or not htmlResult.strip():
        # lxml refuses to parse None or an empty document.
        return text
    dom = html.fromstring(htmlResult)
    links = dom.xpath('//a')
    if links:
        return extract_text(links[0])
    return text


def html_to_text(htmlFragment):
    """Return the text content of an HTML fragment ('' for blank input)."""
    if not htmlFragment or not htmlFragment.strip():
        # lxml refuses to parse None or an empty document.
        return ''
    dom = html.fromstring(htmlFragment)
    return extract_text(dom)


def request(query, params):
    """Store the API URL for ``query`` in ``params['url']`` and return ``params``."""
    # TODO add kl={locale}
    params['url'] = url.format(query=urlencode({'q': query}))
    return params


def _topic_suggestion(topic):
    """Build the suggestion text for one related-topic entry."""
    return result_to_text(topic.get('FirstURL'),
                          topic.get('Text'),
                          topic.get('Result'))


def response(resp):
    """Convert the JSON reply in ``resp.text`` into a list of result dicts.

    The list may contain, in this order: one ``answer`` entry, one
    ``title``/``url`` entry per item of ``Results``, one ``suggestion`` entry
    per top-level related topic, and finally one ``infobox`` entry when the
    reply carries a non-empty ``Heading``.

    Fields that are absent or JSON ``null`` are treated as empty.
    """
    search_res = json.loads(resp.text)
    results = []

    heading = search_res.get('Heading') or ''
    attributes = []
    urls = []
    infobox_id = None
    related_topics = []

    # add answer if there is one
    answer = search_res.get('Answer')
    if answer:
        results.append({'answer': html_to_text(answer)})

    # infobox content: definition followed by abstract
    content = (search_res.get('Definition') or '') + (search_res.get('Abstract') or '')

    # image
    image = search_res.get('Image') or None

    # attributes: only a dict-shaped Infobox carries a 'content' list
    infobox = search_res.get('Infobox')
    if isinstance(infobox, dict):
        for info in infobox.get('content') or []:
            attributes.append({'label': info.get('label'),
                               'value': info.get('value')})

    # urls
    for ddg_result in search_res.get('Results') or []:
        if 'FirstURL' in ddg_result:
            first_url = ddg_result.get('FirstURL', '')
            text = ddg_result.get('Text', '')
            urls.append({'title': text, 'url': first_url})
            results.append({'title': heading, 'url': first_url})

    # related topics: flat entries become suggestions, grouped entries
    # ('Topics') are collected for the infobox
    for ddg_result in search_res.get('RelatedTopics') or []:
        if 'FirstURL' in ddg_result:
            suggestion = _topic_suggestion(ddg_result)
            if suggestion and suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            related_topics.append({'name': ddg_result.get('Name', ''),
                                   'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics') or []:
                suggestion = _topic_suggestion(topic_result)
                if suggestion and suggestion != heading:
                    suggestions.append(suggestion)

    # abstract
    abstract_url = search_res.get('AbstractURL')
    if abstract_url:
        # add as result ? problem always in english
        infobox_id = abstract_url
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstract_url})

    # definition (takes precedence over the abstract URL as infobox id)
    definition_url = search_res.get('DefinitionURL')
    if definition_url:
        # add as result ? as answer ? problem always in english
        infobox_id = definition_url
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definition_url})

    # entity
    entity = search_res.get('Entity', None)
    # TODO continent / country / department / location / waterfall
    #      / mountain range : link to map search, get weather, nearby locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    #      links to IMDB / rottentomatoes (or scrape result)
    # TODO music : link to musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO company : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO programming language : ??
    # TODO file format : ??

    if heading:
        # TODO get infobox.meta.value where .label='article_title'
        results.append({
            'infobox': heading,
            'id': infobox_id,
            'entity': entity,
            'content': content,
            'img_src': image,
            'attributes': attributes,
            'urls': urls,
            'relatedTopics': related_topics,
        })

    return results