diff options
author | Alexandre Flament <alex@al-f.net> | 2022-04-02 15:21:58 +0200 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2022-04-09 18:01:57 +0200 |
commit | 74c7aee9ec52e6b954e48817501a334f23a40e25 (patch) | |
tree | f1ac4584c317655a3f712a7fdeba08a48766cea0 /searx/engines/jisho.py | |
parent | 19fa0095a0ab12ed1f7a79d91edf862faf6fdfcf (diff) | |
download | searxng-74c7aee9ec52e6b954e48817501a334f23a40e25.tar.gz searxng-74c7aee9ec52e6b954e48817501a334f23a40e25.zip |
jisho : code refactoring
Diffstat (limited to 'searx/engines/jisho.py')
-rw-r--r-- | searx/engines/jisho.py | 143 |
1 files changed, 76 insertions, 67 deletions
diff --git a/searx/engines/jisho.py b/searx/engines/jisho.py index a34d8e421..87bbe983d 100644 --- a/searx/engines/jisho.py +++ b/searx/engines/jisho.py @@ -17,7 +17,6 @@ about = { } categories = ['dictionaries'] -engine_type = 'online_dictionary' paging = False URL = 'https://jisho.org' @@ -34,19 +33,19 @@ def request(query, params): def response(resp): results = [] - infoboxed = False + first_result = True search_results = resp.json() - pages = search_results.get('data', []) - for page in pages: + for page in search_results.get('data', []): # Entries that are purely from Wikipedia are excluded. - if page['senses'][0]['parts_of_speech'] != [] and page['senses'][0]['parts_of_speech'][0] == 'Wikipedia definition': + parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech') + if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition': pass + # Process alternative forms - japanese = page['japanese'] alt_forms = [] - for title_raw in japanese: + for title_raw in page['japanese']: if 'word' not in title_raw: alt_forms.append(title_raw['reading']) else: @@ -54,74 +53,84 @@ def response(resp): if 'reading' in title_raw: title += ' (' + title_raw['reading'] + ')' alt_forms.append(title) - # Process definitions - definitions = [] - def_raw = page['senses'] - for defn_raw in def_raw: - extra = '' - if not infoboxed: - # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. - if defn_raw['tags'] != []: - if defn_raw['info'] != []: - extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: <kana>" - else: - extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc. - elif defn_raw['info'] != []: - extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent - if defn_raw['restrictions'] != []: - extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ' - extra = extra[:-1] - definitions.append(( - ', '.join(defn_raw['parts_of_speech']), - '; '.join(defn_raw['english_definitions']), - extra - )) - content = '' - infobox_content = ''' - <small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a> - and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a> - by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small><ul> - ''' - for pos, engdef, extra in definitions: - if pos == 'Wikipedia definition': - infobox_content += '</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>' - if pos == '': - infobox_content += f"<li>{engdef}" - else: - infobox_content += f"<li><i>{pos}</i>: {engdef}" - if extra != '': - infobox_content += f" ({extra})" - infobox_content += '</li>' - content += f"{engdef}. " - infobox_content += '</ul>' + # + result_url = urljoin(BASE_URL, page['slug']) + definitions = get_definitions(page) + # For results, we'll return the URL, all alternative forms (as title), # and all definitions (as description) truncated to 300 characters. + content = " ".join(f"{engdef}." for _, engdef, _ in definitions) results.append({ - 'url': urljoin(BASE_URL, page['slug']), + 'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...') }) # Like Wordnik, we'll return the first result in an infobox too. - if not infoboxed: - infoboxed = True - infobox_urls = [] - infobox_urls.append({ - 'title': 'Jisho.org', - 'url': urljoin(BASE_URL, page['slug']) - }) - infobox = { - 'infobox': alt_forms[0], - 'urls': infobox_urls - } - alt_forms.pop(0) - alt_content = '' - if len(alt_forms) > 0: - alt_content = '<p><i>Other forms:</i> ' - alt_content += ", ".join(alt_forms) - alt_content += '</p>' - infobox['content'] = alt_content + infobox_content - results.append(infobox) + if first_result: + first_result = False + results.append(get_infobox(alt_forms, result_url, definitions)) return results + + +def get_definitions(page): + # Process definitions + definitions = [] + for defn_raw in page['senses']: + extra = [] + # Extra data. Since they're not documented, this implementation is based solely by the author's assumptions. + if defn_raw.get('tags'): + if defn_raw.get('info'): + # "usually written as kana: <kana>" + extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ') + else: + # abbreviation, archaism, etc. + extra.append(', '.join(defn_raw['tags']) + '. ') + elif defn_raw.get('info'): + # inconsistent + extra.append(', '.join(defn_raw['info']).capitalize() + '. ') + if defn_raw.get('restrictions'): + extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ') + definitions.append(( + ', '.join(defn_raw['parts_of_speech']), + '; '.join(defn_raw['english_definitions']), + ''.join(extra)[:-1], + )) + return definitions + + +def get_infobox(alt_forms, result_url, definitions): + infobox_content = [] + # title & alt_forms + infobox_title = alt_forms[0] + if len(alt_forms) > 1: + infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>') + + # definitions + infobox_content.append(''' + <small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a> + and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a> + by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small> + <ul> + ''') + for pos, engdef, extra in definitions: + if pos == 'Wikipedia definition': + infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>') + pos = f'<i>{pos}</i>: ' if pos else '' + extra = f' ({extra})' if extra else '' + infobox_content.append(f'<li>{pos}{engdef}{extra}</li>') + infobox_content.append('</ul>') + + # + return { + 'infobox': infobox_title, + 'content': ''.join(infobox_content), + 'urls': [ + { + 'title': 'Jisho.org', + 'url': result_url, + } + ] + } |