diff options
author | Marc Abonce Seguin <marc-abonce@mailbox.org> | 2020-09-14 00:07:45 -0700 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2020-09-22 11:37:44 +0200 |
commit | c86504b47a321378a50f06fffc31d916036f3b08 (patch) | |
tree | 0dd011e737a046a1c2abefc85d10b2d67cdcc040 /utils | |
parent | 41800835f9ea82f4b5342525df1c8174eddb9efc (diff) | |
download | searxng-c86504b47a321378a50f06fffc31d916036f3b08.tar.gz searxng-c86504b47a321378a50f06fffc31d916036f3b08.zip |
fix utils/fetch_languages to work with new languages
Diffstat (limited to 'utils')
-rw-r--r-- | utils/fetch_languages.py | 210 |
1 files changed, 110 insertions, 100 deletions
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py index 0fb9d9c18..1f8dc7786 100644 --- a/utils/fetch_languages.py +++ b/utils/fetch_languages.py @@ -6,7 +6,7 @@ # are written in current directory to avoid overwriting in case something goes wrong. import json -import io +from pprint import pformat from sys import path from babel import Locale, UnknownLocaleError from babel.languages import get_global @@ -23,7 +23,7 @@ languages_file = 'languages.py' # Fetchs supported languages for each engine and writes json file with those. def fetch_supported_languages(): - engines_languages = {} + engines_languages = dict() names = list(engines) names.sort() @@ -51,19 +51,9 @@ def get_locale(lang_code): return None -# Append engine_name to list of engines that support locale. -def add_engine_counter(lang_code, engine_name, languages): - if lang_code in languages: - if 'counter' not in languages[lang_code]: - languages[lang_code]['counter'] = [engine_name] - elif engine_name not in languages[lang_code]['counter']: - languages[lang_code]['counter'].append(engine_name) - - # Join all language lists. -# TODO: Add language names from engine's language list if name not known by babel. def join_language_lists(engines_languages): - language_list = {} + language_list = dict() for engine_name in engines_languages: for lang_code in engines_languages[engine_name]: @@ -76,32 +66,51 @@ def join_language_lists(engines_languages): # ensure that lang_code uses standard language and country codes if locale and locale.territory: - lang_code = locale.language + '-' + locale.territory + lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) + short_code = lang_code.split('-')[0] - # add locale if it's not in list - if lang_code not in language_list: + # add language without country if not in list + if short_code not in language_list: if locale: - language_list[lang_code] = {'name': locale.get_language_name().title(), - 'english_name': locale.english_name, - 'country': locale.get_territory_name() or ''} - - # also add language without country - if locale.language not in language_list: - language_list[locale.language] = {'name': locale.get_language_name().title(), - 'english_name': locale.english_name} + # get language's data from babel's Locale object + language_name = locale.get_language_name().title() + english_name = locale.english_name.split(' (')[0] + elif short_code in engines_languages['wikipedia']: + # get language's data from wikipedia if not known by babel + language_name = engines_languages['wikipedia'][short_code]['name'] + english_name = engines_languages['wikipedia'][short_code]['english_name'] else: - language_list[lang_code] = {} + language_name = None + english_name = None + + # add language to list + language_list[short_code] = {'name': language_name, + 'english_name': english_name, + 'counter': set(), + 'countries': dict()} + + # add language with country if not in list + if lang_code != short_code and lang_code not in language_list[short_code]['countries']: + country_name = '' + if locale: + # get country name from babel's Locale object + country_name = locale.get_territory_name() + + language_list[short_code]['countries'][lang_code] = {'country_name': country_name, + 'counter': set()} # count engine for both language_country combination and language alone - add_engine_counter(lang_code, engine_name, language_list) - add_engine_counter(lang_code.split('-')[0], engine_name, language_list) + language_list[short_code]['counter'].add(engine_name) + if lang_code != short_code: + language_list[short_code]['countries'][lang_code]['counter'].add(engine_name) return language_list -# Filter language list so it only includes the most supported languages and countries. +# Filter language list so it only includes the most supported languages and countries def filter_language_list(all_languages): - min_supported_engines = 10 + min_engines_per_lang = 15 + min_engines_per_country = 10 main_engines = [engine_name for engine_name in engines.keys() if 'general' in engines[engine_name].categories and engines[engine_name].supported_languages and @@ -110,82 +119,84 @@ def filter_language_list(all_languages): # filter list to include only languages supported by most engines or all default general engines filtered_languages = {code: lang for code, lang in all_languages.items() - if (len(lang.get('counter', [])) >= min_supported_engines or - all(main_engine in lang.get('counter', []) + if (len(lang['counter']) >= min_engines_per_lang or + all(main_engine in lang['counter'] for main_engine in main_engines))} - return filtered_languages - - -# Add country codes to languages without one and filter out language codes. -def assign_country_codes(filtered_languages, all_languages): - sorted_languages = sorted(all_languages, - key=lambda lang: len(all_languages[lang].get('counter', [])), - reverse=True) - previous_lang = None - previous_code = None - countries = 0 - for current_code in sorted(filtered_languages): - current_lang = current_code.split('-')[0] - - # count country codes per language - if current_lang == previous_lang: - countries += 1 - - else: - if previous_lang is not None: - # if language has no single country code - if countries == 0: - # try to get country code with most supported engines - for l in sorted_languages: - l_parts = l.split('-') - if len(l_parts) == 2 and l_parts[0] == previous_lang: - filtered_languages[l] = all_languages[l] - filtered_languages[l]['country'] = '' - countries = 1 - break - - if countries == 0: - # get most likely country code from babel - subtags = get_global('likely_subtags').get(previous_lang) - if subtags: - subtag_parts = subtags.split('_') - new_code = subtag_parts[0] + '-' + subtag_parts[-1] - filtered_languages[new_code] = all_languages[previous_lang] - countries = 1 - - if countries == 1: - # remove countryless version of language if there's only one country - del filtered_languages[previous_lang] - if previous_code in filtered_languages: - filtered_languages[previous_code]['country'] = '' - - countries = 0 - previous_lang = current_lang - - previous_code = current_code + def _copy_lang_data(lang, country_name=None): + new_dict = dict() + new_dict['name'] = all_languages[lang]['name'] + new_dict['english_name'] = all_languages[lang]['english_name'] + if country_name: + new_dict['country_name'] = country_name + return new_dict + + def _country_count(i): + return len(countries[sorted_countries[i]]['counter']) + + # for each language get country codes supported by most engines or at least one country code + filtered_languages_with_countries = dict() + for lang, lang_data in filtered_languages.items(): + countries = lang_data['countries'] + filtered_countries = dict() + + # get language's country codes with enough supported engines + for lang_country, country_data in countries.items(): + if len(country_data['counter']) >= min_engines_per_country: + filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) + + # add language without countries too if there's more than one country to choose from + if len(filtered_countries) > 1: + filtered_countries[lang] = _copy_lang_data(lang) + elif len(filtered_countries) == 1: + # if there's only one country per language, it's not necessary to show country name + lang_country = next(iter(filtered_countries)) + filtered_countries[lang_country]['country_name'] = None + + # if no country has enough engines try to get most likely country code from babel + if not filtered_countries: + lang_country = None + subtags = get_global('likely_subtags').get(lang) + if subtags: + country_code = subtags.split('_')[-1] + if len(country_code) == 2: + lang_country = "{lang}-{country}".format(lang=lang, country=country_code) + + if lang_country: + filtered_countries[lang_country] = _copy_lang_data(lang) + else: + filtered_countries[lang] = _copy_lang_data(lang) + + filtered_languages_with_countries.update(filtered_countries) + + return filtered_languages_with_countries # Write languages.py. def write_languages_file(languages): - new_file = open(languages_file, 'wb') - file_content = '# -*- coding: utf-8 -*-\n'\ - + '# list of language codes\n'\ - + '# this file is generated automatically by utils/update_search_languages.py\n'\ - + '\nlanguage_codes = (' - for code in sorted(languages): - if 'name' in languages[code]: - file_content += '\n ("' + code + '"'\ - + ', "' + languages[code]['name'].split(' (')[0] + '"'\ - + ', "' + languages[code].get('country', '') + '"'\ - + ', "' + languages[code].get('english_name', '').split(' (')[0] + '"),' - else: - print('ignore ',languages[code]) - # remove last comma - file_content = file_content[:-1] - file_content += '\n)\n' - new_file.write(file_content.encode()) - new_file.close() + file_headers = ( + "# -*- coding: utf-8 -*-", + "# list of language codes", + "# this file is generated automatically by utils/fetch_languages.py", + "language_codes =" + ) + + language_codes = tuple([ + ( + code, + languages[code]['name'].split(' (')[0], + languages[code].get('country_name') or '', + languages[code].get('english_name') or '' + ) for code in sorted(languages) + ]) + + with open(languages_file, 'w') as new_file: + file_content = "{file_headers} \\\n{language_codes}".format( + file_headers='\n'.join(file_headers), + language_codes=pformat(language_codes, indent=4) + ) + new_file.write(file_content) + new_file.close() if __name__ == "__main__": @@ -193,5 +204,4 @@ if __name__ == "__main__": engines_languages = fetch_supported_languages() all_languages = join_language_lists(engines_languages) filtered_languages = filter_language_list(all_languages) - assign_country_codes(filtered_languages, all_languages) write_languages_file(filtered_languages) |