summary | refs | log | tree | commit | diff
path: root/utils/fetch_languages.py
diff options
context:
space:
mode:
author: marc <a01200356@itesm.mx> 2016-12-16 22:14:14 -0600
committer: marc <a01200356@itesm.mx> 2016-12-16 22:14:14 -0600
commit: 4a1ff56389d6ad560594ba82b448aef1d70bbbf4 (patch)
tree: d55a24910d102b682ec570e561a3613584655818 /utils/fetch_languages.py
parent: af35eee10b98940c51c6e5e18629de514b4bd48d (diff)
download: searxng-4a1ff56389d6ad560594ba82b448aef1d70bbbf4.tar.gz
searxng-4a1ff56389d6ad560594ba82b448aef1d70bbbf4.zip
minor fixes in utils/fetch_languages.py
Diffstat (limited to 'utils/fetch_languages.py')
-rw-r--r-- utils/fetch_languages.py 49
1 files changed, 26 insertions, 23 deletions
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py
index aed875daf..3510a3234 100644
--- a/utils/fetch_languages.py
+++ b/utils/fetch_languages.py
@@ -32,23 +32,26 @@ languages = {}
def valid_code(lang_code):
# filter invalid codes
# sl-SL is technically not invalid, but still a mistake
+ invalid_codes = ['sl-SL', 'wt-WT', 'jw']
+ invalid_countries = ['UK', 'XA', 'XL']
if lang_code[:2] == 'xx'\
- or lang_code == 'sl-SL'\
- or lang_code == 'wt-WT'\
- or lang_code == 'jw'\
- or lang_code[-2:] == 'UK'\
- or lang_code[-2:] == 'XA'\
- or lang_code[-2:] == 'XL':
+ or lang_code in invalid_codes\
+ or lang_code[-2:] in invalid_countries\
+ or is_dialect(lang_code):
return False
- # filter dialects
+ return True
+
+
+# Language codes with any additional tags other than language and country.
+def is_dialect(lang_code):
lang_code = lang_code.split('-')
if len(lang_code) > 2 or len(lang_code[0]) > 3:
- return False
+ return True
if len(lang_code) == 2 and len(lang_code[1]) > 2:
- return False
+ return True
- return True
+ return False
# Get country name in specified language.
@@ -83,19 +86,17 @@ def fetch_supported_languages():
print e
# write json file
- f = io.open(engines_languages_file, "w", encoding="utf-8")
- f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
- f.close()
+ with io.open(engines_languages_file, "w", encoding="utf-8") as f:
+ f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
# include wikipedia first for more accurate language names
- # exclude languages with too few articles
languages.update({code: lang for code, lang
in engines_languages['wikipedia'].iteritems()
- if valid_code(code) and lang['articles'] >= 100000})
+ if valid_code(code)})
for engine_name in engines_languages:
for locale in engines_languages[engine_name]:
@@ -104,25 +105,27 @@ def join_language_lists():
# if language is not on list or if it has no name yet
if locale not in languages or not languages[locale].get('name'):
- if isinstance(engines_languages[engine_name], dict) \
- and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
+ if isinstance(engines_languages[engine_name], dict):
languages[locale] = engines_languages[engine_name][locale]
else:
languages[locale] = {}
# get locales that have no name or country yet
for locale in languages.keys():
+ # try to get language names
if not languages[locale].get('name'):
- # try to get language names
name = languages.get(locale.split('-')[0], {}).get('name', None)
if name:
languages[locale]['name'] = name
- languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
else:
# filter out locales with no name
del languages[locale]
continue
+ # try to get language name in english
+ if not languages[locale].get('english_name'):
+ languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
+
# try to get country name
if locale.find('-') > 0 and not languages[locale].get('country'):
languages[locale]['country'] = get_country_name(locale) or ''
@@ -145,10 +148,10 @@ def filter_single_country_languages():
# Write languages.py.
def write_languages_file():
new_file = open(languages_file, 'w')
- file_content = '# -*- coding: utf-8 -*-\n'
- file_content += '# list of language codes\n'
- file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
- file_content += '\nlanguage_codes = ('
+ file_content = '# -*- coding: utf-8 -*-\n'\
+ + '# list of language codes\n'\
+ + '# this file is generated automatically by utils/update_search_languages.py\n'\
+ + '\nlanguage_codes = ('
for code in sorted(languages):
file_content += '\n (u"' + code + '"'\
+ ', u"' + languages[code]['name'].split(' (')[0] + '"'\