summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
authormarc <a01200356@itesm.mx>2016-11-05 20:51:38 -0600
committermarc <a01200356@itesm.mx>2016-12-13 19:58:10 -0600
commitf62ce21f50b540315a708ebfbf36878ddec9d1c4 (patch)
tree79f69b171e8d2d08fa30aa32a3592286622f9fcc /utils
parent92c6e88ad3e5ba57bd6e2ba64d0c38e8fd72ea09 (diff)
downloadsearxng-f62ce21f50b540315a708ebfbf36878ddec9d1c4.tar.gz
searxng-f62ce21f50b540315a708ebfbf36878ddec9d1c4.zip
[mod] fetch supported languages for several engines
utils/fetch_languages.py gets the languages supported by each engine and generates engines_languages.json with each engine's supported languages.
Diffstat (limited to 'utils')
-rw-r--r--utils/fetch_languages.py164
-rw-r--r--utils/update_languages.py169
2 files changed, 164 insertions, 169 deletions
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py
new file mode 100644
index 000000000..ae4a2def9
--- /dev/null
+++ b/utils/fetch_languages.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+
+# This script generates languages.py from intersecting each engine's supported languages.
+#
+# The country names are obtained from http://api.geonames.org which requires registering as a user.
+#
+# Output files (engines_languages.json and languages.py)
+# are written in current directory to avoid overwriting in case something goes wrong.
+
+from requests import get
+from urllib import urlencode
+from lxml.html import fromstring
+from json import loads, dumps
+import io
+from sys import path
+path.append('../searx') # noqa
+from searx.engines import engines
+
+# Geonames API for country names.
+geonames_user = '' # ADD USER NAME HERE
+country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
+
+# Output files.
+engines_languages_file = 'engines_languages.json'
+languages_file = 'languages.py'
+
+engines_languages = {}
+languages = {}
+
+
# To filter out invalid codes and dialects.
def valid_code(lang_code):
    """Return True if lang_code looks like a usable language/locale code.

    Rejects known-bogus codes ('xx*' placeholders, 'sl-SL', 'wt-WT', 'jw',
    and the 'UK'/'XA'/'XL' pseudo-country suffixes) as well as dialect-like
    codes (more than two subtags, or subtags that are too long).
    """
    # filter invalid codes
    # sl-SL is technically not invalid, but still a mistake
    if (lang_code[:2] == 'xx'
            or lang_code in ('sl-SL', 'wt-WT', 'jw')
            or lang_code[-2:] in ('UK', 'XA', 'XL')):
        return False

    # filter dialects: at most 'lang-COUNTRY', with a language subtag of
    # up to 3 chars and a country subtag of up to 2 chars
    parts = lang_code.split('-')
    if len(parts) > 2 or len(parts[0]) > 3:
        return False
    if len(parts) == 2 and len(parts[1]) > 2:
        return False

    return True
+
+
# Get country name in specified language.
def get_country_name(locale):
    """Return the country name of a 'lang-COUNTRY' locale, localized in
    that language, via the geonames.org countryInfoJSON API.

    Returns '' when no geonames username is configured, when the locale
    has no country part, or when the lookup yields no unique result.
    """
    # bug fix: the original used "is ''" — string identity is an
    # implementation detail and must not be relied on; test emptiness
    if not geonames_user:
        return ''

    parts = locale.split('-')
    if len(parts) != 2:
        return ''

    url = country_names_url.format(parameters=urlencode({'lang': parts[0],
                                                         'country': parts[1],
                                                         'username': geonames_user}))
    response = get(url)
    data = loads(response.text)
    content = data.get('geonames', None)
    if content is None or len(content) != 1:
        print("No country name found for " + parts[0] + "-" + parts[1])
        return ''

    return content[0].get('countryName', '')
+
+
# Fetches supported languages for each engine and writes a json file with them.
def fetch_supported_languages():
    """Call each engine's fetch_supported_languages() hook and dump the
    collected results into engines_languages_file as UTF-8 JSON.

    Engines without the hook are skipped; a per-engine failure is printed
    and does not abort the remaining engines (best effort).
    """
    for engine_name in engines:
        if hasattr(engines[engine_name], 'fetch_supported_languages'):
            try:
                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
            except Exception as e:
                # report and continue with the next engine
                print(e)

    # write json file; the context manager guarantees the handle is closed.
    # json.dumps already returns text on Python 3 (the py2-only `unicode()`
    # wrapper and `encoding=` kwarg were dropped)
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        f.write(dumps(engines_languages, indent=4, ensure_ascii=False))
+
+
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
    """Merge every engine's supported languages into the module-level
    `languages` dict, seeding from wikipedia for better language names."""
    # include wikipedia first for more accurate language names
    # exclude languages with too few articles
    # (py3 fix: items() instead of the py2-only iteritems())
    languages.update({code: lang for code, lang
                      in engines_languages['wikipedia'].items()
                      if valid_code(code) and lang['articles'] >= 100000})

    for engine_name in engines_languages:
        for locale in engines_languages[engine_name]:
            if not valid_code(locale):
                continue

            # if language is not on list or if it has no name yet
            if locale not in languages or not languages[locale].get('name'):
                # only dict-shaped engine results carry metadata worth copying;
                # require enough articles when that metric is present
                if isinstance(engines_languages[engine_name], dict) \
                        and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
                    languages[locale] = engines_languages[engine_name][locale]
                else:
                    languages[locale] = {}

    # get locales that have no name yet
    # iterate over a snapshot: deleting from a dict while iterating its
    # keys() view raises RuntimeError on Python 3
    for locale in list(languages):
        if not languages[locale].get('name'):
            # try to inherit language/country names from the bare language code
            name = languages.get(locale.split('-')[0], {}).get('name', None)
            if name:
                languages[locale]['name'] = name
                languages[locale]['country'] = get_country_name(locale) or ''
                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
            else:
                # filter out locales with no name
                del languages[locale]
+
+
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages(langs=None):
    """Drop the bare language entry (e.g. 'es') when exactly one country
    variant of it exists (e.g. only 'es-ES').

    langs: dict to filter in place; defaults to the module-level
    `languages` dict (backward compatible with the original no-arg call).
    """
    if langs is None:
        langs = languages

    prev_lang = None
    countries = 0
    for code in sorted(langs):
        lang = code.split('-')[0]
        if lang == prev_lang:
            countries += 1
        else:
            if prev_lang is not None and countries == 1:
                # pop() so a missing bare code never raises KeyError
                langs.pop(prev_lang, None)
            countries = 0
            prev_lang = lang

    # bug fix: the original never checked the final language group, so a
    # trailing single-country language kept its countryless entry
    if prev_lang is not None and countries == 1:
        langs.pop(prev_lang, None)
+
+
# Write languages.py.
def write_languages_file():
    """Render the collected `languages` dict into languages_file as a
    generated Python module with a `language_codes` tuple."""
    entries = []
    for code in sorted(languages):
        lang = languages[code]
        # strip parenthesized qualifiers from names, e.g. "English (US)"
        entries.append('\n    (u"' + code + '"'
                       + ', u"' + lang['name'].split(' (')[0] + '"'
                       + ', u"' + lang.get('country', '') + '"'
                       + ', u"' + lang.get('english_name', '').split(' (')[0] + '"),')

    body = ''.join(entries)
    if body:
        # remove last comma (guarded: with no languages the original
        # sliced off the opening parenthesis instead)
        body = body[:-1]

    file_content = ('# -*- coding: utf-8 -*-\n'
                    '# list of language codes\n'
                    # fixed: this script is utils/fetch_languages.py, not the
                    # removed update_search_languages.py
                    '# this file is generated automatically by utils/fetch_languages.py\n'
                    '\nlanguage_codes = (' + body + '\n)\n')

    # py3 fix: write text through an encoding handle instead of passing
    # .encode('utf8') bytes to a text-mode file
    with io.open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
+
+
+if __name__ == "__main__":
+ fetch_supported_languages()
+ join_language_lists()
+ filter_single_country_languages()
+ write_languages_file()
diff --git a/utils/update_languages.py b/utils/update_languages.py
deleted file mode 100644
index cc3fa29cc..000000000
--- a/utils/update_languages.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This script generates languages.py from
-# intersecting each engine's supported languages.
-#
-# The language's native names are obtained from
-# Wikipedia and Google's supported languages.
-#
-# The country names are obtained from http://api.geonames.org
-# which requires registering as a user.
-#
-# Output file (languages.py) is written in current directory
-# to avoid overwriting in case something goes wrong.
-
-from requests import get
-from urllib import urlencode
-from lxml.html import fromstring
-from json import loads
-from sys import path
-path.append('../searx')
-from searx.engines import engines
-
-# list of names
-wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-google_languages_url = 'https://www.google.com/preferences?#languages'
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
-
-geonames_user = '' # add user name here
-
-google_json_name = 'google.preferences.langMap'
-
-languages = {}
-
-
-# To filter out invalid codes and dialects.
-def valid_code(lang_code):
- # filter invalid codes
- # sl-SL is technically not invalid, but still a mistake
- if lang_code[:2] == 'xx'\
- or lang_code == 'sl-SL'\
- or lang_code == 'jw'\
- or lang_code[-2:] == 'UK'\
- or lang_code[-2:] == 'XA'\
- or lang_code[-2:] == 'XL':
- return False
-
- # filter dialects
- lang_code = lang_code.split('-')
- if len(lang_code) > 2 or len(lang_code[0]) > 3:
- return False
- if len(lang_code) == 2 and len(lang_code[1]) > 2:
- return False
-
- return True
-
-
-# Get country name in specified language.
-def get_country_name(locale):
- if geonames_user is '':
- return ''
-
- locale = locale.split('-')
- if len(locale) != 2:
- return ''
-
- url = country_names_url.format(parameters=urlencode({'lang': locale[0],
- 'country': locale[1],
- 'username': geonames_user}))
- response = get(url)
- json = loads(response.text)
- content = json.get('geonames', None)
- if content is None or len(content) != 1:
- print "No country name found for " + locale[0] + "-" + locale[1]
- print json
- return ''
-
- return content[0].get('countryName', '')
-
-
-# Get language names from Wikipedia.
-def get_wikipedia_languages():
- response = get(wiki_languages_url)
- dom = fromstring(response.text)
- tables = dom.xpath('//table[contains(@class,"sortable")]')
- for table in tables:
- # exclude header row
- trs = table.xpath('.//tr')[1:]
- for tr in trs:
- td = tr.xpath('./td')
- code = td[3].xpath('./a')[0].text
- name = td[2].xpath('./a')[0].text
- english_name = td[1].xpath('./a')[0].text
- articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
-
- # exclude language variants and languages with few articles
- if code not in languages and articles >= 10000 and valid_code(code):
- languages[code] = (name, '', english_name)
-
-
-# Get language names from Google.
-def get_google_languages():
- response = get(google_languages_url)
- dom = fromstring(response.text)
- options = dom.xpath('//select[@name="hl"]/option')
- for option in options:
- code = option.xpath('./@value')[0].split('-')[0]
- name = option.text[:-1].title()
-
- if code not in languages and valid_code(code):
- languages[code] = (name, '', '')
-
-
-# Join all language lists.
-# iterate all languages supported by each engine
-def join_language_lists():
- for engine_name in engines:
- for locale in engines[engine_name].supported_languages:
- locale = locale.replace('_', '-')
- if locale not in languages and valid_code(locale):
- # try to get language name
- language = languages.get(locale.split('-')[0], None)
- if language == None:
- print engine_name + ": " + locale
- continue
-
- country = get_country_name(locale)
- languages[locale] = (language[0], country, language[2])
-
-
-# Remove countryless language if language is featured in only one country.
-def filter_single_country_languages():
- prev_lang = None
- for code in sorted(languages):
- lang = code.split('-')[0]
- if lang == prev_lang:
- countries += 1
- else:
- if prev_lang is not None and countries == 1:
- del languages[prev_lang]
- countries = 0
- prev_lang = lang
-
-
-# Write languages.py.
-def write_languages_file():
- new_file = open('languages.py', 'w')
- file_content = '# -*- coding: utf-8 -*-\n'
- file_content += '# list of language codes\n'
- file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
- file_content += '\nlanguage_codes = ('
- for code in sorted(languages):
- (name, country, english) = languages[code]
- file_content += '\n (u"' + code + '"'\
- + ', u"' + name + '"'\
- + ', u"' + country + '"'\
- + ', u"' + english + '"),'
- # remove last comma
- file_content = file_content[:-1]
- file_content += '\n)\n'
- new_file.write(file_content.encode('utf8'))
- new_file.close()
-
-
-if __name__ == "__main__":
- get_wikipedia_languages()
- get_google_languages()
- join_language_lists()
- filter_single_country_languages()
- write_languages_file()