author    marc <a01200356@itesm.mx>  2016-11-05 20:51:38 -0600
committer marc <a01200356@itesm.mx>  2016-12-13 19:58:10 -0600
commit    f62ce21f50b540315a708ebfbf36878ddec9d1c4
tree      79f69b171e8d2d08fa30aa32a3592286622f9fcc /searx/engines
parent    92c6e88ad3e5ba57bd6e2ba64d0c38e8fd72ea09
[mod] fetch supported languages for several engines
utils/fetch_languages.py fetches the languages supported by each engine and generates engines_languages.json, mapping each engine to its supported languages.
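
The generator script itself lives outside searx/engines and is not part of this diff; the following is a minimal sketch of how it could combine the new per-engine fetch_supported_languages() hooks into searx/data/engines_languages.json. It assumes that importing searx.engines populates its module-level engines dict from settings.yml (as the __init__.py hunk below suggests); the real utils/fetch_languages.py may be structured differently.

import json

# importing searx.engines runs load_engine() for every engine in settings.yml
from searx.engines import engines

engines_languages = {}
for name, engine in engines.items():
    # only engines that define the new hook are fetched; depending on the
    # engine, the result is either a plain list of codes (bing, yahoo) or a
    # dict mapping codes to names (dailymotion, google, wikipedia)
    if hasattr(engine, 'fetch_supported_languages'):
        engines_languages[name] = engine.fetch_supported_languages()

with open('searx/data/engines_languages.json', 'w') as f:
    json.dump(engines_languages, f, indent=2, sort_keys=True)
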
Diffstat (limited to 'searx/engines')
 searx/engines/__init__.py               |  6
 searx/engines/bing.py                   | 15
 searx/engines/bing_images.py            |  2
 searx/engines/bing_news.py              |  2
 searx/engines/dailymotion.py            | 41
 searx/engines/duckduckgo.py             | 27
 searx/engines/duckduckgo_definitions.py |  2
 searx/engines/gigablast.py              | 22
 searx/engines/google.py                 | 30
 searx/engines/google_news.py            |  2
 searx/engines/mediawiki.py              |  1
 searx/engines/qwant.py                  | 15
 searx/engines/startpage.py              |  5
 searx/engines/subtitleseeker.py         |  5
 searx/engines/swisscows.py              | 21
 searx/engines/wikidata.py               |  6
 searx/engines/wikipedia.py              | 53
 searx/engines/yahoo.py                  | 20
 searx/engines/yahoo_news.py             |  2
 19 files changed, 162 insertions(+), 115 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index ab3677984..7a64fd25b 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -20,6 +20,7 @@ from os.path import realpath, dirname
import sys
from flask_babel import gettext
from operator import itemgetter
+from json import loads
from searx import settings
from searx import logger
from searx.utils import load_module
@@ -78,6 +79,9 @@ def load_engine(engine_data):
if not hasattr(engine, arg_name):
setattr(engine, arg_name, arg_value)
+ if engine_data['name'] in languages:
+ setattr(engine, 'supported_languages', languages[engine_data['name']])
+
# checking required variables
for engine_attr in dir(engine):
if engine_attr.startswith('_'):
@@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
logger.error('No engines found. Edit your settings.yml')
exit(2)
+languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
+
for engine_data in settings['engines']:
engine = load_engine(engine_data)
if engine is not None:
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 052b66448..354003399 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -15,12 +15,14 @@
from urllib import urlencode
from lxml import html
+from requests import get
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
# search-url
base_url = 'https://www.bing.com/'
@@ -81,3 +83,16 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = []
+ response = get(supported_languages_url)
+ dom = html.fromstring(response.text)
+ options = dom.xpath('//div[@id="limit-languages"]//input')
+ for option in options:
+ code = option.xpath('./@id')[0].replace('_', '-')
+ supported_languages.append(code)
+
+ return supported_languages
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index c0deaf6b2..746d3abc4 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -19,7 +19,7 @@ from urllib import urlencode
from lxml import html
from json import loads
import re
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
# engine dependent config
categories = ['images']
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 4bac5bbce..2d936fa53 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -17,7 +17,7 @@ from datetime import datetime
from dateutil import parser
from lxml import etree
from searx.utils import list_get
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
# engine dependent config
categories = ['news']
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 4a7d7b6a8..813dd951f 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -15,29 +15,12 @@
from urllib import urlencode
from json import loads
from datetime import datetime
+from requests import get
# engine dependent config
categories = ['videos']
paging = True
language_support = True
-supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
- "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
- "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
- "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
- "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
- "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
- "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
- "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
- "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
- "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
- "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
- "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
- "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
- "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
- "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
- "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
- "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
- "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
@@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
+supported_languages_url = 'https://api.dailymotion.com/languages'
+
# do search-request
def request(query, params):
@@ -92,3 +77,23 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = {}
+
+ response = get(supported_languages_url)
+ response_json = loads(response.text)
+
+ for language in response_json['list']:
+ supported_languages[language['code']] = {}
+
+ name = language['native_name']
+ if name:
+ supported_languages[language['code']]['name'] = name
+ english_name = language['name']
+ if english_name:
+ supported_languages[language['code']]['english_name'] = english_name
+
+ return supported_languages
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 3e1752dd0..d37d2778b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -15,19 +15,15 @@
from urllib import urlencode
from lxml.html import fromstring
+from requests import get
+from json import loads
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
paging = True
language_support = True
-supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
- "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
- "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
- "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
- "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
- "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
- "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
+supported_languages_url = 'https://duckduckgo.com/d2030.js'
time_range_support = True
# search-url
@@ -65,8 +61,6 @@ def request(query, params):
locale = 'xa' + params['language'].split('-')[0]
elif params['language'][-2:] == 'GB':
locale = 'uk' + params['language'].split('-')[0]
- elif params['language'] == 'es-419':
- locale = 'xl-es'
else:
locale = params['language'].split('-')
if len(locale) == 2:
@@ -120,3 +114,18 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ response = get(supported_languages_url)
+
+ # response is a js file with regions as an embedded object
+ response_page = response.text
+ response_page = response_page[response_page.find('regions:{') + 8:]
+ response_page = response_page[:response_page.find('}') + 1]
+
+ regions_json = loads(response_page)
+ supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
+
+ return supported_languages
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 23a2f3be3..b965c02e9 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -4,7 +4,7 @@ from re import compile, sub
from lxml import html
from searx.utils import html_to_text
from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import supported_languages
+from searx.engines.duckduckgo import fetch_supported_languages
url = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index f012e1df2..e598e55c4 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -14,6 +14,8 @@ from json import loads
from random import randint
from time import time
from urllib import urlencode
+from requests import get
+from lxml.html import fromstring
# engine dependent config
categories = ['general']
@@ -40,11 +42,7 @@ url_xpath = './/url'
title_xpath = './/title'
content_xpath = './/sum'
-supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
- "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
- "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
- "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
- "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
+supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
# do search-request
@@ -90,3 +88,17 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = []
+ response = get(supported_languages_url)
+ dom = fromstring(response.text)
+ links = dom.xpath('//span[@id="menu2"]/a')
+ for link in links:
+ code = link.xpath('./@href')[0][-2:]
+ if code != 'xx' and code not in supported_languages:
+ supported_languages.append(code)
+
+ return supported_languages
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 31035be69..a82a0b5a7 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -12,6 +12,7 @@ import re
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html, etree
+from requests import get
from searx.engines.xpath import extract_text, extract_url
from searx.search import logger
@@ -23,20 +24,6 @@ categories = ['general']
paging = True
language_support = True
use_locale_domain = True
-supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
- "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
- "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
- "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
- "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
- "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
- "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
- "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
- "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
- "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
- "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
- "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
- "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
- "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
time_range_support = True
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
maps_path = '/maps'
redirect_path = '/url'
images_path = '/images'
+supported_languages_url = 'https://www.google.com/preferences?#languages'
# specific xpath variables
results_xpath = '//div[@class="g"]'
@@ -373,3 +361,17 @@ def attributes_to_html(attributes):
retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
retval = retval + '</table>'
return retval
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = {}
+ response = get(supported_languages_url)
+ dom = html.fromstring(response.text)
+ options = dom.xpath('//select[@name="hl"]/option')
+ for option in options:
+ code = option.xpath('./@value')[0].split('-')[0]
+ name = option.text[:-1].title()
+ supported_languages[code] = {"name": name}
+
+ return supported_languages
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 6d1430248..d138f99f5 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -13,7 +13,7 @@
from lxml import html
from urllib import urlencode
from json import loads
-from searx.engines.google import supported_languages
+from searx.engines.google import fetch_supported_languages
# search-url
categories = ['news']
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index ea607dd60..93d98d3aa 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -15,7 +15,6 @@
from json import loads
from string import Formatter
from urllib import urlencode, quote
-from searx.engines.wikipedia import supported_languages
# engine dependent config
categories = ['general']
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 200e9ada9..67803fa94 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -20,11 +20,6 @@ from searx.utils import html_to_text
categories = None
paging = True
language_support = True
-supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
- "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
- "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
- "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
- "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
category_to_keyword = {'general': 'web',
'images': 'images',
@@ -51,15 +46,7 @@ def request(query, params):
# add language tag if specified
if params['language'] != 'all':
- locale = params['language'].split('-')
- if len(locale) == 2 and params['language'] in supported_languages:
- params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
- else:
- # try to get a country code for language
- for lang in supported_languages:
- if locale[0] == lang.split('-')[0]:
- params['url'] += '&locale=' + lang.replace('-', '_').lower()
- break
+ params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
return params
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 3814d9949..54aafdee5 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -24,11 +24,6 @@ categories = ['general']
# paging = False
language_support = True
-supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
- "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
- "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
- "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
- "sv", "tl", "th", "tr", "uk", "vi"]
# search-url
base_url = 'https://startpage.com/'
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
index 2c0a94f08..f979d0141 100644
--- a/searx/engines/subtitleseeker.py
+++ b/searx/engines/subtitleseeker.py
@@ -22,7 +22,7 @@ language = ""
# search-url
url = 'http://www.subtitleseeker.com/'
-search_url = url + 'search/TITLES/{query}&p={pageno}'
+search_url = url + 'search/TITLES/{query}?p={pageno}'
# specific xpath variables
results_xpath = '//div[@class="boxRows"]'
@@ -51,7 +51,8 @@ def response(resp):
elif resp.search_params['language'] != 'all':
search_lang = [lc[3]
for lc in language_codes
- if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
+ if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
+ search_lang = search_lang[0].split(' (')[0]
# parse results
for result in dom.xpath(results_xpath):
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index 5c6b051a9..7f85019a6 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -13,17 +13,13 @@
from json import loads
from urllib import urlencode, unquote
import re
+from requests import get
+from lxml.html import fromstring
# engine dependent config
categories = ['general', 'images']
paging = True
language_support = True
-supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
- "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
- "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
- "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
- "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
- "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
# search-url
base_url = 'https://swisscows.ch/'
@@ -114,3 +110,16 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = []
+ response = get(base_url)
+ dom = fromstring(response.text)
+ options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
+ for option in options:
+ code = option.xpath('./@data-val')[0]
+ supported_languages.append(code)
+
+ return supported_languages
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index edb6d75fe..9c0a768e0 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -15,7 +15,7 @@ from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.utils import format_date_by_locale
-from searx.engines.wikipedia import supported_languages
+from searx.engines.wikipedia import fetch_supported_languages
from json import loads
from lxml.html import fromstring
@@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
def request(query, params):
- language = params['language'].split('_')[0]
+ language = params['language'].split('-')[0]
if language == 'all':
language = 'en'
@@ -72,7 +72,7 @@ def response(resp):
html = fromstring(resp.content)
wikidata_ids = html.xpath(wikidata_ids_xpath)
- language = resp.search_params['language'].split('_')[0]
+ language = resp.search_params['language'].split('-')[0]
if language == 'all':
language = 'en'
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index fdba5ed68..0dee325a7 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -12,36 +12,9 @@
from json import loads
from urllib import urlencode, quote
+from requests import get
+from lxml.html import fromstring
-supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
- "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
- "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
- "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
- "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
- "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
- "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
- "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
- "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
- "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
- "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
- "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
- "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
- "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
- "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
- "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
- "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
- "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
- "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
- "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
- "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
- "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
- "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
- "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
- "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
- "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
- "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
- "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
- "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
# search-url
base_url = 'https://{language}.wikipedia.org/'
@@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
'&explaintext'\
'&pithumbsize=300'\
'&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url
@@ -142,3 +116,24 @@ def response(resp):
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = {}
+ response = get(supported_languages_url)
+ dom = fromstring(response.text)
+ tables = dom.xpath('//table[contains(@class,"sortable")]')
+ for table in tables:
+ # exclude header row
+ trs = table.xpath('.//tr')[1:]
+ for tr in trs:
+ td = tr.xpath('./td')
+ code = td[3].xpath('./a')[0].text
+ name = td[2].xpath('./a')[0].text
+ english_name = td[1].xpath('./a')[0].text
+ articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+ if articles >= 10000:
+ supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
+
+ return supported_languages
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index c00e42368..db10c8939 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -14,16 +14,13 @@
from urllib import urlencode
from urlparse import unquote
from lxml import html
+from requests import get
from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
-supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
- "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
- "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
- "sl", "es", "sv", "th", "tr"]
time_range_support = True
# search-url
@@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
+supported_languages_url = 'https://search.yahoo.com/web/advanced'
+
# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
@@ -142,3 +141,16 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+ supported_languages = []
+ response = get(supported_languages_url)
+ dom = html.fromstring(response.text)
+ options = dom.xpath('//div[@id="yschlang"]/span/label/input')
+ for option in options:
+ code = option.xpath('./@value')[0][5:]
+ supported_languages.append(code)
+
+ return supported_languages
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index 613513e59..bc7b5c368 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -12,7 +12,7 @@
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, supported_languages
+from searx.engines.yahoo import parse_url, fetch_supported_languages
from datetime import datetime, timedelta
import re
from dateutil import parser