author     Adam Tauber <asciimoo@gmail.com>   2015-02-12 10:52:55 +0100
committer  Adam Tauber <asciimoo@gmail.com>   2015-02-12 10:52:55 +0100
commit     f6db77d81ea87d99462b4c3cc40a8a27e0264724 (patch)
tree       b26fb71a62082aeec81c7bb1bb3d7447d006aed3 /searx/engines
parent     516105c570a920dadeb87b34ee5ee434ad5cb16f (diff)
parent     f96154b7c454a3b02bf688f248b4471c2020c28f (diff)
Merge pull request #210 from Cqoicebordel/unit-tests
unit tests
Diffstat (limited to 'searx/engines')
 searx/engines/currency_convert.py       |  9
 searx/engines/duckduckgo.py             | 10
 searx/engines/duckduckgo_definitions.py |  5
 searx/engines/faroo.py                  |  8
 searx/engines/openstreetmap.py          |  9
 searx/engines/photon.py                 |  2
 searx/engines/startpage.py              | 13
 searx/engines/subtitleseeker.py         | 15
 searx/engines/twitter.py                | 13
 searx/engines/yacy.py                   | 19
 searx/engines/yahoo.py                  |  4
 11 files changed, 54 insertions(+), 53 deletions(-)
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index d8841c1d1..4618c82b1 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -13,12 +13,9 @@ def request(query, params):
     if not m:
         # wrong query
         return params
-    try:
-        ammount, from_currency, to_currency = m.groups()
-        ammount = float(ammount)
-    except:
-        # wrong params
-        return params
+
+    ammount, from_currency, to_currency = m.groups()
+    ammount = float(ammount)
 
     q = (from_currency + to_currency).upper()
 
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 583e33f73..e35a6334c 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -15,7 +15,7 @@
 from urllib import urlencode
 from lxml.html import fromstring
-from searx.utils import html_to_text
+from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['general']
@@ -28,8 +28,8 @@ url = 'https://duckduckgo.com/html?{query}&s={offset}'
 # specific xpath variables
 result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
 url_xpath = './/a[@class="large"]/@href'
-title_xpath = './/a[@class="large"]//text()'
-content_xpath = './/div[@class="snippet"]//text()'
+title_xpath = './/a[@class="large"]'
+content_xpath = './/div[@class="snippet"]'
 
 
 # do search-request
@@ -64,8 +64,8 @@ def response(resp):
         if not res_url:
             continue
 
-        title = html_to_text(''.join(r.xpath(title_xpath)))
-        content = html_to_text(''.join(r.xpath(content_xpath)))
+        title = extract_text(r.xpath(title_xpath))
+        content = extract_text(r.xpath(content_xpath))
 
         # append result
         results.append({'title': title,
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index b66d6c0f2..793e97d22 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -25,9 +25,10 @@ def request(query, params):
 
 
 def response(resp):
-    search_res = json.loads(resp.text)
     results = []
 
+    search_res = json.loads(resp.text)
+
     content = ''
     heading = search_res.get('Heading', '')
     attributes = []
@@ -68,7 +69,7 @@ def response(resp):
         results.append({'title': heading, 'url': firstURL})
 
     # related topics
-    for ddg_result in search_res.get('RelatedTopics', None):
+    for ddg_result in search_res.get('RelatedTopics', []):
         if 'FirstURL' in ddg_result:
             suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                         ddg_result.get('Text', None),
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index 5360ea156..4a5e60a60 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -37,7 +37,7 @@ search_category = {'general': 'web',
 
 # do search-request
 def request(query, params):
-    offset = (params['pageno']-1) * number_of_results + 1
+    offset = (params['pageno'] - 1) * number_of_results + 1
     categorie = search_category.get(params['category'], 'web')
 
     if params['language'] == 'all':
@@ -45,11 +45,11 @@ def request(query, params):
     else:
         language = params['language'].split('_')[0]
 
-    # skip, if language is not supported
+    # if language is not supported, put it in english
     if language != 'en' and\
        language != 'de' and\
        language != 'zh':
-        return params
+        language = 'en'
 
     params['url'] = search_url.format(offset=offset,
                                       number_of_results=number_of_results,
@@ -69,12 +69,10 @@ def response(resp):
 
     # HTTP-Code 401: api-key is not valide
     if resp.status_code == 401:
         raise Exception("API key is not valide")
-        return []
 
     # HTTP-Code 429: rate limit exceeded
     if resp.status_code == 429:
         raise Exception("rate limit has been exceeded!")
-        return []
 
     results = []
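A recurring change in this merge replaces ad-hoc text extraction such as ''.join(r.xpath('...//text()')) with searx.engines.xpath.extract_text, so engines keep plain element xpaths and leave text flattening to one shared helper. Below is a minimal sketch of the behaviour the updated engines rely on; extract_text_sketch is a hypothetical stand-in written for illustration, not the actual searx implementation:

    from lxml import html

    def extract_text_sketch(xpath_results):
        # accept a list of xpath results, a bare string result, or an element
        if isinstance(xpath_results, list):
            return ' '.join(extract_text_sketch(r) for r in xpath_results).strip()
        if isinstance(xpath_results, str):
            return xpath_results.strip()
        # an element: flatten all of its descendant text nodes
        return xpath_results.text_content().strip()

    dom = html.fromstring('<div><a class="large">a <b>bold</b> title</a></div>')
    print(extract_text_sketch(dom.xpath('.//a[@class="large"]')))  # a bold title

This is why xpaths such as title_xpath drop their trailing //text() in the hunks above: the helper now does that traversal itself.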
exceeded!") - return [] results = [] diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 68446ef5f..60c3c13ca 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -38,6 +38,9 @@ def response(resp): # parse results for r in json: + if 'display_name' not in r: + continue + title = r['display_name'] osm_type = r.get('osm_type', r.get('type')) url = result_base_url.format(osm_type=osm_type, @@ -49,10 +52,8 @@ def response(resp): geojson = r.get('geojson') # if no geojson is found and osm_type is a node, add geojson Point - if not geojson and\ - osm_type == 'node': - geojson = {u'type': u'Point', - u'coordinates': [r['lon'], r['lat']]} + if not geojson and osm_type == 'node': + geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]} address_raw = r.get('address') address = {} diff --git a/searx/engines/photon.py b/searx/engines/photon.py index 16340d24a..a9c558c4b 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -61,7 +61,7 @@ def response(resp): continue # get title - title = properties['name'] + title = properties.get('name') # get osm-type if properties.get('osm_type') == 'N': diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index d60ecd978..9d5b4befe 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -13,6 +13,7 @@ from lxml import html from cgi import escape import re +from searx.engines.xpath import extract_text # engine dependent config categories = ['general'] @@ -45,8 +46,7 @@ def request(query, params): # set language if specified if params['language'] != 'all': - params['data']['with_language'] = ('lang_' + - params['language'].split('_')[0]) + params['data']['with_language'] = ('lang_' + params['language'].split('_')[0]) return params @@ -64,18 +64,15 @@ def response(resp): continue link = links[0] url = link.attrib.get('href') - try: - title = escape(link.text_content()) - except UnicodeDecodeError: - continue # block google-ad url's if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): continue + title = escape(extract_text(link)) + if result.xpath('./p[@class="desc"]'): - content = escape(result.xpath('./p[@class="desc"]')[0] - .text_content()) + content = escape(extract_text(result.xpath('./p[@class="desc"]'))) else: content = '' diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 9aaf1947b..acefe30ea 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -12,6 +12,7 @@ from cgi import escape from urllib import quote_plus from lxml import html from searx.languages import language_codes +from searx.engines.xpath import extract_text # engine dependent config categories = ['videos'] @@ -20,7 +21,7 @@ language = "" # search-url url = 'http://www.subtitleseeker.com/' -search_url = url+'search/TITLES/{query}&p={pageno}' +search_url = url + 'search/TITLES/{query}&p={pageno}' # specific xpath variables results_xpath = '//div[@class="boxRows"]' @@ -44,7 +45,7 @@ def response(resp): if resp.search_params['language'] != 'all': search_lang = [lc[1] for lc in language_codes - if lc[0][:2] == resp.search_params['language']][0] + if lc[0][:2] == resp.search_params['language'].split('_')[0]][0] # parse results for result in dom.xpath(results_xpath): @@ -56,17 +57,17 @@ def response(resp): elif search_lang: href = href + search_lang + '/' - title = escape(link.xpath(".//text()")[0]) + title = escape(extract_text(link)) - content = result.xpath('.//div[contains(@class,"red")]//text()')[0] + 
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index bd9a8c2fc..0e35e6188 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -13,8 +13,8 @@
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
-from cgi import escape
 from datetime import datetime
+from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['social media']
@@ -22,12 +22,12 @@ language_support = True
 
 # search-url
 base_url = 'https://twitter.com/'
-search_url = base_url+'search?'
+search_url = base_url + 'search?'
 
 # specific xpath variables
 results_xpath = '//li[@data-item-type="tweet"]'
 link_xpath = './/small[@class="time"]//a'
-title_xpath = './/span[@class="username js-action-profile-name"]//text()'
+title_xpath = './/span[@class="username js-action-profile-name"]'
 content_xpath = './/p[@class="js-tweet-text tweet-text"]'
 timestamp_xpath = './/span[contains(@class,"_timestamp")]'
 
@@ -39,6 +39,8 @@ def request(query, params):
     # set language if specified
     if params['language'] != 'all':
         params['cookies']['lang'] = params['language'].split('_')[0]
+    else:
+        params['cookies']['lang'] = 'en'
 
     return params
 
@@ -53,8 +55,9 @@ def response(resp):
     for tweet in dom.xpath(results_xpath):
         link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
-        title = ''.join(tweet.xpath(title_xpath))
-        content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
+        title = extract_text(tweet.xpath(title_xpath))
+        content = extract_text(tweet.xpath(content_xpath)[0])
+
         pubdate = tweet.xpath(timestamp_xpath)
         if len(pubdate) > 0:
             timestamp = float(pubdate[0].attrib.get('data-time'))
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index 17e2a7aab..3d26c9cc4 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -25,10 +25,10 @@ number_of_results = 5
 # search-url
 base_url = 'http://localhost:8090'
 search_url = '/yacysearch.json?{query}'\
-    '&startRecord={offset}'\
-    '&maximumRecords={limit}'\
-    '&contentdom={search_type}'\
-    '&resource=global'  # noqa
+             '&startRecord={offset}'\
+             '&maximumRecords={limit}'\
+             '&contentdom={search_type}'\
+             '&resource=global'
 
 # yacy specific type-definitions
 search_types = {'general': 'text',
@@ -41,7 +41,7 @@ search_types = {'general': 'text',
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results
-    search_type = search_types.get(params['category'], '0')
+    search_type = search_types.get(params.get('category'), '0')
 
     params['url'] = base_url +\
         search_url.format(query=urlencode({'query': query}),
@@ -66,9 +66,12 @@ def response(resp):
     if not raw_search_results:
         return []
 
-    search_results = raw_search_results.get('channels', {})[0].get('items', [])
+    search_results = raw_search_results.get('channels', [])
 
-    for result in search_results:
+    if len(search_results) == 0:
+        return []
+
+    for result in search_results[0].get('items', []):
         # parse image results
         if result.get('image'):
             # append result
@@ -88,7 +91,7 @@ def response(resp):
                             'content': result['description'],
                             'publishedDate': publishedDate})
 
-    #TODO parse video, audio and file results
+    # TODO parse video, audio and file results
 
     # return results
    return results
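The yacy.py response() rewrite above replaces raw_search_results.get('channels', {})[0], which blows up whenever the response carries no channels (KeyError when the {} default is indexed, IndexError when 'channels' is an empty list), with an explicit emptiness check. A sketch of the guard in isolation; the raw dict is a made-up stand-in for a yacy JSON response:

    raw = {'channels': []}  # a yacy response with no result channels

    search_results = raw.get('channels', [])

    if len(search_results) == 0:
        items = []  # the old one-liner would have raised IndexError here
    else:
        items = search_results[0].get('items', [])

    print(items)  # []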
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index c6c5b0d0d..161f7513b 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -35,7 +35,7 @@ suggestion_xpath = '//div[@id="satat"]//a'
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
-    start = url_string.find('http', url_string.find('/RU=')+1)
+    start = url_string.find('http', url_string.find('/RU=') + 1)
 
     for ending in endings:
         endpos = url_string.rfind(ending)
@@ -91,7 +91,7 @@ def response(resp):
                         'content': content})
 
     # if no suggestion found, return results
-    if not suggestion_xpath:
+    if not dom.xpath(suggestion_xpath):
         return results
 
     # parse suggestion
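The last yahoo.py hunk fixes a genuine logic bug: suggestion_xpath is a non-empty string, so "if not suggestion_xpath" was always False and the early return never fired. The fix evaluates the expression against the parsed page, where an empty result list is falsy. A self-contained illustration (the HTML snippet is made up):

    from lxml import html

    dom = html.fromstring('<div id="satat"></div>')  # page with no suggestion links
    suggestion_xpath = '//div[@id="satat"]//a'

    # old check: a non-empty string is always truthy, so this never triggered
    # if not suggestion_xpath:
    #     return results

    # new check: dom.xpath() returns a list; empty means no suggestions
    if not dom.xpath(suggestion_xpath):
        print('no suggestions found, returning results early')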