From 2179079a9173b33b81e1084fc1e8e181c19ef8e9 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 2 Aug 2019 13:37:13 +0200 Subject: [fix] fix flickr_noapi decoding (#1655) Characters that were not ASCII were incorrectly decoded. Add a helper function: searx.utils.ecma_unescape (Python implementation of the JavaScript unescape function). --- tests/unit/test_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tests') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index fbaed2bd1..b09b9d414 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -90,6 +90,13 @@ class TestUtils(SearxTestCase): self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL') self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL') + def test_ecma_unscape(self): + self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') + self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), + u'text using %xx: ó') + self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), + u'text using %u: 吉, 世界') + class TestHTMLTextExtractor(SearxTestCase): -- cgit v1.2.3-54-g00ecf From 72029d27ded8d93ab891c616d6bffbe8d3a67dd2 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 2 Aug 2019 13:50:51 +0200 Subject: [enh] Add timeout limit per request (#1640) The new URL parameter "timeout_limit" sets the timeout limit, in seconds. Example: "timeout_limit=1.5" means the timeout limit is 1.5 seconds. In addition, the query can start with <[number] to set the timeout limit. For a number between 0 and 99, the unit is the second: Example: "<3 searx" means the timeout limit is 3 seconds. For a number of 100 or above, the unit is the millisecond: Example: "<850 searx" means the timeout is 850 milliseconds. In addition, there is a new optional setting: outgoing.max_request_timeout. If not set, the user timeout can't go above the searx configuration (as before: the max timeout of the selected engines for a query).
If the value is set, the user can set a timeout between 0 and max_request_timeout using <[number] or timeout_limit query parameter. Related to #1077 Updated version of PR #1413 from @isj-privacore --- searx/query.py | 19 ++++++++++++- searx/search.py | 55 +++++++++++++++++++++++++++++++----- searx/settings.yml | 3 +- searx/templates/oscar/results.html | 1 + searx/templates/simple/infobox.html | 5 ++++ searx/templates/simple/results.html | 12 ++++++-- searx/templates/simple/search.html | 1 + searx/webapp.py | 3 +- tests/unit/test_query.py | 42 ++++++++++++++++++++++++++++ tests/unit/test_search.py | 56 +++++++++++++++++++++++++++++++++++-- 10 files changed, 181 insertions(+), 16 deletions(-) (limited to 'tests') diff --git a/searx/query.py b/searx/query.py index 5265ac914..382aed871 100644 --- a/searx/query.py +++ b/searx/query.py @@ -43,6 +43,7 @@ class RawTextQuery(object): self.query_parts = [] self.engines = [] self.languages = [] + self.timeout_limit = None self.specific = False # parse query, if tags are set, which @@ -69,6 +70,21 @@ class RawTextQuery(object): self.query_parts.append(query_part) continue + # this force the timeout + if query_part[0] == '<': + try: + raw_timeout_limit = int(query_part[1:]) + if raw_timeout_limit < 100: + # below 100, the unit is the second ( <3 = 3 seconds timeout ) + self.timeout_limit = float(raw_timeout_limit) + else: + # 100 or above, the unit is the millisecond ( <850 = 850 milliseconds timeout ) + self.timeout_limit = raw_timeout_limit / 1000.0 + parse_next = True + except ValueError: + # error not reported to the user + pass + # this force a language if query_part[0] == ':': lang = query_part[1:].lower().replace('_', '-') @@ -161,7 +177,7 @@ class RawTextQuery(object): class SearchQuery(object): """container for all the search parameters (query, language, etc...)""" - def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range): + def __init__(self, query, engines, categories, lang, safesearch, 
pageno, time_range, timeout_limit=None): self.query = query.encode('utf-8') self.engines = engines self.categories = categories @@ -169,6 +185,7 @@ class SearchQuery(object): self.safesearch = safesearch self.pageno = pageno self.time_range = time_range + self.timeout_limit = timeout_limit def __str__(self): return str(self.query) + ";" + str(self.engines) diff --git a/searx/search.py b/searx/search.py index 1472073bd..a2c1c85f2 100644 --- a/searx/search.py +++ b/searx/search.py @@ -45,6 +45,16 @@ if sys.version_info[0] == 3: logger = logger.getChild('search') number_of_searches = 0 +max_request_timeout = settings.get('outgoing', {}).get('max_request_timeout' or None) +if max_request_timeout is None: + logger.info('max_request_timeout={0}'.format(max_request_timeout)) +else: + if isinstance(max_request_timeout, float): + logger.info('max_request_timeout={0} second(s)'.format(max_request_timeout)) + else: + logger.critical('outgoing.max_request_timeout if defined has to be float') + from sys import exit + exit(1) def send_http_request(engine, request_params): @@ -265,6 +275,15 @@ def get_search_query_from_webapp(preferences, form): # query_engines query_engines = raw_text_query.engines + # timeout_limit + query_timeout = raw_text_query.timeout_limit + if query_timeout is None and 'timeout_limit' in form: + raw_time_limit = form.get('timeout_limit') + try: + query_timeout = float(raw_time_limit) + except ValueError: + raise SearxParameterException('timeout_limit', raw_time_limit) + # query_categories query_categories = [] @@ -338,7 +357,8 @@ def get_search_query_from_webapp(preferences, form): query_engines = deduplicate_query_engines(query_engines) return (SearchQuery(query, query_engines, query_categories, - query_lang, query_safesearch, query_pageno, query_time_range), + query_lang, query_safesearch, query_pageno, + query_time_range, query_timeout), raw_text_query) @@ -351,6 +371,7 @@ class Search(object): super(Search, self).__init__() self.search_query = 
search_query self.result_container = ResultContainer() + self.actual_timeout = None # do search-request def search(self): @@ -380,7 +401,7 @@ class Search(object): search_query = self.search_query # max of all selected engine timeout - timeout_limit = 0 + default_timeout = 0 # start search-reqest for all selected engines for selected_engine in search_query.engines: @@ -420,12 +441,32 @@ class Search(object): # append request to list requests.append((selected_engine['name'], search_query.query, request_params)) - # update timeout_limit - timeout_limit = max(timeout_limit, engine.timeout) - + # update default_timeout + default_timeout = max(default_timeout, engine.timeout) + + # adjust timeout + self.actual_timeout = default_timeout + query_timeout = self.search_query.timeout_limit + + if max_request_timeout is None and query_timeout is None: + # No max, no user query: default_timeout + pass + elif max_request_timeout is None and query_timeout is not None: + # No max, but user query: From user query except if above default + self.actual_timeout = min(default_timeout, query_timeout) + elif max_request_timeout is not None and query_timeout is None: + # Max, no user query: Default except if above max + self.actual_timeout = min(default_timeout, max_request_timeout) + elif max_request_timeout is not None and query_timeout is not None: + # Max & user query: From user query except if above max + self.actual_timeout = min(query_timeout, max_request_timeout) + + logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})" + .format(self.actual_timeout, default_timeout, query_timeout, max_request_timeout)) + + # send all search-request if requests: - # send all search-request - search_multiple_requests(requests, self.result_container, start_time, timeout_limit) + search_multiple_requests(requests, self.result_container, start_time, self.actual_timeout) start_new_thread(gc.collect, tuple()) # return results, suggestions, answers and 
infoboxes diff --git a/searx/settings.yml b/searx/settings.yml index 504a9fbe0..6659c1298 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -34,7 +34,8 @@ ui: # key : !!binary "your_morty_proxy_key" outgoing: # communication with search engines - request_timeout : 2.0 # seconds + request_timeout : 2.0 # default timeout in seconds, can be override by engine + # max_request_timeout: 10.0 # the maximum timeout in seconds useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator pool_connections : 100 # Number of different hosts pool_maxsize : 10 # Number of simultaneous requests by host diff --git a/searx/templates/oscar/results.html b/searx/templates/oscar/results.html index f712e5779..ce557daf9 100644 --- a/searx/templates/oscar/results.html +++ b/searx/templates/oscar/results.html @@ -5,6 +5,7 @@ + {%- endmacro %} {%- macro search_url() %}{{ base_url }}?q={{ q|urlencode }}{% if selected_categories %}&categories={{ selected_categories|join(",") | replace(' ','+') }}{% endif %}{% if pageno > 1 %}&pageno={{ pageno }}{% endif %}{% if time_range %}&time_range={{ time_range }}{% endif %}{% if current_language != 'all' %}&language={{ current_language }}{% endif %}{% endmacro -%} diff --git a/searx/templates/simple/infobox.html b/searx/templates/simple/infobox.html index d99806ac4..50b568919 100644 --- a/searx/templates/simple/infobox.html +++ b/searx/templates/simple/infobox.html @@ -36,6 +36,11 @@ {% for suggestion in topic.suggestions %}
+ + + + + {% if timeout_limit %}{% endif %}
{% endfor %} diff --git a/searx/templates/simple/results.html b/searx/templates/simple/results.html index a8e899e57..770eebe81 100644 --- a/searx/templates/simple/results.html +++ b/searx/templates/simple/results.html @@ -51,9 +51,11 @@ {% for suggestion in suggestions %}
+ + {% if timeout_limit %}{% endif %}
{% endfor %} @@ -63,7 +65,7 @@

{{ _('Search URL') }} :

-
{{ base_url }}?q={{ q|urlencode }}&language={{ current_language }}&time_range={{ time_range }}&safesearch={{ safesearch }}{% if pageno > 1 %}&pageno={{ pageno }}{% endif %}{% if selected_categories %}&categories={{ selected_categories|join(",") | replace(' ','+') }}{% endif %}
+
{{ base_url }}?q={{ q|urlencode }}&language={{ current_language }}&time_range={{ time_range }}&safesearch={{ safesearch }}{% if pageno > 1 %}&pageno={{ pageno }}{% endif %}{% if selected_categories %}&categories={{ selected_categories|join(",") | replace(' ','+') }}{% endif %}{% if timeout_limit %}&timeout_limit={{ timeout_limit|urlencode }}{% endif %}

{{ _('Download results') }}

@@ -79,6 +81,7 @@ + {% if timeout_limit %}{% endif %}
@@ -97,6 +100,7 @@ + {% if timeout_limit %}{% endif %} @@ -134,7 +138,8 @@ - + {% if timeout_limit %}{% endif %} + {% endif %} @@ -149,7 +154,8 @@ - + {% if timeout_limit %}{% endif %} + diff --git a/searx/templates/simple/search.html b/searx/templates/simple/search.html index 9c4a99b68..e9023b420 100644 --- a/searx/templates/simple/search.html +++ b/searx/templates/simple/search.html @@ -14,4 +14,5 @@ {% include 'simple/categories.html' %} + {% if timeout_limit %}{% endif %} diff --git a/searx/webapp.py b/searx/webapp.py index 8dd4af071..ffe9b4da9 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -628,7 +628,8 @@ def index(): fallback=settings['search']['language']), base_url=get_base_url(), theme=get_current_theme_name(), - favicons=global_favicons[themes.index(get_current_theme_name())] + favicons=global_favicons[themes.index(get_current_theme_name())], + timeout_limit=request.form.get('timeout_limit', None) ) diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 49ccb608f..e4c0bdeed 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -62,3 +62,45 @@ class TestQuery(SearxTestCase): self.assertEquals(len(query.query_parts), 1) self.assertEquals(len(query.languages), 0) self.assertFalse(query.specific) + + def test_timeout_below100(self): + query_text = '<3 the query' + query = RawTextQuery(query_text, []) + query.parse_query() + + self.assertEquals(query.getFullQuery(), query_text) + self.assertEquals(len(query.query_parts), 3) + self.assertEquals(query.timeout_limit, 3) + self.assertFalse(query.specific) + + def test_timeout_above100(self): + query_text = '<350 the query' + query = RawTextQuery(query_text, []) + query.parse_query() + + self.assertEquals(query.getFullQuery(), query_text) + self.assertEquals(len(query.query_parts), 3) + self.assertEquals(query.timeout_limit, 0.35) + self.assertFalse(query.specific) + + def test_timeout_above1000(self): + query_text = '<3500 the query' + query = RawTextQuery(query_text, []) 
+ query.parse_query() + + self.assertEquals(query.getFullQuery(), query_text) + self.assertEquals(len(query.query_parts), 3) + self.assertEquals(query.timeout_limit, 3.5) + self.assertFalse(query.specific) + + def test_timeout_invalid(self): + # invalid number: it is not bang but it is part of the query + query_text = ' Date: Sat, 3 Aug 2019 13:23:36 +0200 Subject: [fix] fix monkey patch in test_webapp.py (#1667) at the end of test_webapp.py, the monkey patch of searx.search.Search was not revert which lead to side effects on other tests close #1663 --- searx/testing.py | 12 ++++++++++++ tests/unit/test_webapp.py | 13 +++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'tests') diff --git a/searx/testing.py b/searx/testing.py index 08a53e3f4..a3616dc12 100644 --- a/searx/testing.py +++ b/searx/testing.py @@ -80,6 +80,18 @@ class SearxTestCase(TestCase): layer = SearxTestLayer + def setattr4test(self, obj, attr, value): + """ + setattr(obj, attr, value) + but reset to the previous value in the cleanup. 
+ """ + previous_value = getattr(obj, attr) + + def cleanup_patch(): + setattr(obj, attr, previous_value) + self.addCleanup(cleanup_patch) + setattr(obj, attr, value) + if __name__ == '__main__': import sys diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index dcf8b583c..72ace4850 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -15,7 +15,7 @@ class ViewsTestCase(SearxTestCase): self.app = webapp.app.test_client() # set some defaults - self.test_results = [ + test_results = [ { 'content': 'first test content', 'title': 'First Test', @@ -47,25 +47,25 @@ class ViewsTestCase(SearxTestCase): ] def search_mock(search_self, *args): - search_self.result_container = Mock(get_ordered_results=lambda: self.test_results, + search_self.result_container = Mock(get_ordered_results=lambda: test_results, answers=set(), corrections=set(), suggestions=set(), infoboxes=[], unresponsive_engines=set(), - results=self.test_results, + results=test_results, results_number=lambda: 3, - results_length=lambda: len(self.test_results), + results_length=lambda: len(test_results), get_timings=lambda: timings) - Search.search = search_mock + self.setattr4test(Search, 'search', search_mock) def get_current_theme_name_mock(override=None): if override: return override return 'legacy' - webapp.get_current_theme_name = get_current_theme_name_mock + self.setattr4test(webapp, 'get_current_theme_name', get_current_theme_name_mock) self.maxDiff = None # to see full diffs @@ -91,6 +91,7 @@ class ViewsTestCase(SearxTestCase): result_dict = json.loads(result.data.decode('utf-8')) self.assertEqual('test', result_dict['query']) + self.assertEqual(len(result_dict['results']), 2) self.assertEqual(result_dict['results'][0]['content'], 'first test content') self.assertEqual(result_dict['results'][0]['url'], 'http://first.test.xyz') -- cgit v1.2.3-54-g00ecf From 9ff500181624e996febb60bf4f6378fc749f7f9a Mon Sep 17 00:00:00 2001 From: Dalf Date: Mon, 5 Aug 2019 15:43:01 
+0200 Subject: [fix] arxiv engine --- searx/engines/arxiv.py | 2 +- tests/unit/engines/test_arxiv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tests') diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index 5ef84f0c1..182861892 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -29,7 +29,7 @@ def request(query, params): # basic search offset = (params['pageno'] - 1) * number_of_results - string_args = dict(query=query, + string_args = dict(query=query.decode('utf-8'), offset=offset, number_of_results=number_of_results) diff --git a/tests/unit/engines/test_arxiv.py b/tests/unit/engines/test_arxiv.py index b32c0e605..83c4f8595 100644 --- a/tests/unit/engines/test_arxiv.py +++ b/tests/unit/engines/test_arxiv.py @@ -8,7 +8,7 @@ from searx.testing import SearxTestCase class TestBaseEngine(SearxTestCase): def test_request(self): - query = 'test_query' + query = 'test_query'.encode('utf-8') dicto = defaultdict(dict) dicto['pageno'] = 1 params = arxiv.request(query, dicto) -- cgit v1.2.3-54-g00ecf From fcc9587ee9b1a4b31a511a8580f2d534e400e96b Mon Sep 17 00:00:00 2001 From: Dalf Date: Mon, 5 Aug 2019 15:44:02 +0200 Subject: [fix] fdroid engine --- searx/engines/fdroid.py | 29 ++++++++++----------- tests/unit/engines/test_fdroid.py | 55 +++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 37 deletions(-) (limited to 'tests') diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index a6b01a8ee..4066dc716 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -18,13 +18,13 @@ categories = ['files'] paging = True # search-url -base_url = 'https://f-droid.org/' -search_url = base_url + 'repository/browse/?{query}' +base_url = 'https://search.f-droid.org/' +search_url = base_url + '?{query}' # do search-request def request(query, params): - query = urlencode({'fdfilter': query, 'fdpage': params['pageno']}) + query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''}) 
params['url'] = search_url.format(query=query) return params @@ -35,17 +35,16 @@ def response(resp): dom = html.fromstring(resp.text) - for app in dom.xpath('//div[@id="appheader"]'): - url = app.xpath('./ancestor::a/@href')[0] - title = app.xpath('./p/span/text()')[0] - img_src = app.xpath('.//img/@src')[0] - - content = extract_text(app.xpath('./p')[0]) - content = content.replace(title, '', 1).strip() - - results.append({'url': url, - 'title': title, - 'content': content, - 'img_src': img_src}) + for app in dom.xpath('//a[@class="package-header"]'): + app_url = app.xpath('./@href')[0] + app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()')) + app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \ + + ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip() + app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0] + + results.append({'url': app_url, + 'title': app_title, + 'content': app_content, + 'img_src': app_img_src}) return results diff --git a/tests/unit/engines/test_fdroid.py b/tests/unit/engines/test_fdroid.py index d75f4f0b4..42a0a7148 100644 --- a/tests/unit/engines/test_fdroid.py +++ b/tests/unit/engines/test_fdroid.py @@ -13,29 +13,40 @@ class TestFdroidEngine(SearxTestCase): params = fdroid.request(query, dic) self.assertTrue('url' in params) self.assertTrue(query in params['url']) - self.assertTrue('f-droid.org' in params['url']) + self.assertTrue('search.f-droid.org' in params['url']) - def test_response(self): + def test_response_empty(self): resp = mock.Mock(text='') self.assertEqual(fdroid.response(resp), []) + def test_response_oneresult(self): html = """ - -
-
- -
-
-

Details...

-
-

- Sample title -
- Sample content -

-
-
+ + + + test + + + + + """ resp = mock.Mock(text=html) @@ -43,7 +54,7 @@ class TestFdroidEngine(SearxTestCase): self.assertEqual(type(results), list) self.assertEqual(len(results), 1) - self.assertEqual(results[0]['url'], 'https://google.com/qwerty') - self.assertEqual(results[0]['title'], 'Sample title') - self.assertEqual(results[0]['content'], 'Sample content') - self.assertEqual(results[0]['img_src'], 'http://example.com/image.png') + self.assertEqual(results[0]['url'], 'https://example.com/app.url') + self.assertEqual(results[0]['title'], 'App Example 1') + self.assertEqual(results[0]['content'], 'Description App Example 1 - GPL-3.0-only') + self.assertEqual(results[0]['img_src'], 'https://example.com/appexample.logo.png') -- cgit v1.2.3-54-g00ecf From 88261e111ca2186f080c4048ab41b4c54cd5cf87 Mon Sep 17 00:00:00 2001 From: Léo Bourrel Date: Mon, 5 Aug 2019 16:15:40 +0200 Subject: Fix bing engine results count (#1387) This PR fixes the result count from Bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than are available. For example: if there are 100 results for some search and we try to get results 120 to 130, Bing will send back results 0 to 10 and no error. If we compare the result count with the "first" parameter of the request, we can avoid these "invalid" results.
--- searx/engines/bing.py | 37 +++++++++--- tests/unit/engines/test_bing.py | 125 +++++++++++++++++++++++++++++----------- 2 files changed, 119 insertions(+), 43 deletions(-) (limited to 'tests') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index ba22cc6b4..1e614867b 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -13,11 +13,15 @@ @todo publishedDate """ +import re from lxml import html +from searx import logger, utils from searx.engines.xpath import extract_text from searx.url_utils import urlencode from searx.utils import match_language, gen_useragent +logger = logger.getChild('bing engine') + # engine dependent config categories = ['general'] paging = True @@ -30,9 +34,13 @@ base_url = 'https://www.bing.com/' search_string = 'search?{query}&first={offset}' +def _get_offset_from_pageno(pageno): + return (pageno - 1) * 10 + 1 + + # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + 1 + offset = _get_offset_from_pageno(params.get('pageno', 0)) if params['language'] == 'all': lang = 'EN' @@ -53,15 +61,9 @@ def request(query, params): # get response from search-request def response(resp): results = [] + result_len = 0 dom = html.fromstring(resp.text) - - try: - results.append({'number_of_results': int(dom.xpath('//span[@class="sb_count"]/text()')[0] - .split()[0].replace(',', ''))}) - except: - pass - # parse results for result in dom.xpath('//div[@class="sa_cc"]'): link = result.xpath('.//h3/a')[0] @@ -86,7 +88,24 @@ def response(resp): 'title': title, 'content': content}) - # return results + try: + result_len_container = "".join(dom.xpath('//span[@class="sb_count"]/text()')) + result_len_container = utils.to_string(result_len_container) + if "-" in result_len_container: + # Remove the part "from-to" for paginated request ... 
+ result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:] + + result_len_container = re.sub('[^0-9]', '', result_len_container) + if len(result_len_container) > 0: + result_len = int(result_len_container) + except Exception as e: + logger.debug('result error :\n%s', e) + pass + + if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + return [] + + results.append({'number_of_results': result_len}) return results diff --git a/tests/unit/engines/test_bing.py b/tests/unit/engines/test_bing.py index 21191ff25..387034735 100644 --- a/tests/unit/engines/test_bing.py +++ b/tests/unit/engines/test_bing.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from collections import defaultdict import mock from searx.engines import bing @@ -10,7 +11,7 @@ class TestBingEngine(SearxTestCase): bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR'] query = u'test_query' dicto = defaultdict(dict) - dicto['pageno'] = 0 + dicto['pageno'] = 1 dicto['language'] = 'fr-FR' params = bing.request(query.encode('utf-8'), dicto) self.assertTrue('url' in params) @@ -23,70 +24,126 @@ class TestBingEngine(SearxTestCase): self.assertTrue('language' in params['url']) def test_response(self): + dicto = defaultdict(dict) + dicto['pageno'] = 1 + dicto['language'] = 'fr-FR' self.assertRaises(AttributeError, bing.response, None) self.assertRaises(AttributeError, bing.response, []) self.assertRaises(AttributeError, bing.response, '') self.assertRaises(AttributeError, bing.response, '[]') response = mock.Mock(text='') + response.search_params = dicto self.assertEqual(bing.response(response), []) response = mock.Mock(text='') + response.search_params = dicto self.assertEqual(bing.response(response), []) html = """ -
-
- -
this.meta.com - - - - -
-

This should be the content.

+
+
+ 23 900 000 résultats
+
    +
    +
    + +
    this.meta.com + + + + +
    +

    This should be the content.

    +
    +
    +
""" response = mock.Mock(text=html) + response.search_params = dicto results = bing.response(response) self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) + self.assertEqual(len(results), 2) self.assertEqual(results[0]['title'], 'This should be the title') self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/') self.assertEqual(results[0]['content'], 'This should be the content.') + self.assertEqual(results[-1]['number_of_results'], 23900000) html = """ -
  • -
    - -
    this.meta.com - - - - -
    -

    This should be the content.

    +
    +
    + 9-18 résultats sur 23 900 000
    -
  • +
      +
    1. +
      + +
      this.meta.com + + + + +
      +

      This should be the content.

      +
      +
    2. +
    +
    """ + dicto['pageno'] = 2 response = mock.Mock(text=html) + response.search_params = dicto results = bing.response(response) self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) + self.assertEqual(len(results), 2) self.assertEqual(results[0]['title'], 'This should be the title') self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/') self.assertEqual(results[0]['content'], 'This should be the content.') + self.assertEqual(results[-1]['number_of_results'], 23900000) + + html = """ +
    +
    + 23 900 000 résultats +
    +
      +
    1. +
      + +
      this.meta.com + + + + +
      +

      This should be the content.

      +
      +
    2. +
    +
    + """ + dicto['pageno'] = 33900000 + response = mock.Mock(text=html) + response.search_params = dicto + results = bing.response(response) + self.assertEqual(bing.response(response), []) def test_fetch_supported_languages(self): html = """""" -- cgit v1.2.3-54-g00ecf From e74bdf84296a4173dd625e25b699196e87db8ac8 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 14 Oct 2019 15:09:25 +0200 Subject: [fix] engine test --- tests/unit/engines/test_startpage.py | 123 +++++++---------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'tests') diff --git a/tests/unit/engines/test_startpage.py b/tests/unit/engines/test_startpage.py index a4704ce22..ac4454738 100644 --- a/tests/unit/engines/test_startpage.py +++ b/tests/unit/engines/test_startpage.py @@ -18,12 +18,9 @@ class TestStartpageEngine(SearxTestCase): self.assertIn('data', params) self.assertIn('query', params['data']) self.assertIn(query, params['data']['query']) - self.assertIn('with_language', params['data']) - self.assertIn('lang_fr', params['data']['with_language']) dicto['language'] = 'all' params = startpage.request(query, dicto) - self.assertNotIn('with_language', params['data']) def test_response(self): self.assertRaises(AttributeError, startpage.response, None) @@ -35,33 +32,32 @@ class TestStartpageEngine(SearxTestCase): self.assertEqual(startpage.response(response), []) html = """ -
  • -

    - - This should be the title +

    -

    - This should be the content. -

    -

    - www.speedtest.net/fr/ - - - - - Navigation avec Ixquick Proxy - - - - - Mis en surbrillance - -

    -
  • - """ + +

    This should be the content.

    +
    + """ # noqa response = mock.Mock(text=html.encode('utf-8')) results = startpage.response(response) self.assertEqual(type(results), list) @@ -69,72 +65,3 @@ class TestStartpageEngine(SearxTestCase): self.assertEqual(results[0]['title'], 'This should be the title') self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/') self.assertEqual(results[0]['content'], 'This should be the content.') - - html = """ -
  • -

    - - This should be the title - - -

    -

    - This should be the content. -

    -

    - www.speedtest.net/fr/ - - - - - Navigation avec Ixquick Proxy - - - - - Mis en surbrillance - -

    -
  • -
  • -

    - -

    -

    - This should be the content. -

    -

    - www.speedtest.net/fr/ - -

    -
  • -
  • -

    - - This should be the title - - -

    -

    - www.speedtest.net/fr/ - - - - - Navigation avec Ixquick Proxy - - - - - Mis en surbrillance - -

    -
  • - """ - response = mock.Mock(text=html.encode('utf-8')) - results = startpage.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['content'], '') -- cgit v1.2.3-54-g00ecf From 3c425f09c14bbc297346271dac5b30af7a294b9e Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 16 Oct 2019 15:32:21 +0200 Subject: [fix] remove useless engine tests --- tests/unit/engines/test_deviantart.py | 71 ----------------------------- tests/unit/engines/test_digg.py | 85 ----------------------------------- tests/unit/engines/test_www1x.py | 43 ------------------ 3 files changed, 199 deletions(-) (limited to 'tests') diff --git a/tests/unit/engines/test_deviantart.py b/tests/unit/engines/test_deviantart.py index bd2cf182f..a31151037 100644 --- a/tests/unit/engines/test_deviantart.py +++ b/tests/unit/engines/test_deviantart.py @@ -22,74 +22,3 @@ class TestDeviantartEngine(SearxTestCase): dicto['time_range'] = 'year' params = deviantart.request(query, dicto) self.assertEqual({}, params['url']) - - def test_response(self): - self.assertRaises(AttributeError, deviantart.response, None) - self.assertRaises(AttributeError, deviantart.response, []) - self.assertRaises(AttributeError, deviantart.response, '') - self.assertRaises(AttributeError, deviantart.response, '[]') - - response = mock.Mock(text='') - self.assertEqual(deviantart.response(response), []) - - response = mock.Mock(status_code=302) - self.assertEqual(deviantart.response(response), []) - - html = """ -
    - - - - - Title of image -
    - """ - response = mock.Mock(text=html) - results = deviantart.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['title'], 'Title of image') - self.assertEqual(results[0]['url'], 'https://url.of.image') - self.assertNotIn('content', results[0]) - self.assertEqual(results[0]['thumbnail_src'], 'https://url.of.thumbnail') - - html = """ - - - - - - Test - - - - - - - Title of image - - - - 5 years ago - - in Animation - - - - More Like This - - - """ - response = mock.Mock(text=html) - results = deviantart.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 0) diff --git a/tests/unit/engines/test_digg.py b/tests/unit/engines/test_digg.py index 6e7c9cc99..8bc4c67c2 100644 --- a/tests/unit/engines/test_digg.py +++ b/tests/unit/engines/test_digg.py @@ -14,88 +14,3 @@ class TestDiggEngine(SearxTestCase): self.assertIn('url', params) self.assertIn(query, params['url']) self.assertIn('digg.com', params['url']) - - def test_response(self): - self.assertRaises(AttributeError, digg.response, None) - self.assertRaises(AttributeError, digg.response, []) - self.assertRaises(AttributeError, digg.response, '') - self.assertRaises(AttributeError, digg.response, '[]') - - response = mock.Mock(text='{}') - self.assertEqual(digg.response(response), []) - - response = mock.Mock(text='{"data": []}') - self.assertEqual(digg.response(response), []) - - json = """ - { - "status": "ok", - "num": 10, - "next_position": 20, - "html": "" - } - """ - json = json.replace('\r\n', '').replace('\n', '').replace('\r', '') - response = mock.Mock(text=json) - results = digg.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['title'], 'Title of article') - self.assertEqual(results[0]['url'], 'http://url.of.link') - self.assertEqual(results[0]['thumbnail'], 'http://url.of.image.jpeg') - 
self.assertEqual(results[0]['content'], '') - - json = """ - { - "status": "error", - "num": 10, - "next_position": 20 - } - """ - response = mock.Mock(text=json) - results = digg.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 0) diff --git a/tests/unit/engines/test_www1x.py b/tests/unit/engines/test_www1x.py index 9df8de6bf..40f5200fd 100644 --- a/tests/unit/engines/test_www1x.py +++ b/tests/unit/engines/test_www1x.py @@ -12,46 +12,3 @@ class TestWww1xEngine(SearxTestCase): self.assertTrue('url' in params) self.assertTrue(query in params['url']) self.assertTrue('1x.com' in params['url']) - - def test_response(self): - self.assertRaises(AttributeError, www1x.response, None) - self.assertRaises(AttributeError, www1x.response, []) - self.assertRaises(AttributeError, www1x.response, '') - self.assertRaises(AttributeError, www1x.response, '[]') - - response = mock.Mock(text='') - self.assertEqual(www1x.response(response), []) - html = """ - - - - - - - ] - > - - -
    Photos
    - - - - ]]>
    - """ - response = mock.Mock(text=html) - results = www1x.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['url'], 'https://1x.com/photo/123456') - self.assertEqual(results[0]['thumbnail_src'], 'https://1x.com/images/user/testimage-123456.jpg') - self.assertEqual(results[0]['content'], '') - self.assertEqual(results[0]['template'], 'images.html') -- cgit v1.2.3-54-g00ecf From 9299355570e32c4d24d7274d716eca1a93119d13 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 24 Nov 2019 20:21:37 -0700 Subject: add seedpeer again --- searx/engines/seedpeer.py | 78 ++++++++++++++++++++++ searx/settings.yml | 5 ++ .../courgette/result_templates/torrent.html | 2 +- .../templates/legacy/result_templates/torrent.html | 2 +- .../templates/oscar/result_templates/torrent.html | 2 +- .../templates/simple/result_templates/torrent.html | 2 +- tests/unit/engines/test_seedpeer.py | 66 ++++++++++++++++++ 7 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 searx/engines/seedpeer.py create mode 100644 tests/unit/engines/test_seedpeer.py (limited to 'tests') diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py new file mode 100644 index 000000000..f9b1f99c8 --- /dev/null +++ b/searx/engines/seedpeer.py @@ -0,0 +1,78 @@ +# Seedpeer (Videos, Music, Files) +# +# @website https://seedpeer.me +# @provide-api no (nothing found) +# +# @using-api no +# @results HTML (using search portal) +# @stable yes (HTML can change) +# @parse url, title, content, seed, leech, magnetlink + +from lxml import html +from json import loads +from operator import itemgetter +from searx.url_utils import quote, urljoin +from searx.engines.xpath import extract_text + + +url = 'https://seedpeer.me/' +search_url = url + 'search/{search_term}?page={page_no}' +torrent_file_url = url + 'torrent/{torrent_hash}' + +# specific xpath variables +script_xpath = '//script[@type="text/javascript"][not(@src)]' 
+torrent_xpath = '(//table)[2]/tbody/tr' +link_xpath = '(./td)[1]/a/@href' +age_xpath = '(./td)[2]' +size_xpath = '(./td)[3]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), + page_no=params['pageno']) + return params + + +# get response from search-request +def response(resp): + results = [] + dom = html.fromstring(resp.text) + result_rows = dom.xpath(torrent_xpath) + + try: + script_element = dom.xpath(script_xpath)[0] + json_string = script_element.text[script_element.text.find('{'):] + torrents_json = loads(json_string) + except: + return [] + + # parse results + for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']): + title = torrent_json['name'] + seed = int(torrent_json['seeds']) + leech = int(torrent_json['peers']) + size = int(torrent_json['size']) + torrent_hash = torrent_json['hash'] + + torrentfile = torrent_file_url.format(torrent_hash=torrent_hash) + magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash) + + age = extract_text(torrent_row.xpath(age_xpath)) + link = torrent_row.xpath(link_xpath)[0] + + href = urljoin(url, link) + + # append result + results.append({'url': href, + 'title': title, + 'content': age, + 'seed': seed, + 'leech': leech, + 'filesize': size, + 'torrentfile': torrentfile, + 'magnetlink': magnetlink, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/settings.yml b/searx/settings.yml index 835fbe5f6..25d90d4db 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -748,6 +748,11 @@ engines: page_size : 10 disabled : True + - name : seedpeer + shortcut : speu + engine : seedpeer + categories: files, music, videos + # - name : yacy # engine : yacy # shortcut : ya diff --git a/searx/templates/courgette/result_templates/torrent.html b/searx/templates/courgette/result_templates/torrent.html index d659064d9..7f94a221e 100644 --- 
a/searx/templates/courgette/result_templates/torrent.html +++ b/searx/templates/courgette/result_templates/torrent.html @@ -4,7 +4,7 @@ {% endif %}

    {{ result.title|safe }}

    {% if result.content %}{{ result.content|safe }}
    {% endif %} - {% if result.seed %}{{ _('Seeder') }} : {{ result.seed }}, {{ _('Leecher') }} : {{ result.leech }}
    {% endif %} + {% if result.seed is defined %}{{ _('Seeder') }} : {{ result.seed }}, {{ _('Leecher') }} : {{ result.leech }}
    {% endif %} {% if result.magnetlink %}{{ _('magnet link') }}{% endif %} {% if result.torrentfile %}{{ _('torrent file') }}{% endif %} diff --git a/searx/templates/legacy/result_templates/torrent.html b/searx/templates/legacy/result_templates/torrent.html index 7a8ac33de..068e05373 100644 --- a/searx/templates/legacy/result_templates/torrent.html +++ b/searx/templates/legacy/result_templates/torrent.html @@ -8,6 +8,6 @@

    {% if result.magnetlink %}{{ _('magnet link') }}{% endif %} {% if result.torrentfile %}{{ _('torrent file') }}{% endif %} - - {% if result.seed %}{{ _('Seeder') }} : {{ result.seed }}, {{ _('Leecher') }} : {{ result.leech }}{% endif %} + {% if result.seed is defined %}{{ _('Seeder') }} : {{ result.seed }}, {{ _('Leecher') }} : {{ result.leech }}{% endif %}

    diff --git a/searx/templates/oscar/result_templates/torrent.html b/searx/templates/oscar/result_templates/torrent.html index f5ea415e2..089367e36 100644 --- a/searx/templates/oscar/result_templates/torrent.html +++ b/searx/templates/oscar/result_templates/torrent.html @@ -3,7 +3,7 @@ {{ result_header(result, favicons) }} {{ result_sub_header(result) }} -{% if result.seed %}

    {{ icon('transfer') }} {{ _('Seeder') }} {{ result.seed }} • {{ _('Leecher') }} {{ result.leech }}{% endif %} +{% if result.seed is defined %}

    {{ icon('transfer') }} {{ _('Seeder') }} {{ result.seed }} • {{ _('Leecher') }} {{ result.leech }}{% endif %} {% if result.filesize %}
    {{ icon('floppy-disk') }} {{ _('Filesize') }} {% if result.filesize < 1024 %}{{ result.filesize }} {{ _('Bytes') }} diff --git a/searx/templates/simple/result_templates/torrent.html b/searx/templates/simple/result_templates/torrent.html index 3c7fd15e8..71c775bc9 100644 --- a/searx/templates/simple/result_templates/torrent.html +++ b/searx/templates/simple/result_templates/torrent.html @@ -6,7 +6,7 @@ {% if result.magnetlink %}

    {% endif %} {% if result.torrentfile %}{% endif %} -{% if result.seed %}

    • {{ icon('arrow-swap') }} {{ _('Seeder') }} {{ result.seed }} • {{ _('Leecher') }} {{ result.leech }}

    {% endif %} +{% if result.seed is defined %}

    • {{ icon('arrow-swap') }} {{ _('Seeder') }} {{ result.seed }} • {{ _('Leecher') }} {{ result.leech }}

    {% endif %} {%- if result.filesize %}

    {{ icon('floppy-disk') }} {{ _('Filesize') }} {%- if result.filesize < 1024 %}{{ result.filesize }} {{ _('Bytes') }} diff --git a/tests/unit/engines/test_seedpeer.py b/tests/unit/engines/test_seedpeer.py new file mode 100644 index 000000000..2057c1cb1 --- /dev/null +++ b/tests/unit/engines/test_seedpeer.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import seedpeer +from searx.testing import SearxTestCase + + +class TestBtdiggEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 1 + params = seedpeer.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('seedpeer', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, seedpeer.response, None) + self.assertRaises(AttributeError, seedpeer.response, []) + self.assertRaises(AttributeError, seedpeer.response, '') + self.assertRaises(AttributeError, seedpeer.response, '[]') + + response = mock.Mock(text='') + self.assertEqual(seedpeer.response(response), []) + + html = u""" + + + + + + + +
    + + + + + + + + + + + + +
    Title1 year1 KB1020
    + + + """ + response = mock.Mock(text=html) + results = seedpeer.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['title'], 'Title') + self.assertEqual(results[0]['url'], 'https://seedpeer.me/link') + self.assertEqual(results[0]['seed'], 10) + self.assertEqual(results[0]['leech'], 20) + self.assertEqual(results[0]['filesize'], 1024) + self.assertEqual(results[0]['torrentfile'], 'https://seedpeer.me/torrent/abc123') + self.assertEqual(results[0]['magnetlink'], 'magnet:?xt=urn:btih:abc123') -- cgit v1.2.3-54-g00ecf From ccaf6ca02c5bdc63f78e01a66429afaa5fb3cb68 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Wed, 26 Jun 2019 00:45:20 -0500 Subject: [fix] update xpaths for new google results page --- searx/engines/google.py | 21 +++---- tests/unit/engines/test_google.py | 117 ++++++++++++-------------------------- 2 files changed, 44 insertions(+), 94 deletions(-) (limited to 'tests') diff --git a/searx/engines/google.py b/searx/engines/google.py index 19bde710d..eed3a044e 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -107,13 +107,12 @@ images_path = '/images' supported_languages_url = 'https://www.google.com/preferences?#languages' # specific xpath variables -results_xpath = '//div[@class="g"]' -url_xpath = './/h3/a/@href' -title_xpath = './/h3' -content_xpath = './/span[@class="st"]' -content_misc_xpath = './/div[@class="f slp"]' -suggestion_xpath = '//p[@class="_Bmc"]' -spelling_suggestion_xpath = '//a[@class="spell"]' +results_xpath = '//div[contains(@class, "ZINbbc")]' +url_xpath = './/div[@class="kCrYT"][1]/a/@href' +title_xpath = './/div[@class="kCrYT"][1]/a/div[1]' +content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]' +suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]' +spelling_suggestion_xpath = '//div[@id="scc"]//a' # map : 
detail location map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' @@ -199,10 +198,6 @@ def request(query, params): params['headers']['Accept-Language'] = language + ',' + language + '-' + country params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - # Force Safari 3.1 on Mac OS X (Leopard) user agent to avoid loading the new UI that Searx can't parse - params['headers']['User-Agent'] = ("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_4)" - "AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1") - params['google_hostname'] = google_hostname return params @@ -274,9 +269,7 @@ def response(resp): content = extract_text_from_dom(result, content_xpath) if content is None: continue - content_misc = extract_text_from_dom(result, content_misc_xpath) - if content_misc is not None: - content = content_misc + "
    " + content + # append result results.append({'url': url, 'title': title, diff --git a/tests/unit/engines/test_google.py b/tests/unit/engines/test_google.py index a73e9d2be..9d0edd439 100644 --- a/tests/unit/engines/test_google.py +++ b/tests/unit/engines/test_google.py @@ -58,93 +58,50 @@ class TestGoogleEngine(SearxTestCase): self.assertEqual(google.response(response), []) html = """ -

    -

    - - This is the title - -

    -
    -
    - - test.psychologies.com/ - -
    ‎ -