| author | Adam Tauber <asciimoo@gmail.com> | 2016-12-10 00:06:39 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-12-10 00:06:39 +0100 |
| commit | ceb8ae6439f092dad6e26406cd178c2195eb0898 (patch) | |
| tree | 8b285b2dd483006d08c03b9fec49cba49ff16a87 /searx/search.py | |
| parent | d80fb2c8e8995facb3a25c152c47a93eecf1fee4 (diff) | |
| parent | e48f07a367e55bf8aa881902b977bd7ce1cd2bb6 (diff) | |
| download | searxng-ceb8ae6439f092dad6e26406cd178c2195eb0898.tar.gz, searxng-ceb8ae6439f092dad6e26406cd178c2195eb0898.zip | |
Merge pull request #751 from dalf/searchpy2
Simplify search.py
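The change replaces the response-hook callback (`make_callback`) with three plainly scoped functions: `send_http_request` performs the HTTP call, `search_one_request` runs a single engine end to end (build parameters, request, parse, store), and `search_multiple_requests` fans that work out to one thread per engine under a shared timeout. A minimal runnable sketch of the resulting threading pattern, with `EngineStub` as a hypothetical stand-in for a searx engine module:

```python
import threading
import time


class EngineStub:
    """Hypothetical stand-in for a searx engine module."""
    def __init__(self, name, delay):
        self.name, self.delay = name, delay

    def search(self, query):
        time.sleep(self.delay)                    # pretend HTTP request + parsing
        return ['{0} result for {1}'.format(self.name, query)]


def search_one_request(engine, query, results, lock):
    # one engine, end to end, inside its own worker thread
    found = engine.search(query)
    with lock:                                    # shared container needs a lock
        results.extend(found)


def search_multiple_requests(engines, query, timeout_limit):
    start_time = time.time()
    results, lock, threads = [], threading.RLock(), []
    for engine in engines:
        th = threading.Thread(target=search_one_request,
                              args=(engine, query, results, lock))
        th.start()
        threads.append(th)
    for th in threads:
        # each join gets only what is left of the single global budget
        remaining = max(0.0, timeout_limit - (time.time() - start_time))
        th.join(remaining)
    return results


print(search_multiple_requests([EngineStub('a', 0.1), EngineStub('b', 0.2)],
                               'test', timeout_limit=1.0))
```

The important property, preserved from the old `threaded_requests`, is that every `join()` draws from the same global budget, so the whole search returns within roughly `timeout_limit` no matter how many engines are slow.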
Diffstat (limited to 'searx/search.py')
| -rw-r--r-- | searx/search.py | 210 |
1 file changed, 112 insertions, 98 deletions
```diff
diff --git a/searx/search.py b/searx/search.py
index 0095de821..7dfcd4eeb 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -36,14 +36,53 @@ logger = logger.getChild('search')
 
 number_of_searches = 0
 
-def search_request_wrapper(fn, url, engine_name, **kwargs):
-    ret = None
-    engine = engines[engine_name]
+def send_http_request(engine, request_params, timeout_limit):
+    response = None
     try:
-        ret = fn(url, **kwargs)
+        # create dictionary which contain all
+        # informations about the request
+        request_args = dict(
+            headers=request_params['headers'],
+            cookies=request_params['cookies'],
+            timeout=timeout_limit,
+            verify=request_params['verify']
+        )
+        # specific type of request (GET or POST)
+        if request_params['method'] == 'GET':
+            req = requests_lib.get
+        else:
+            req = requests_lib.post
+            request_args['data'] = request_params['data']
+
+        # for page_load_time stats
+        time_before_request = time()
+
+        # send the request
+        response = req(request_params['url'], **request_args)
+
         with threading.RLock():
+            # no error : reset the suspend variables
             engine.continuous_errors = 0
             engine.suspend_end_time = 0
+            # update stats with current page-load-time
+            # only the HTTP request
+            engine.stats['page_load_time'] += time() - time_before_request
+            engine.stats['page_load_count'] += 1
+
+        # is there a timeout (no parsing in this case)
+        timeout_overhead = 0.2  # seconds
+        search_duration = time() - request_params['started']
+        if search_duration > timeout_limit + timeout_overhead:
+            logger.exception('engine timeout on HTTP request:'
+                             '{0} (search duration : {1} ms, time-out: {2} )'
+                             .format(engine.name, search_duration, timeout_limit))
+            with threading.RLock():
+                engine.stats['errors'] += 1
+            return False
+
+        # everything is ok : return the response
+        return response
+
     except:
         # increase errors stats
         with threading.RLock():
@@ -52,20 +91,62 @@ def search_request_wrapper(fn, url, engine_name, **kwargs):
             engine.suspend_end_time = time() + min(60, engine.continuous_errors)
 
         # print engine name and specific error message
-        logger.exception('engine crash: {0}'.format(engine_name))
-    return ret
+        logger.exception('engine crash: {0}'.format(engine.name))
+        return False
+
+
+def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
+    engine = engines[engine_name]
+
+    # update request parameters dependent on
+    # search-engine (contained in engines folder)
+    engine.request(query, request_params)
+
+    # TODO add support of offline engines
+    if request_params['url'] is None:
+        return False
+
+    # ignoring empty urls
+    if not request_params['url']:
+        return False
+
+    # send request
+    response = send_http_request(engine, request_params, timeout_limit)
+
+    # parse response
+    success = None
+    if response:
+        # parse the response
+        response.search_params = request_params
+        search_results = engine.response(response)
+
+        # add results
+        for result in search_results:
+            result['engine'] = engine.name
+
+        result_container.extend(engine.name, search_results)
+
+        success = True
+    else:
+        success = False
+
+    with threading.RLock():
+        # update stats : total time
+        engine.stats['engine_time'] += time() - request_params['started']
+        engine.stats['engine_time_count'] += 1
+    #
+    return success
 
 
-def threaded_requests(requests):
-    timeout_limit = max(r[2]['timeout'] for r in requests)
-    search_start = time()
+def search_multiple_requests(requests, result_container, timeout_limit):
+    start_time = time()
     search_id = uuid4().__str__()
-    for fn, url, request_args, engine_name in requests:
-        request_args['timeout'] = timeout_limit
+
+    for engine_name, query, request_params in requests:
         th = threading.Thread(
-            target=search_request_wrapper,
-            args=(fn, url, engine_name),
-            kwargs=request_args,
+            target=search_one_request,
+            args=(engine_name, query, request_params, result_container, timeout_limit),
             name=search_id,
         )
         th._engine_name = engine_name
@@ -73,7 +154,7 @@
 
     for th in threading.enumerate():
         if th.name == search_id:
-            remaining_time = max(0.0, timeout_limit - (time() - search_start))
+            remaining_time = max(0.0, timeout_limit - (time() - start_time))
             th.join(remaining_time)
             if th.isAlive():
                 logger.warning('engine timeout: {0}'.format(th._engine_name))
@@ -91,44 +172,6 @@ def default_request_params():
     }
 
 
-# create a callback wrapper for the search engine results
-def make_callback(engine_name, callback, params, result_container):
-
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        # check if redirect comparing to the True value,
-        # because resp can be a Mock object, and any attribut name returns something.
-        if response.is_redirect is True:
-            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
-            return
-
-        response.search_params = params
-
-        search_duration = time() - params['started']
-        # update stats with current page-load-time
-        with threading.RLock():
-            engines[engine_name].stats['page_load_time'] += search_duration
-
-        timeout_overhead = 0.2  # seconds
-        timeout_limit = engines[engine_name].timeout + timeout_overhead
-
-        if search_duration > timeout_limit:
-            with threading.RLock():
-                engines[engine_name].stats['errors'] += 1
-            return
-
-        # callback
-        search_results = callback(response)
-
-        # add results
-        for result in search_results:
-            result['engine'] = engine_name
-
-        result_container.extend(engine_name, search_results)
-
-    return process_callback
-
-
 def get_search_query_from_webapp(preferences, form):
     query = None
     query_engines = []
@@ -255,6 +298,10 @@ class Search(object):
     def search(self):
         global number_of_searches
 
+        # start time
+        start_time = time()
+
+        # answeres ?
         answerers_results = ask(self.search_query)
 
         if answerers_results:
@@ -274,6 +321,9 @@ class Search(object):
 
         search_query = self.search_query
 
+        # max of all selected engine timeout
+        timeout_limit = 0
+
         # start search-reqest for all selected engines
         for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
@@ -303,7 +353,7 @@ class Search(object):
             request_params = default_request_params()
             request_params['headers']['User-Agent'] = user_agent
             request_params['category'] = selected_engine['category']
-            request_params['started'] = time()
+            request_params['started'] = start_time
             request_params['pageno'] = search_query.pageno
 
             if hasattr(engine, 'language') and engine.language:
@@ -315,52 +365,16 @@ class Search(object):
             request_params['safesearch'] = search_query.safesearch
             request_params['time_range'] = search_query.time_range
 
-            # update request parameters dependent on
-            # search-engine (contained in engines folder)
-            engine.request(search_query.query.encode('utf-8'), request_params)
-
-            if request_params['url'] is None:
-                # TODO add support of offline engines
-                pass
-
-            # create a callback wrapper for the search engine results
-            callback = make_callback(
-                selected_engine['name'],
-                engine.response,
-                request_params,
-                self.result_container)
-
-            # create dictionary which contain all
-            # informations about the request
-            request_args = dict(
-                headers=request_params['headers'],
-                hooks=dict(response=callback),
-                cookies=request_params['cookies'],
-                timeout=engine.timeout,
-                verify=request_params['verify']
-            )
-
-            # specific type of request (GET or POST)
-            if request_params['method'] == 'GET':
-                req = requests_lib.get
-            else:
-                req = requests_lib.post
-                request_args['data'] = request_params['data']
-
-            # ignoring empty urls
-            if not request_params['url']:
-                continue
-
             # append request to list
-            requests.append((req, request_params['url'],
-                             request_args,
-                             selected_engine['name']))
+            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))
 
-        if not requests:
-            return self.result_container
-        # send all search-request
-        threaded_requests(requests)
-        start_new_thread(gc.collect, tuple())
+            # update timeout_limit
+            timeout_limit = max(timeout_limit, engine.timeout)
+
+        if requests:
+            # send all search-request
+            search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
+            start_new_thread(gc.collect, tuple())
 
         # return results, suggestions, answers and infoboxes
         return self.result_container
```
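One subtlety worth noting in the last hunks: `timeout_limit` is now the maximum of the selected engines' timeouts, and the time already spent building request parameters is deducted before the worker threads start (`timeout_limit - (time() - start_time)`), so slow setup no longer extends the overall deadline. A toy illustration of that budget arithmetic, with made-up numbers:

```python
from time import sleep, time

engine_timeouts = [2.0, 5.0, 3.0]   # hypothetical per-engine timeouts (seconds)
start_time = time()

# max of all selected engine timeouts, as in Search.search()
timeout_limit = 0
for engine_timeout in engine_timeouts:
    timeout_limit = max(timeout_limit, engine_timeout)   # -> 5.0

sleep(0.3)   # stand-in for the time spent building request_params

# the worker threads share only what is left of the budget (~4.7 s)
remaining_budget = timeout_limit - (time() - start_time)
print('global budget: {0:.1f}s'.format(remaining_budget))
```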