author    Adam Tauber <asciimoo@gmail.com>    2016-12-10 00:06:39 +0100
committer GitHub <noreply@github.com>         2016-12-10 00:06:39 +0100
commit    ceb8ae6439f092dad6e26406cd178c2195eb0898 (patch)
tree      8b285b2dd483006d08c03b9fec49cba49ff16a87 /searx/search.py
parent    d80fb2c8e8995facb3a25c152c47a93eecf1fee4 (diff)
parent    e48f07a367e55bf8aa881902b977bd7ce1cd2bb6 (diff)
Merge pull request #751 from dalf/searchpy2
Simplify search.py
Diffstat (limited to 'searx/search.py')
-rw-r--r--    searx/search.py    210
1 file changed, 112 insertions(+), 98 deletions(-)
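
The merge replaces the callback-based request machinery (make_callback wiring engine.response into a requests hook, plus search_request_wrapper and threaded_requests) with three straight-line functions: send_http_request() performs the HTTP call and the per-engine stats bookkeeping, search_one_request() builds the engine's request, sends it and feeds the parsed results into the result container, and search_multiple_requests() runs one thread per engine against a single shared timeout budget. Below is a minimal, self-contained sketch of that thread-per-engine pattern; fetch(), run_all() and the plain results list are simplified placeholders for illustration, not the actual searx functions, which pass request_params and a ResultContainer instead.

# Minimal sketch of the thread-per-engine pattern introduced here.
# fetch() stands in for search_one_request(); names and signatures are
# simplified placeholders, not the actual searx APIs.
import threading
import time
import uuid


def fetch(engine_name, results, lock):
    # pretend this is the HTTP request plus response parsing
    time.sleep(0.1)
    with lock:
        results.append((engine_name, 'parsed results'))


def run_all(engine_names, timeout_limit):
    start_time = time.time()
    search_id = str(uuid.uuid4())
    results, lock = [], threading.Lock()

    # one worker thread per engine, all tagged with the same search id
    for name in engine_names:
        threading.Thread(target=fetch, args=(name, results, lock),
                         name=search_id).start()

    # join against a single shared deadline: each join only waits for
    # whatever is left of the overall budget, not a full timeout each
    for th in threading.enumerate():
        if th.name == search_id:
            remaining = max(0.0, timeout_limit - (time.time() - start_time))
            th.join(remaining)

    return results


if __name__ == '__main__':
    print(run_all(['engine_a', 'engine_b'], timeout_limit=2.0))

In the diff itself, the budget passed to search_multiple_requests() is the maximum timeout of the selected engines minus the time already spent building the requests, which is why Search.search() records start_time before the engine loop.
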
diff --git a/searx/search.py b/searx/search.py
index 0095de821..7dfcd4eeb 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -36,14 +36,53 @@ logger = logger.getChild('search')
number_of_searches = 0
-def search_request_wrapper(fn, url, engine_name, **kwargs):
- ret = None
- engine = engines[engine_name]
+def send_http_request(engine, request_params, timeout_limit):
+ response = None
try:
- ret = fn(url, **kwargs)
+ # create a dictionary which contains all
+ # information about the request
+ request_args = dict(
+ headers=request_params['headers'],
+ cookies=request_params['cookies'],
+ timeout=timeout_limit,
+ verify=request_params['verify']
+ )
+ # specific type of request (GET or POST)
+ if request_params['method'] == 'GET':
+ req = requests_lib.get
+ else:
+ req = requests_lib.post
+ request_args['data'] = request_params['data']
+
+ # for page_load_time stats
+ time_before_request = time()
+
+ # send the request
+ response = req(request_params['url'], **request_args)
+
with threading.RLock():
+ # no error : reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
+ # update stats with current page-load-time
+ # only the HTTP request
+ engine.stats['page_load_time'] += time() - time_before_request
+ engine.stats['page_load_count'] += 1
+
+ # was there a timeout? (if so, skip parsing)
+ timeout_overhead = 0.2 # seconds
+ search_duration = time() - request_params['started']
+ if search_duration > timeout_limit + timeout_overhead:
+ logger.exception('engine timeout on HTTP request: '
+ '{0} (search duration: {1} s, timeout: {2} s)'
+ .format(engine.name, search_duration, timeout_limit))
+ with threading.RLock():
+ engine.stats['errors'] += 1
+ return False
+
+ # everything is ok : return the response
+ return response
+
except:
# increase errors stats
with threading.RLock():
@@ -52,20 +91,62 @@ def search_request_wrapper(fn, url, engine_name, **kwargs):
engine.suspend_end_time = time() + min(60, engine.continuous_errors)
# print engine name and specific error message
- logger.exception('engine crash: {0}'.format(engine_name))
- return ret
+ logger.exception('engine crash: {0}'.format(engine.name))
+ return False
+
+
+def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
+ engine = engines[engine_name]
+
+ # update request parameters dependent on
+ # search-engine (contained in engines folder)
+ engine.request(query, request_params)
+
+ # TODO add support of offline engines
+ if request_params['url'] is None:
+ return False
+
+ # ignoring empty urls
+ if not request_params['url']:
+ return False
+
+ # send request
+ response = send_http_request(engine, request_params, timeout_limit)
+
+ # parse response
+ success = None
+ if response:
+ # parse the response
+ response.search_params = request_params
+ search_results = engine.response(response)
+
+ # add results
+ for result in search_results:
+ result['engine'] = engine.name
+
+ result_container.extend(engine.name, search_results)
+
+ success = True
+ else:
+ success = False
+
+ with threading.RLock():
+ # update stats : total time
+ engine.stats['engine_time'] += time() - request_params['started']
+ engine.stats['engine_time_count'] += 1
+ #
+ return success
-def threaded_requests(requests):
- timeout_limit = max(r[2]['timeout'] for r in requests)
- search_start = time()
+
+def search_multiple_requests(requests, result_container, timeout_limit):
+ start_time = time()
search_id = uuid4().__str__()
- for fn, url, request_args, engine_name in requests:
- request_args['timeout'] = timeout_limit
+
+ for engine_name, query, request_params in requests:
th = threading.Thread(
- target=search_request_wrapper,
- args=(fn, url, engine_name),
- kwargs=request_args,
+ target=search_one_request,
+ args=(engine_name, query, request_params, result_container, timeout_limit),
name=search_id,
)
th._engine_name = engine_name
@@ -73,7 +154,7 @@ def threaded_requests(requests):
for th in threading.enumerate():
if th.name == search_id:
- remaining_time = max(0.0, timeout_limit - (time() - search_start))
+ remaining_time = max(0.0, timeout_limit - (time() - start_time))
th.join(remaining_time)
if th.isAlive():
logger.warning('engine timeout: {0}'.format(th._engine_name))
@@ -91,44 +172,6 @@ def default_request_params():
}
-# create a callback wrapper for the search engine results
-def make_callback(engine_name, callback, params, result_container):
-
- # creating a callback wrapper for the search engine results
- def process_callback(response, **kwargs):
- # check if redirect comparing to the True value,
- # because resp can be a Mock object, and any attribut name returns something.
- if response.is_redirect is True:
- logger.debug('{0} redirect on: {1}'.format(engine_name, response))
- return
-
- response.search_params = params
-
- search_duration = time() - params['started']
- # update stats with current page-load-time
- with threading.RLock():
- engines[engine_name].stats['page_load_time'] += search_duration
-
- timeout_overhead = 0.2 # seconds
- timeout_limit = engines[engine_name].timeout + timeout_overhead
-
- if search_duration > timeout_limit:
- with threading.RLock():
- engines[engine_name].stats['errors'] += 1
- return
-
- # callback
- search_results = callback(response)
-
- # add results
- for result in search_results:
- result['engine'] = engine_name
-
- result_container.extend(engine_name, search_results)
-
- return process_callback
-
-
def get_search_query_from_webapp(preferences, form):
query = None
query_engines = []
@@ -255,6 +298,10 @@ class Search(object):
def search(self):
global number_of_searches
+ # start time
+ start_time = time()
+
+ # answers?
answerers_results = ask(self.search_query)
if answerers_results:
@@ -274,6 +321,9 @@ class Search(object):
search_query = self.search_query
+ # max of all selected engine timeout
+ timeout_limit = 0
+
# start search-request for all selected engines
for selected_engine in search_query.engines:
if selected_engine['name'] not in engines:
@@ -303,7 +353,7 @@ class Search(object):
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
- request_params['started'] = time()
+ request_params['started'] = start_time
request_params['pageno'] = search_query.pageno
if hasattr(engine, 'language') and engine.language:
@@ -315,52 +365,16 @@ class Search(object):
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
- # update request parameters dependent on
- # search-engine (contained in engines folder)
- engine.request(search_query.query.encode('utf-8'), request_params)
-
- if request_params['url'] is None:
- # TODO add support of offline engines
- pass
-
- # create a callback wrapper for the search engine results
- callback = make_callback(
- selected_engine['name'],
- engine.response,
- request_params,
- self.result_container)
-
- # create dictionary which contain all
- # informations about the request
- request_args = dict(
- headers=request_params['headers'],
- hooks=dict(response=callback),
- cookies=request_params['cookies'],
- timeout=engine.timeout,
- verify=request_params['verify']
- )
-
- # specific type of request (GET or POST)
- if request_params['method'] == 'GET':
- req = requests_lib.get
- else:
- req = requests_lib.post
- request_args['data'] = request_params['data']
-
- # ignoring empty urls
- if not request_params['url']:
- continue
-
# append request to list
- requests.append((req, request_params['url'],
- request_args,
- selected_engine['name']))
+ requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))
- if not requests:
- return self.result_container
- # send all search-request
- threaded_requests(requests)
- start_new_thread(gc.collect, tuple())
+ # update timeout_limit
+ timeout_limit = max(timeout_limit, engine.timeout)
+
+ if requests:
+ # send all search-request
+ search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
+ start_new_thread(gc.collect, tuple())
# return results, suggestions, answers and infoboxes
return self.result_container
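
The except branch of send_http_request keeps the suspension logic of the old wrapper: each consecutive failure increments engine.continuous_errors and pushes engine.suspend_end_time forward by that many seconds, capped at 60, while a successful response resets both fields. A toy illustration of that backoff follows; ToyEngine, record_error, record_success and is_suspended are made-up names used only for this sketch, not searx code.

# Toy illustration of the error backoff kept by this refactor.
import time


class ToyEngine(object):
    def __init__(self):
        self.continuous_errors = 0
        self.suspend_end_time = 0


def record_error(engine):
    # one more second of suspension per consecutive error, capped at 60 s
    engine.continuous_errors += 1
    engine.suspend_end_time = time.time() + min(60, engine.continuous_errors)


def record_success(engine):
    # a good response clears the backoff, as in send_http_request()
    engine.continuous_errors = 0
    engine.suspend_end_time = 0


def is_suspended(engine):
    return time.time() < engine.suspend_end_time


engine = ToyEngine()
for _ in range(3):
    record_error(engine)
print(engine.continuous_errors, is_suspended(engine))  # 3 True
record_success(engine)
print(is_suspended(engine))  # False
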