diff options
-rw-r--r-- | searx/__init__.py | 3 | ||||
-rw-r--r-- | searx/engines/__init__.py | 11 | ||||
-rw-r--r-- | searx/query.py | 43 | ||||
-rw-r--r-- | searx/search.py | 104 | ||||
-rw-r--r-- | searx/utils.py | 9 | ||||
-rw-r--r-- | searx/webapp.py | 31 |
6 files changed, 121 insertions, 80 deletions
diff --git a/searx/__init__.py b/searx/__init__.py index c4c363bf8..46685817a 100644 --- a/searx/__init__.py +++ b/searx/__init__.py @@ -28,7 +28,8 @@ except: searx_dir = abspath(dirname(__file__)) engine_dir = dirname(realpath(__file__)) -# if possible set path to settings using the enviroment variable SEARX_SETTINGS_PATH +# if possible set path to settings using the +# enviroment variable SEARX_SETTINGS_PATH if 'SEARX_SETTINGS_PATH' in environ: settings_path = environ['SEARX_SETTINGS_PATH'] # otherwise using default path diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index e63dd7189..80356a8cd 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -41,7 +41,7 @@ def load_module(filename): module.name = modname return module -if not 'engines' in settings or not settings['engines']: +if 'engines' not in settings or not settings['engines']: print '[E] Error no engines found. Edit your settings.yml' exit(2) @@ -68,15 +68,15 @@ for engine_data in settings['engines']: engine.categories = ['general'] if not hasattr(engine, 'language_support'): - #engine.language_support = False + # engine.language_support = False engine.language_support = True if not hasattr(engine, 'timeout'): - #engine.language_support = False + # engine.language_support = False engine.timeout = settings['server']['request_timeout'] if not hasattr(engine, 'shortcut'): - #engine.shortcut = ''' + # engine.shortcut = ''' engine.shortcut = '' # checking required variables @@ -161,7 +161,8 @@ def get_engines_stats(): for engine in scores_per_result: if max_score_per_result: - engine['percentage'] = int(engine['avg'] / max_score_per_result * 100) + engine['percentage'] = int(engine['avg'] + / max_score_per_result * 100) else: engine['percentage'] = 0 diff --git a/searx/query.py b/searx/query.py index 612d46f4b..9f711e982 100644 --- a/searx/query.py +++ b/searx/query.py @@ -31,30 +31,31 @@ class Query(object): def __init__(self, query, blocked_engines): self.query = query self.blocked_engines = [] - + if blocked_engines: self.blocked_engines = blocked_engines - + self.query_parts = [] self.engines = [] self.languages = [] - - # parse query, if tags are set, which change the serch engine or search-language + + # parse query, if tags are set, which + # change the serch engine or search-language def parse_query(self): self.query_parts = [] - + # split query, including whitespaces raw_query_parts = re.split(r'(\s+)', self.query) - + parse_next = True - + for query_part in raw_query_parts: if not parse_next: self.query_parts[-1] += query_part continue - + parse_next = False - + # part does only contain spaces, skip if query_part.isspace()\ or query_part == '': @@ -62,15 +63,17 @@ class Query(object): self.query_parts.append(query_part) continue - # this force a language + # this force a language if query_part[0] == ':': lang = query_part[1:].lower() - # check if any language-code is equal with declared language-codes + # check if any language-code is equal with + # declared language-codes for lc in language_codes: lang_id, lang_name, country = map(str.lower, lc) - # if correct language-code is found, set it as new search-language + # if correct language-code is found + # set it as new search-language if lang == lang_id\ or lang_id.startswith(lang)\ or lang == lang_name\ @@ -89,23 +92,24 @@ class Query(object): parse_next = True self.engines.append({'category': 'none', 'name': engine_shortcuts[prefix]}) - + # check if prefix is equal with engine name elif prefix in engines\ - and not prefix in self.blocked_engines: + and prefix not in self.blocked_engines: parse_next = True self.engines.append({'category': 'none', 'name': prefix}) # check if prefix is equal with categorie name elif prefix in categories: - # using all engines for that search, which are declared under that categorie name + # using all engines for that search, which + # are declared under that categorie name parse_next = True self.engines.extend({'category': prefix, 'name': engine.name} for engine in categories[prefix] - if not engine in self.blocked_engines) - + if engine not in self.blocked_engines) + # append query part to query_part list self.query_parts.append(query_part) @@ -114,14 +118,13 @@ class Query(object): self.query_parts[-1] = search_query else: self.query_parts.append(search_query) - + def getSearchQuery(self): if len(self.query_parts): return self.query_parts[-1] else: return '' - + def getFullQuery(self): # get full querry including whitespaces return string.join(self.query_parts, '') - diff --git a/searx/search.py b/searx/search.py index 064c68844..f051d6df2 100644 --- a/searx/search.py +++ b/searx/search.py @@ -22,7 +22,7 @@ from datetime import datetime from operator import itemgetter from urlparse import urlparse, unquote from searx.engines import ( - categories, engines, engine_shortcuts + categories, engines ) from searx.languages import language_codes from searx.utils import gen_useragent @@ -39,7 +39,13 @@ def default_request_params(): # create a callback wrapper for the search engine results -def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params): +def make_callback(engine_name, + results, + suggestions, + answers, + infoboxes, + callback, + params): # creating a callback wrapper for the search engine results def process_callback(response, **kwargs): @@ -95,7 +101,7 @@ def make_callback(engine_name, results, suggestions, answers, infoboxes, callbac def content_result_len(content): if isinstance(content, basestring): content = re.sub('[,;:!?\./\\\\ ()-_]', '', content) - return len(content) + return len(content) else: return 0 @@ -126,7 +132,8 @@ def score_results(results): # strip multiple spaces and cariage returns from content if 'content' in res: - res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', '')) + res['content'] = re.sub(' +', ' ', + res['content'].strip().replace('\n', '')) # get weight of this engine if possible if hasattr(engines[res['engine']], 'weight'): @@ -139,8 +146,12 @@ def score_results(results): duplicated = False for new_res in results: # remove / from the end of the url if required - p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa - p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa + p1 = res['parsed_url'].path[:-1]\ + if res['parsed_url'].path.endswith('/')\ + else res['parsed_url'].path + p2 = new_res['parsed_url'].path[:-1]\ + if new_res['parsed_url'].path.endswith('/')\ + else new_res['parsed_url'].path # check if that result is a duplicate if res['host'] == new_res['host'] and\ @@ -153,7 +164,8 @@ def score_results(results): # merge duplicates together if duplicated: # using content with more text - if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')): + if content_result_len(res.get('content', '')) >\ + content_result_len(duplicated.get('content', '')): duplicated['content'] = res['content'] # increase result-score @@ -182,17 +194,25 @@ def score_results(results): for i, res in enumerate(results): # FIXME : handle more than one category per engine - category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template'] - - current = None if category not in categoryPositions else categoryPositions[category] - - # group with previous results using the same category if the group can accept more result and is not too far from the current position - if current != None and (current['count'] > 0) and (len(gresults) - current['index'] < 20): - # group with the previous results using the same category with this one + category = engines[res['engine']].categories[0] + ':' + ''\ + if 'template' not in res\ + else res['template'] + + current = None if category not in categoryPositions\ + else categoryPositions[category] + + # group with previous results using the same category + # if the group can accept more result and is not too far + # from the current position + if current is not None and (current['count'] > 0)\ + and (len(gresults) - current['index'] < 20): + # group with the previous results using + # the same category with this one index = current['index'] gresults.insert(index, res) - # update every index after the current one (including the current one) + # update every index after the current one + # (including the current one) for k in categoryPositions: v = categoryPositions[k]['index'] if v >= index: @@ -206,7 +226,7 @@ def score_results(results): gresults.append(res) # update categoryIndex - categoryPositions[category] = { 'index' : len(gresults), 'count' : 8 } + categoryPositions[category] = {'index': len(gresults), 'count': 8} # return gresults return gresults @@ -215,21 +235,21 @@ def score_results(results): def merge_two_infoboxes(infobox1, infobox2): if 'urls' in infobox2: urls1 = infobox1.get('urls', None) - if urls1 == None: + if urls1 is None: urls1 = [] infobox1.set('urls', urls1) urlSet = set() for url in infobox1.get('urls', []): urlSet.add(url.get('url', None)) - + for url in infobox2.get('urls', []): if url.get('url', None) not in urlSet: urls1.append(url) if 'attributes' in infobox2: attributes1 = infobox1.get('attributes', None) - if attributes1 == None: + if attributes1 is None: attributes1 = [] infobox1.set('attributes', attributes1) @@ -237,14 +257,14 @@ def merge_two_infoboxes(infobox1, infobox2): for attribute in infobox1.get('attributes', []): if attribute.get('label', None) not in attributeSet: attributeSet.add(attribute.get('label', None)) - + for attribute in infobox2.get('attributes', []): attributes1.append(attribute) if 'content' in infobox2: content1 = infobox1.get('content', None) content2 = infobox2.get('content', '') - if content1 != None: + if content1 is not None: if content_result_len(content2) > content_result_len(content1): infobox1['content'] = content2 else: @@ -257,12 +277,12 @@ def merge_infoboxes(infoboxes): for infobox in infoboxes: add_infobox = True infobox_id = infobox.get('id', None) - if infobox_id != None: + if infobox_id is not None: existingIndex = infoboxes_id.get(infobox_id, None) - if existingIndex != None: + if existingIndex is not None: merge_two_infoboxes(results[existingIndex], infobox) - add_infobox=False - + add_infobox = False + if add_infobox: results.append(infobox) infoboxes_id[infobox_id] = len(results)-1 @@ -318,7 +338,8 @@ class Search(object): self.pageno = int(pageno_param) - # parse query, if tags are set, which change the serch engine or search-language + # parse query, if tags are set, which change + # the serch engine or search-language query_obj = Query(self.request_data['q'], self.blocked_engines) query_obj.parse_query() @@ -334,25 +355,29 @@ class Search(object): self.categories = [] - # if engines are calculated from query, set categories by using that informations + # if engines are calculated from query, + # set categories by using that informations if self.engines: self.categories = list(set(engine['category'] for engine in self.engines)) - # otherwise, using defined categories to calculate which engines should be used + # otherwise, using defined categories to + # calculate which engines should be used else: # set used categories for pd_name, pd in self.request_data.items(): if pd_name.startswith('category_'): category = pd_name[9:] # if category is not found in list, skip - if not category in categories: + if category not in categories: continue # add category to list self.categories.append(category) - # if no category is specified for this search, using user-defined default-configuration which (is stored in cookie) + # if no category is specified for this search, + # using user-defined default-configuration which + # (is stored in cookie) if not self.categories: cookie_categories = request.cookies.get('categories', '') cookie_categories = cookie_categories.split(',') @@ -360,16 +385,18 @@ class Search(object): if ccateg in categories: self.categories.append(ccateg) - # if still no category is specified, using general as default-category + # if still no category is specified, using general + # as default-category if not self.categories: self.categories = ['general'] - # using all engines for that search, which are declared under the specific categories + # using all engines for that search, which are + # declared under the specific categories for categ in self.categories: self.engines.extend({'category': categ, 'name': x.name} for x in categories[categ] - if not x.name in self.blocked_engines) + if x.name not in self.blocked_engines) # do search-request def search(self, request): @@ -386,7 +413,7 @@ class Search(object): number_of_searches += 1 # set default useragent - #user_agent = request.headers.get('User-Agent', '') + # user_agent = request.headers.get('User-Agent', '') user_agent = gen_useragent() # start search-reqest for all selected engines @@ -400,7 +427,8 @@ class Search(object): if self.pageno > 1 and not engine.paging: continue - # if search-language is set and engine does not provide language-support, skip + # if search-language is set and engine does not + # provide language-support, skip if self.lang != 'all' and not engine.language_support: continue @@ -412,7 +440,8 @@ class Search(object): request_params['pageno'] = self.pageno request_params['language'] = self.lang - # update request parameters dependent on search-engine (contained in engines folder) + # update request parameters dependent on + # search-engine (contained in engines folder) request_params = engine.request(self.query.encode('utf-8'), request_params) @@ -431,7 +460,8 @@ class Search(object): request_params ) - # create dictionary which contain all informations about the request + # create dictionary which contain all + # informations about the request request_args = dict( headers=request_params['headers'], hooks=dict(response=callback), diff --git a/searx/utils.py b/searx/utils.py index 28e42b272..7764291fc 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -1,4 +1,4 @@ -#import htmlentitydefs +# import htmlentitydefs from codecs import getincrementalencoder from HTMLParser import HTMLParser from random import choice @@ -22,7 +22,8 @@ def gen_useragent(): def searx_useragent(): return 'searx' - + + def highlight_content(content, query): if not content: @@ -67,8 +68,8 @@ class HTMLTextExtractor(HTMLParser): self.result.append(unichr(codepoint)) def handle_entityref(self, name): - #codepoint = htmlentitydefs.name2codepoint[name] - #self.result.append(unichr(codepoint)) + # codepoint = htmlentitydefs.name2codepoint[name] + # self.result.append(unichr(codepoint)) self.result.append(name) def get_text(self): diff --git a/searx/webapp.py b/searx/webapp.py index 830cf440a..0555d6ca9 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -71,7 +71,7 @@ app.secret_key = settings['server']['secret_key'] babel = Babel(app) -#TODO configurable via settings.yml +# TODO configurable via settings.yml favicons = ['wikipedia', 'youtube', 'vimeo', 'soundcloud', 'twitter', 'stackoverflow', 'github'] @@ -146,14 +146,14 @@ def render(template_name, override_theme=None, **kwargs): nonblocked_categories = set(chain.from_iterable(nonblocked_categories)) - if not 'categories' in kwargs: + if 'categories' not in kwargs: kwargs['categories'] = ['general'] kwargs['categories'].extend(x for x in sorted(categories.keys()) if x != 'general' and x in nonblocked_categories) - if not 'selected_categories' in kwargs: + if 'selected_categories' not in kwargs: kwargs['selected_categories'] = [] for arg in request.args: if arg.startswith('category_'): @@ -168,7 +168,7 @@ def render(template_name, override_theme=None, **kwargs): if not kwargs['selected_categories']: kwargs['selected_categories'] = ['general'] - if not 'autocomplete' in kwargs: + if 'autocomplete' not in kwargs: kwargs['autocomplete'] = autocomplete kwargs['method'] = request.cookies.get('method', 'POST') @@ -202,14 +202,15 @@ def index(): 'index.html', ) - search.results, search.suggestions, search.answers, search.infoboxes = search.search(request) + search.results, search.suggestions,\ + search.answers, search.infoboxes = search.search(request) for result in search.results: if not search.paging and engines[result['engine']].paging: search.paging = True - # check if HTTPS rewrite is required + # check if HTTPS rewrite is required if settings['server']['https_rewrite']\ and result['parsed_url'].scheme == 'http': @@ -236,7 +237,7 @@ def index(): try: # TODO, precompile rule p = re.compile(rule[0]) - + # rewrite url if possible new_result_url = p.sub(rule[1], result['url']) except: @@ -250,17 +251,21 @@ def index(): continue # get domainname from result - # TODO, does only work correct with TLD's like asdf.com, not for asdf.com.de + # TODO, does only work correct with TLD's like + # asdf.com, not for asdf.com.de # TODO, using publicsuffix instead of this rewrite rule - old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:]) - new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:]) + old_result_domainname = '.'.join( + result['parsed_url'].hostname.split('.')[-2:]) + new_result_domainname = '.'.join( + new_parsed_url.hostname.split('.')[-2:]) - # check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules + # check if rewritten hostname is the same, + # to protect against wrong or malicious rewrite rules if old_result_domainname == new_result_domainname: # set new url result['url'] = new_result_url - # target has matched, do not search over the other rules + # target has matched, do not search over the other rules break if search.request_data.get('format', 'html') == 'html': @@ -429,7 +434,7 @@ def preferences(): for pd_name, pd in request.form.items(): if pd_name.startswith('category_'): category = pd_name[9:] - if not category in categories: + if category not in categories: continue selected_categories.append(category) elif pd_name == 'locale' and pd in settings['locales']: |