summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de> 2023-06-18 16:43:48 +0200
committer: Markus Heiser <markus.heiser@darmarIT.de> 2023-06-19 19:49:44 +0200
commit: fa1ef9a07b79ab740c127bac0d11b8315a5130ff (patch)
tree: 80306333c2e5a13a0b3a286e7dad7b5df633689e /searx
parent: 71b6ff07ca137a39576c3084abec348ded40564e (diff)
download: searxng-fa1ef9a07b79ab740c127bac0d11b8315a5130ff.tar.gz
searxng-fa1ef9a07b79ab740c127bac0d11b8315a5130ff.zip
[mod] move some code from webapp module to webutils module (no functional change)
Over the years the webapp module became more and more of a mess. To improve the modularization a little, this patch moves some implementations from the webapp module to the webutils module. HINT: this patch brings no functional change. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- searx/results.py 17
-rwxr-xr-x searx/webapp.py 151
-rw-r--r-- searx/webutils.py 117
3 files changed, 159 insertions, 126 deletions
diff --git a/searx/results.py b/searx/results.py
index 5dd1bff21..caf02213d 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote
from searx import logger
+from searx import utils
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
@@ -353,6 +354,10 @@ class ResultContainer:
for result in self._merged_results:
score = result_score(result)
result['score'] = score
+ if result.get('content'):
+ result['content'] = utils.html_to_text(result['content']).strip()
+ # removing html content and whitespace duplications
+ result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
for result_engine in result['engines']:
counter_add(score, 'engine', result_engine, 'score')
@@ -415,11 +420,19 @@ class ResultContainer:
def results_length(self):
return len(self._merged_results)
- def results_number(self):
+ @property
+ def number_of_results(self) -> int:
+ """Returns the average of results number, returns zero if the average
+ result number is smaller than the actual result count."""
+
resultnum_sum = sum(self._number_of_results)
if not resultnum_sum or not self._number_of_results:
return 0
- return resultnum_sum / len(self._number_of_results)
+
+ average = int(resultnum_sum / len(self._number_of_results))
+ if average < self.results_length():
+ average = 0
+ return average
def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
if engines[engine_name].display_error_messages:
diff --git a/searx/webapp.py b/searx/webapp.py
index d6322447a..59c1dd1a1 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -58,7 +58,7 @@ from searx import (
from searx import infopage
from searx.data import ENGINE_DESCRIPTIONS
-from searx.results import Timing, UnresponsiveEngine
+from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path
from searx.exceptions import SearxParameterException
@@ -68,18 +68,18 @@ from searx.engines import (
engines,
engine_shortcuts,
)
+
+from searx import webutils
from searx.webutils import (
- UnicodeWriter,
highlight_content,
get_static_files,
get_result_templates,
get_themes,
- prettify_url,
+ exception_classname_to_text,
new_hmac,
is_hmac_of,
is_flask_run_cmdline,
group_engines_in_tab,
- searxng_l10n_timespan,
)
from searx.webadapter import (
get_search_query_from_webapp,
@@ -87,7 +87,6 @@ from searx.webadapter import (
parse_lang,
)
from searx.utils import (
- html_to_text,
gen_useragent,
dict_subset,
)
@@ -165,39 +164,6 @@ app.jinja_env.add_extension('jinja2.ext.loopcontrols') # pylint: disable=no-mem
app.jinja_env.filters['group_engines_in_tab'] = group_engines_in_tab # pylint: disable=no-member
app.secret_key = settings['server']['secret_key']
-timeout_text = gettext('timeout')
-parsing_error_text = gettext('parsing error')
-http_protocol_error_text = gettext('HTTP protocol error')
-network_error_text = gettext('network error')
-ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
-exception_classname_to_text = {
- None: gettext('unexpected crash'),
- 'timeout': timeout_text,
- 'asyncio.TimeoutError': timeout_text,
- 'httpx.TimeoutException': timeout_text,
- 'httpx.ConnectTimeout': timeout_text,
- 'httpx.ReadTimeout': timeout_text,
- 'httpx.WriteTimeout': timeout_text,
- 'httpx.HTTPStatusError': gettext('HTTP error'),
- 'httpx.ConnectError': gettext("HTTP connection error"),
- 'httpx.RemoteProtocolError': http_protocol_error_text,
- 'httpx.LocalProtocolError': http_protocol_error_text,
- 'httpx.ProtocolError': http_protocol_error_text,
- 'httpx.ReadError': network_error_text,
- 'httpx.WriteError': network_error_text,
- 'httpx.ProxyError': gettext("proxy error"),
- 'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
- 'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
- 'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
- 'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
- 'searx.exceptions.SearxEngineXPathException': parsing_error_text,
- 'KeyError': parsing_error_text,
- 'json.decoder.JSONDecodeError': parsing_error_text,
- 'lxml.etree.ParserError': parsing_error_text,
- 'ssl.SSLCertVerificationError': ssl_cert_error_text, # for Python > 3.7
- 'ssl.CertificateError': ssl_cert_error_text, # for Python 3.7
-}
-
class ExtendedRequest(flask.Request):
"""This class is never initialized and only used for type checking."""
@@ -686,9 +652,7 @@ def search():
search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
request.preferences, request.form
)
- # search = Search(search_query) # without plugins
search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name
-
result_container = search.search()
except SearxParameterException as e:
@@ -698,45 +662,54 @@ def search():
logger.exception(e, exc_info=True)
return index_error(output_format, gettext('search error')), 500
- # results
- results = result_container.get_ordered_results()
- number_of_results = result_container.results_number()
- if number_of_results < result_container.results_length():
- number_of_results = 0
-
- # checkin for a external bang
+ # 1. check if the result is a redirect for an external bang
if result_container.redirect_url:
return redirect(result_container.redirect_url)
- # Server-Timing header
+ # 2. add Server-Timing header for measuring performance characteristics of
+ # web applications
request.timings = result_container.get_timings() # pylint: disable=assigning-non-slot
+ # 3. formats without a template
+
+ if output_format == 'json':
+
+ response = webutils.get_json_response(search_query, result_container)
+ return Response(response, mimetype='application/json')
+
+ if output_format == 'csv':
+
+ csv = webutils.CSVWriter(StringIO())
+ webutils.write_csv_response(csv, result_container)
+ csv.stream.seek(0)
+
+ response = Response(csv.stream.read(), mimetype='application/csv')
+ cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
+ response.headers.add('Content-Disposition', cont_disp)
+ return response
+
+ # 4. formats rendered by a template / RSS & HTML
+
current_template = None
previous_result = None
- # output
+ results = result_container.get_ordered_results()
for result in results:
if output_format == 'html':
if 'content' in result and result['content']:
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
if 'title' in result and result['title']:
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
- else:
- if result.get('content'):
- result['content'] = html_to_text(result['content']).strip()
- # removing html content and whitespace duplications
- result['title'] = ' '.join(html_to_text(result['title']).strip().split())
if 'url' in result:
- result['pretty_url'] = prettify_url(result['url'])
-
+ result['pretty_url'] = webutils.prettify_url(result['url'])
if result.get('publishedDate'): # do not try to get a date from an empty string or a None type
try: # test if publishedDate >= 1900 (datetime module bug)
result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
except ValueError:
result['publishedDate'] = None
else:
- result['publishedDate'] = searxng_l10n_timespan(result['publishedDate'])
+ result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])
# set result['open_group'] = True when the template changes from the previous result
# set result['close_group'] = True when the template changes on the next result
@@ -750,42 +723,7 @@ def search():
if previous_result:
previous_result['close_group'] = True
- if output_format == 'json':
- x = {
- 'query': search_query.query,
- 'number_of_results': number_of_results,
- 'results': results,
- 'answers': list(result_container.answers),
- 'corrections': list(result_container.corrections),
- 'infoboxes': result_container.infoboxes,
- 'suggestions': list(result_container.suggestions),
- 'unresponsive_engines': __get_translated_errors(result_container.unresponsive_engines),
- }
- response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
- return Response(response, mimetype='application/json')
-
- if output_format == 'csv':
- csv = UnicodeWriter(StringIO())
- keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
- csv.writerow(keys)
- for row in results:
- row['host'] = row['parsed_url'].netloc
- row['type'] = 'result'
- csv.writerow([row.get(key, '') for key in keys])
- for a in result_container.answers:
- row = {'title': a, 'type': 'answer'}
- csv.writerow([row.get(key, '') for key in keys])
- for a in result_container.suggestions:
- row = {'title': a, 'type': 'suggestion'}
- csv.writerow([row.get(key, '') for key in keys])
- for a in result_container.corrections:
- row = {'title': a, 'type': 'correction'}
- csv.writerow([row.get(key, '') for key in keys])
- csv.stream.seek(0)
- response = Response(csv.stream.read(), mimetype='application/csv')
- cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
- response.headers.add('Content-Disposition', cont_disp)
- return response
+ # 4.a RSS
if output_format == 'rss':
response_rss = render(
@@ -795,11 +733,11 @@ def search():
corrections=result_container.corrections,
suggestions=result_container.suggestions,
q=request.form['q'],
- number_of_results=number_of_results,
+ number_of_results=result_container.number_of_results,
)
return Response(response_rss, mimetype='text/xml')
- # HTML output format
+ # 4.b HTML
# suggestions: use RawTextQuery to get the suggestion URLs with the same bang
suggestion_urls = list(
@@ -827,14 +765,14 @@ def search():
selected_categories = search_query.categories,
pageno = search_query.pageno,
time_range = search_query.time_range or '',
- number_of_results = format_decimal(number_of_results),
+ number_of_results = format_decimal(result_container.number_of_results),
suggestions = suggestion_urls,
answers = result_container.answers,
corrections = correction_urls,
infoboxes = result_container.infoboxes,
engine_data = result_container.engine_data,
paging = result_container.paging,
- unresponsive_engines = __get_translated_errors(
+ unresponsive_engines = webutils.get_translated_errors(
result_container.unresponsive_engines
),
current_locale = request.preferences.get_value("locale"),
@@ -849,25 +787,6 @@ def search():
)
-def __get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
- translated_errors = []
-
- # make a copy unresponsive_engines to avoid "RuntimeError: Set changed size
- # during iteration" it happens when an engine modifies the ResultContainer
- # after the search_multiple_requests method has stopped waiting
-
- for unresponsive_engine in unresponsive_engines:
- error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
- if not error_user_text:
- error_user_text = exception_classname_to_text[None]
- error_msg = gettext(error_user_text)
- if unresponsive_engine.suspended:
- error_msg = gettext('Suspended') + ': ' + error_msg
- translated_errors.append((unresponsive_engine.engine, error_msg))
-
- return sorted(translated_errors, key=lambda e: e[0])
-
-
@app.route('/about', methods=['GET'])
def about():
"""Redirect to about page"""
diff --git a/searx/webutils.py b/searx/webutils.py
index 470833291..ddd9891bf 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -9,31 +9,80 @@ import hmac
import re
import inspect
import itertools
+import json
from datetime import datetime, timedelta
from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
from io import StringIO
from codecs import getincrementalencoder
-from flask_babel import gettext, format_date
+from flask_babel import gettext, format_date # type: ignore
from searx import logger, settings
from searx.engines import DEFAULT_CATEGORY
if TYPE_CHECKING:
from searx.enginelib import Engine
-
+ from searx.results import ResultContainer
+ from searx.search import SearchQuery
+ from searx.results import UnresponsiveEngine
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')
logger = logger.getChild('webutils')
-
-class UnicodeWriter:
- """
- A CSV writer which will write rows to CSV file "f",
- which is encoded in the given encoding.
- """
+timeout_text = gettext('timeout')
+parsing_error_text = gettext('parsing error')
+http_protocol_error_text = gettext('HTTP protocol error')
+network_error_text = gettext('network error')
+ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
+exception_classname_to_text = {
+ None: gettext('unexpected crash'),
+ 'timeout': timeout_text,
+ 'asyncio.TimeoutError': timeout_text,
+ 'httpx.TimeoutException': timeout_text,
+ 'httpx.ConnectTimeout': timeout_text,
+ 'httpx.ReadTimeout': timeout_text,
+ 'httpx.WriteTimeout': timeout_text,
+ 'httpx.HTTPStatusError': gettext('HTTP error'),
+ 'httpx.ConnectError': gettext("HTTP connection error"),
+ 'httpx.RemoteProtocolError': http_protocol_error_text,
+ 'httpx.LocalProtocolError': http_protocol_error_text,
+ 'httpx.ProtocolError': http_protocol_error_text,
+ 'httpx.ReadError': network_error_text,
+ 'httpx.WriteError': network_error_text,
+ 'httpx.ProxyError': gettext("proxy error"),
+ 'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
+ 'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
+ 'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
+ 'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
+ 'searx.exceptions.SearxEngineXPathException': parsing_error_text,
+ 'KeyError': parsing_error_text,
+ 'json.decoder.JSONDecodeError': parsing_error_text,
+ 'lxml.etree.ParserError': parsing_error_text,
+ 'ssl.SSLCertVerificationError': ssl_cert_error_text, # for Python > 3.7
+ 'ssl.CertificateError': ssl_cert_error_text, # for Python 3.7
+}
+
+
+def get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
+ translated_errors = []
+
+ for unresponsive_engine in unresponsive_engines:
+ error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
+ if not error_user_text:
+ error_user_text = exception_classname_to_text[None]
+ error_msg = gettext(error_user_text)
+ if unresponsive_engine.suspended:
+ error_msg = gettext('Suspended') + ': ' + error_msg
+ translated_errors.append((unresponsive_engine.engine, error_msg))
+
+ return sorted(translated_errors, key=lambda e: e[0])
+
+
+class CSVWriter:
+ """A CSV writer which will write rows to CSV file "f", which is encoded in
+ the given encoding."""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
@@ -59,6 +108,58 @@ class UnicodeWriter:
self.writerow(row)
+def write_csv_response(csv: CSVWriter, rc: ResultContainer) -> None:
+ """Write rows of the results to a query (``application/csv``) into a CSV
+ table (:py:obj:`CSVWriter`). First line in the table contain the column
+ names. The column "type" specifies the type, the following types are
+ included in the table:
+
+ - result
+ - answer
+ - suggestion
+ - correction
+
+ """
+
+ results = rc.get_ordered_results()
+ keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
+ csv.writerow(keys)
+
+ for row in results:
+ row['host'] = row['parsed_url'].netloc
+ row['type'] = 'result'
+ csv.writerow([row.get(key, '') for key in keys])
+
+ for a in rc.answers:
+ row = {'title': a, 'type': 'answer'}
+ csv.writerow([row.get(key, '') for key in keys])
+
+ for a in rc.suggestions:
+ row = {'title': a, 'type': 'suggestion'}
+ csv.writerow([row.get(key, '') for key in keys])
+
+ for a in rc.corrections:
+ row = {'title': a, 'type': 'correction'}
+ csv.writerow([row.get(key, '') for key in keys])
+
+
+def get_json_response(sq: SearchQuery, rc: ResultContainer) -> str:
+ """Returns the JSON string of the results to a query (``application/json``)"""
+ results = rc.number_of_results
+ x = {
+ 'query': sq.query,
+ 'number_of_results': results,
+ 'results': rc.get_ordered_results(),
+ 'answers': list(rc.answers),
+ 'corrections': list(rc.corrections),
+ 'infoboxes': rc.infoboxes,
+ 'suggestions': list(rc.suggestions),
+ 'unresponsive_engines': get_translated_errors(rc.unresponsive_engines),
+ }
+ response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
+ return response
+
+
def get_themes(templates_path):
"""Returns available themes list."""
return os.listdir(templates_path)