47 files changed, 476 insertions(+), 415 deletions(-)
diff --git a/.config.sh b/.config.sh
index 4eff5f4c6..f9bac7383 100644
--- a/.config.sh
+++ b/.config.sh
@@ -26,6 +26,7 @@ fi
 # ---------
 
 # SEARX_INTERNAL_URL="127.0.0.1:8888"
+# SEARX_SETTINGS_TEMPLATE="${REPO_ROOT}/utils/templates/etc/searx/use_default_settings.yml"
 
 # Only change, if you maintain a searx brand in your searx fork.
 # GIT_BRANCH="${GIT_BRANCH:-master}"

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6850ab405..300349f3b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,8 +2,8 @@
 
 ## Resources in the documentation
 
-* [Development quickstart](http://searx.github.io/searx/dev/contribution_guide.html)
-* [Contribution guide](http://searx.github.io/searx/dev/contribution_guide.html)
+* [Development quickstart](https://searx.github.io/searx/dev/contribution_guide.html)
+* [Contribution guide](https://searx.github.io/searx/dev/contribution_guide.html)
 
 ## Submitting PRs

diff --git a/dockerfiles/docker-entrypoint.sh b/dockerfiles/docker-entrypoint.sh
index a8f18f05b..accc015f7 100755
--- a/dockerfiles/docker-entrypoint.sh
+++ b/dockerfiles/docker-entrypoint.sh
@@ -66,7 +66,7 @@ patch_searx_settings() {
     CONF="$1"
 
     # Make sure that there is trailing slash at the end of BASE_URL
-    # see http://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Shell-Parameter-Expansion
+    # see https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Shell-Parameter-Expansion
     export BASE_URL="${BASE_URL%/}/"
 
     # update settings.yml

diff --git a/docs/admin/buildhosts.rst b/docs/admin/buildhosts.rst
index a727d25b9..1f6eb472e 100644
--- a/docs/admin/buildhosts.rst
+++ b/docs/admin/buildhosts.rst
@@ -67,7 +67,7 @@ to ``imgmath``:
 If your docs build (``make docs``) shows warnings like this::
 
    WARNING: dot(1) not found, for better output quality install \
-   graphviz from http://www.graphviz.org
+   graphviz from https://www.graphviz.org
    ..
    WARNING: LaTeX command 'latex' cannot be run (needed for math \
    display), check the imgmath_latex setting

diff --git a/docs/admin/installation-nginx.rst b/docs/admin/installation-nginx.rst
index 65fd73573..589c40ada 100644
--- a/docs/admin/installation-nginx.rst
+++ b/docs/admin/installation-nginx.rst
@@ -9,7 +9,7 @@ Install with nginx
 .. _nginx server configuration:
    https://docs.nginx.com/nginx/admin-guide/web-server/web-server/#setting-up-virtual-servers
 .. _nginx beginners guide:
-   http://nginx.org/en/docs/beginners_guide.html
+   https://nginx.org/en/docs/beginners_guide.html
 .. _Getting Started wiki:
    https://www.nginx.com/resources/wiki/start/
 .. _uWSGI support from nginx:

diff --git a/docs/admin/installation-searx.rst b/docs/admin/installation-searx.rst
index a368bfe8c..3f8904a1d 100644
--- a/docs/admin/installation-searx.rst
+++ b/docs/admin/installation-searx.rst
@@ -64,17 +64,38 @@ from the login (*~/.profile*):
 Open a second terminal for the configuration tasks and left the ``(searx)$``
 terminal open for the tasks below.
 
+
+.. _use_default_settings.yml:
+
 Configuration
-==============
+=============
+
+To create an initial ``/etc/searx/settings.yml`` you can start with a copy of the
+file :origin:`utils/templates/etc/searx/use_default_settings.yml`.  This setup
+uses the :ref:`default settings <settings use_default_settings>` from
+:origin:`searx/settings.yml` and is recommended since :pull:`2291` has been merged.
 
-Create a copy of the :origin:`searx/settings.yml` configuration file in system's
-*/etc* folder.  Configure like shown below -- replace ``searx@\$(uname -n)`` with
-a name of your choice -- *and/or* edit ``/etc/searx/settings.yml`` if necessary.
+For a minimal setup, configure like shown below -- replace ``searx@\$(uname -n)``
+with a name of your choice, set ``ultrasecretkey`` -- *and/or* edit
+``/etc/searx/settings.yml`` to your needs.
 
 .. kernel-include:: $DOCS_BUILD/includes/searx.rst
    :start-after: START searx config
   :end-before: END searx config
 
+.. tabs::
+
+   .. group-tab:: Use default settings
+
+      .. literalinclude:: ../../utils/templates/etc/searx/use_default_settings.yml
+         :language: yaml
+
+   .. group-tab:: searx/settings.yml
+
+      .. literalinclude:: ../../searx/settings.yml
+         :language: yaml
+
+
 Check
 =====
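Note on the installation change above: the recommended starting point is no longer a full copy of searx/settings.yml but the small use_default_settings.yml template, whose values are layered over the defaults shipped with searx. A minimal sketch of that layering idea, assuming a plain recursive merge — this simplifies searx's real rules (engine lists, for instance, need dedicated handling):

import yaml

def overlay(default: dict, user: dict) -> dict:
    # recursively overlay user values on the shipped defaults
    merged = dict(default)
    for key, value in user.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = overlay(merged[key], value)
        else:
            merged[key] = value
    return merged

with open('searx/settings.yml') as f:
    default_settings = yaml.safe_load(f)
with open('/etc/searx/settings.yml') as f:
    user_settings = yaml.safe_load(f)

if user_settings.get('use_default_settings'):
    settings = overlay(default_settings, user_settings)
else:
    settings = user_settings  # the user file must then be complete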
diff --git a/docs/admin/settings.rst b/docs/admin/settings.rst
index 532b99752..985c16f85 100644
--- a/docs/admin/settings.rst
+++ b/docs/admin/settings.rst
@@ -9,6 +9,7 @@ file.
 
 .. sidebar:: Further reading ..
 
+   - :ref:`use_default_settings.yml`
    - :ref:`search API`
 
 .. contents:: Contents
@@ -16,92 +17,61 @@ file.
    :local:
    :backlinks: entry
 
-.. _settings global:
+.. _settings location:
 
-Global Settings
-===============
+settings.yml location
+=====================
 
-.. code:: yaml
+First, searx will try to load settings.yml from these locations:
 
-   server:
-       port : 8888
-       secret_key : "ultrasecretkey" # change this!
-       debug : False # debug mode, only for development
-       request_timeout : 2.0 # seconds
-       base_url : False # set custom base_url (or False)
-       themes_path : "" # custom ui themes path
-       default_theme : oscar # ui theme
-       useragent_suffix : "" # suffix of searx_useragent, could contain
-                             # informations like admins email address
-       image_proxy : False # proxying image results through searx
-       default_locale : "" # default interface locale
+1. the full path specified in the ``SEARX_SETTINGS_PATH`` environment variable.
+2. ``/etc/searx/settings.yml``
 
-   outgoing: # communication with search engines
-       request_timeout : 2.0 # default timeout in seconds, can be override by engine
-       # max_request_timeout: 10.0 # the maximum timeout in seconds
-       useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator
-       pool_connections : 100 # Number of different hosts
-       pool_maxsize : 10 # Number of simultaneous requests by host
-
-       #proxies:
-       #    http:
-       #        - http://proxy1:8080
-       #        - http://proxy2:8080
-       #    https:
-       #        - http://proxy1:8080
-       #        - http://proxy2:8080
-       #        - socks5://user:password@proxy3:1080
-       #        - socks5h://user:password@proxy4:1080
-
-       #source_ips:
-       #    - 1.1.1.1
-       #    - 1.1.1.2
+If these files don't exist (or are empty or can't be read), searx uses the
+:origin:`searx/settings.yml` file.
 
-   locales:
-       en : English
-       de : Deutsch
-       he : Hebrew
-       hu : Magyar
-       fr : Français
-       es : Español
-       it : Italiano
-       nl : Nederlands
-       ja : 日本語 (Japanese)
-       tr : Türkçe
-       ru : Russian
-       ro : Romanian
+.. _settings global:
 
-``port`` :
-  Port number of the searx web application if you run it directly using ``python
-  searx/webapp.py``.  Doesn't apply to searx running on Apache or Nginx.
+Global Settings
+===============
 
-``secret_key`` :
-  Used for cryptography purpose.
+.. code:: yaml
+
+   general:
+       debug : False # Debug mode, only for development
+       instance_name : "searx" # displayed name
 
 ``debug`` :
   Allow a more detailed log if you run searx directly.  Display *detailed* error
   messages in the browser too, so this must be deactivated in production.
 
-``request_timeout`` :
-  Global timeout of the requests made to others engines in seconds.  A bigger
-  timeout will allow to wait for answers from slow engines, but in consequence
-  will slow searx reactivity (the result page may take the time specified in the
-  timeout to load)
-
-``base_url`` :
-  The base URL where searx is deployed.  Used to create correct inbound links.
+.. code:: yaml
 
-``themes_path`` :
-  Path to where the themes are located.  If you didn't develop anything, leave it
-  blank.
+   server:
+       port : 8888
+       bind_address : "127.0.0.1" # address to listen on
+       secret_key : "ultrasecretkey" # change this!
+       base_url : False # set custom base_url (or False)
+       image_proxy : False # proxying image results through searx
+       default_locale : "" # default interface locale
+       default_theme : oscar # ui theme
+       default_http_headers:
+           X-Content-Type-Options : nosniff
+           X-XSS-Protection : 1; mode=block
+           X-Download-Options : noopen
+           X-Robots-Tag : noindex, nofollow
+           Referrer-Policy : no-referrer
 
-``default_theme`` :
-  Name of the theme you want to use by default on your searx instance.
+``port`` & ``bind_address``:
+  Port number and *bind address* of the searx web application if you run it
+  directly using ``python searx/webapp.py``.  Doesn't apply to searx running on
+  Apache or Nginx.
 
-``useragent_suffix`` :
-  Suffix to the user-agent searx uses to send requests to others engines.  If an
-  engine wish to block you, a contact info here may be useful to avoid that.
+``secret_key`` :
+  Used for cryptography purpose.
+
+``base_url`` :
+  The base URL where searx is deployed.  Used to create correct inbound links.
 
 ``image_proxy`` :
   Allow your instance of searx of being able to proxy images.  Uses memory space.
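The relocated "settings.yml location" section above reduces to a three-step lookup. A sketch of that resolution order, assuming plain os checks (the helper name is illustrative, not searx's API):

import os

def locate_settings_yml(repo_default='searx/settings.yml'):
    # 1. full path from the SEARX_SETTINGS_PATH environment variable
    env_path = os.environ.get('SEARX_SETTINGS_PATH')
    if env_path and os.path.isfile(env_path) and os.path.getsize(env_path) > 0:
        return env_path
    # 2. the system-wide configuration file
    etc_path = '/etc/searx/settings.yml'
    if os.path.isfile(etc_path) and os.path.getsize(etc_path) > 0:
        return etc_path
    # 3. fall back to the settings.yml shipped with searx
    return repo_default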
@@ -112,7 +82,49 @@ Global Settings
   specific instance of searx, a locale can be defined using an ISO language
   code, like ``fr``, ``en``, ``de``.
 
-.. _requests proxies: http://requests.readthedocs.io/en/latest/user/advanced/#proxies
+``default_theme`` :
+  Name of the theme you want to use by default on your searx instance.
+
+.. _HTTP headers: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers
+
+``default_http_headers``:
+  Set additional HTTP headers, see `#755 <https://github.com/searx/searx/issues/715>`__
+
+
+.. code:: yaml
+
+   outgoing: # communication with search engines
+       request_timeout : 2.0 # default timeout in seconds, can be overridden by engine
+       # max_request_timeout: 10.0 # the maximum timeout in seconds
+       useragent_suffix : "" # information like an email address to the administrator
+       pool_connections : 100 # Number of different hosts
+       pool_maxsize : 10 # Number of simultaneous requests by host
+       # uncomment below section if you want to use a proxy
+       # proxies:
+       #     http:
+       #         - http://proxy1:8080
+       #         - http://proxy2:8080
+       #     https:
+       #         - http://proxy1:8080
+       #         - http://proxy2:8080
+       # uncomment below section only if you have more than one network interface
+       # which can be the source of outgoing search requests
+       # source_ips:
+       #     - 1.1.1.1
+       #     - 1.1.1.2
+
+
+``request_timeout`` :
+  Global timeout of the requests made to other engines, in seconds.  A bigger
+  timeout allows waiting for answers from slow engines, but in consequence
+  slows searx's reactivity (the result page may take the time specified in the
+  timeout to load).  Can be overridden per engine, see :ref:`settings engine`.
+
+``useragent_suffix`` :
+  Suffix to the user-agent searx uses to send requests to other engines.  If an
+  engine wishes to block you, contact info here may be useful to avoid that.
+
+.. _requests proxies: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
 .. _PySocks: https://pypi.org/project/PySocks/
 
 ``proxies`` :
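Per the ``request_timeout`` / ``max_request_timeout`` notes above, the global timeout is only a default: an engine entry may raise it, and the optional maximum caps it. A small sketch of how the effective value could be computed (illustrative names, not searx's internals):

def effective_timeout(engine_timeout=None, request_timeout=2.0, max_request_timeout=None):
    # per-engine timeout wins over the global default ...
    timeout = engine_timeout if engine_timeout is not None else request_timeout
    # ... but never beyond the configured maximum
    if max_request_timeout is not None:
        timeout = min(timeout, max_request_timeout)
    return timeout

assert effective_timeout() == 2.0                    # global default
assert effective_timeout(engine_timeout=4.0) == 4.0  # engine override
assert effective_timeout(engine_timeout=15.0, max_request_timeout=10.0) == 10.0  # capped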
@@ -120,10 +132,29 @@ Global Settings
   If there are more than one proxy for one protocol (http, https), requests to
   the engines are distributed in a round-robin fashion.
 
+  - Proxy: `see <https://2.python-requests.org/en/latest/user/advanced/#proxies>`__.
+  - SOCKS proxies are also supported: `see <https://2.python-requests.org/en/latest/user/advanced/#socks>`__
+
 ``source_ips`` :
   If you use multiple network interfaces, define from which IP the requests must
   be made.  This parameter is ignored when ``proxies`` is set.
 
+.. code:: yaml
+
+   locales:
+       en : English
+       de : Deutsch
+       he : Hebrew
+       hu : Magyar
+       fr : Français
+       es : Español
+       it : Italiano
+       nl : Nederlands
+       ja : 日本語 (Japanese)
+       tr : Türkçe
+       ru : Russian
+       ro : Romanian
+
 ``locales`` :
   Locales codes and their names.  Available translations of searx interface.
 
@@ -208,19 +239,7 @@ Engine settings
   engines, and so won't be described here.
 
-.. _settings location:
-
-settings.yml location
-=====================
-
-First, searx will try to load settings.yml from these locations:
-
-1. the full path specified in the ``SEARX_SETTINGS_PATH`` environment variable.
-2. ``/etc/searx/settings.yml``
-
-If these files don't exist (or are empty or can't be read), searx uses the
-:origin:`searx/settings.yml` file.
-
-.. _ settings use_default_settings:
+.. _settings use_default_settings:
 
 use_default_settings
 ====================

diff --git a/docs/build-templates/searx.rst b/docs/build-templates/searx.rst
index 080de293f..fe82ec3d0 100644
--- a/docs/build-templates/searx.rst
+++ b/docs/build-templates/searx.rst
@@ -128,12 +128,28 @@ ${fedora_build}
 .. tabs::
 
-   .. group-tab:: bash
+   .. group-tab:: Use default settings
+
+      .. code-block:: sh
+
+         $ sudo -H mkdir -p \"$(dirname ${SEARX_SETTINGS_PATH})\"
+         $ sudo -H cp \"$SEARX_SRC/utils/templates/etc/searx/use_default_settings.yml\" \\
+                      \"${SEARX_SETTINGS_PATH}\"
+
+   .. group-tab:: searx/settings.yml
 
      .. code-block:: sh
 
         $ sudo -H mkdir -p \"$(dirname ${SEARX_SETTINGS_PATH})\"
-        $ sudo -H cp \"$SEARX_SRC/searx/settings.yml\" \"${SEARX_SETTINGS_PATH}\"
+        $ sudo -H cp \"$SEARX_SRC/searx/settings.yml\" \\
+                     \"${SEARX_SETTINGS_PATH}\"
+
+.. tabs::
+
+   .. group-tab:: minimal setup
+
+      .. code-block:: sh
 
         $ sudo -H sed -i -e \"s/ultrasecretkey/\$(openssl rand -hex 16)/g\" \"$SEARX_SETTINGS_PATH\"
         $ sudo -H sed -i -e \"s/{instance_name}/searx@\$(uname -n)/g\" \"$SEARX_SETTINGS_PATH\"
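The ``proxies`` paragraph in settings.rst above states that several proxies for one protocol are used in a round-robin fashion. A minimal sketch of such a rotation with itertools.cycle — illustrative only, searx's own scheduling may differ:

from itertools import cycle

rotations = {
    'http': cycle(['http://proxy1:8080', 'http://proxy2:8080']),
    'https': cycle(['http://proxy1:8080', 'socks5h://user:password@proxy4:1080']),
}

def next_proxies():
    # requests-style proxies dict, advancing each protocol's rotation
    return {protocol: next(rotation) for protocol, rotation in rotations.items()}

print(next_proxies())  # first proxy of each protocol
print(next_proxies())  # second proxy of each protocol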
diff --git a/docs/conf.py b/docs/conf.py
index 4b348ae0e..d6fde9bec 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,11 +49,11 @@ extlinks['man'] = ('https://manpages.debian.org/jump?q=%s', '')
 #extlinks['role'] = (
 #    'https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html#role-%s', '')
 extlinks['duref'] = (
-    'http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#%s', '')
+    'https://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#%s', '')
 extlinks['durole'] = (
-    'http://docutils.sourceforge.net/docs/ref/rst/roles.html#%s', '')
+    'https://docutils.sourceforge.net/docs/ref/rst/roles.html#%s', '')
 extlinks['dudir'] = (
-    'http://docutils.sourceforge.net/docs/ref/rst/directives.html#%s', '')
+    'https://docutils.sourceforge.net/docs/ref/rst/directives.html#%s', '')
 extlinks['ctan'] = (
     'https://ctan.org/pkg/%s', 'CTAN: ')

diff --git a/docs/dev/contribution_guide.rst b/docs/dev/contribution_guide.rst
index 26f8d2bb7..90b22670c 100644
--- a/docs/dev/contribution_guide.rst
+++ b/docs/dev/contribution_guide.rst
@@ -117,8 +117,8 @@ Translation currently takes place on :ref:`transifex <translation>`.
 Documentation
 =============
 
-.. _Sphinx: http://www.sphinx-doc.org
-.. _reST: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
+.. _Sphinx: https://www.sphinx-doc.org
+.. _reST: https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
 
 .. sidebar:: The reST sources

diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst
index 0b5f9857f..3562ca61a 100644
--- a/docs/dev/engine_overview.rst
+++ b/docs/dev/engine_overview.rst
@@ -134,19 +134,19 @@ The function ``def request(query, params):`` always returns the ``params``
 variable.  Inside searx, the following paramters can be used to specify a search
 request:
 
-================== =========== ==========================================================================
-argument           type        information
-================== =========== ==========================================================================
-url                string      requested url
-method             string      HTTP request method
-headers            set         HTTP header information
-data               set         HTTP data information (parsed if ``method != 'GET'``)
-cookies            set         HTTP cookies
-verify             boolean     Performing SSL-Validity check
-max_redirects      int         maximum redirects, hard limit
-soft_max_redirects int         maximum redirects, soft limit. Record an error but don't stop the engine
-raise_for_status   bool        True by default: raise an exception if the HTTP code of response is >= 300
-================== =========== ==========================================================================
+=================== =========== ==========================================================================
+argument            type        information
+=================== =========== ==========================================================================
+url                 string      requested url
+method              string      HTTP request method
+headers             set         HTTP header information
+data                set         HTTP data information (parsed if ``method != 'GET'``)
+cookies             set         HTTP cookies
+verify              boolean     Performing SSL-Validity check
+max_redirects       int         maximum redirects, hard limit
+soft_max_redirects  int         maximum redirects, soft limit. Record an error but don't stop the engine
+raise_for_httperror bool        True by default: raise an exception if the HTTP code of response is >= 300
+=================== =========== ==========================================================================
 
 example code
 
@@ -265,7 +265,7 @@ latitude          latitude of result (in decimal format)
 longitude         longitude of result (in decimal format)
 boundingbox       boundingbox of result (array of 4. values ``[lat-min, lat-max, lon-min, lon-max]``)
-geojson           geojson of result (http://geojson.org)
+geojson           geojson of result (https://geojson.org/)
 osm.type          type of osm-object (if OSM-Result)
 osm.id            id of osm-object (if OSM-Result)
 address.name      name of object
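The table above renames ``raise_for_status`` to ``raise_for_httperror``; an engine sets it like any other entry in ``params``. A sketch of a ``request()`` hook using the soft redirect limit and the opt-out — the endpoint is made up, not a real engine:

from urllib.parse import urlencode

base_url = 'https://example.org/search?{query}'  # hypothetical endpoint

def request(query, params):
    params['url'] = base_url.format(query=urlencode({'q': query}))
    params['method'] = 'GET'
    params['soft_max_redirects'] = 1       # record an error on extra redirects, keep going
    params['raise_for_httperror'] = False  # this engine inspects the response itself
    return params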
diff --git a/docs/dev/quickstart.rst b/docs/dev/quickstart.rst
index 3e1a5e344..14af03fa6 100644
--- a/docs/dev/quickstart.rst
+++ b/docs/dev/quickstart.rst
@@ -60,7 +60,7 @@ read :ref:`make test`.
 How to compile styles and javascript
 ====================================
 
-.. _less: http://lesscss.org/
+.. _less: https://lesscss.org/
 .. _NodeJS: https://nodejs.org
 
 How to build styles

diff --git a/docs/dev/reST.rst b/docs/dev/reST.rst
index 906a0e9af..963378748 100644
--- a/docs/dev/reST.rst
+++ b/docs/dev/reST.rst
@@ -1391,27 +1391,27 @@ The next example shows the difference of ``\tfrac`` (*textstyle*) and ``\dfrac``
 .. _readability: https://docs.python-guide.org/writing/style/
 .. _Sphinx-Primer:
-    http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
+    https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
 .. _reST: https://docutils.sourceforge.io/rst.html
 .. _Sphinx Roles:
     https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html
-.. _Sphinx: http://www.sphinx-doc.org
-.. _`sphinx-doc FAQ`: http://www.sphinx-doc.org/en/stable/faq.html
+.. _Sphinx: https://www.sphinx-doc.org
+.. _`sphinx-doc FAQ`: https://www.sphinx-doc.org/en/stable/faq.html
 .. _Sphinx markup constructs:
-    http://www.sphinx-doc.org/en/stable/markup/index.html
+    https://www.sphinx-doc.org/en/stable/markup/index.html
 .. _`sphinx cross references`:
-    http://www.sphinx-doc.org/en/stable/markup/inline.html#cross-referencing-arbitrary-locations
+    https://www.sphinx-doc.org/en/stable/markup/inline.html#cross-referencing-arbitrary-locations
 .. _sphinx.ext.extlinks:
     https://www.sphinx-doc.org/en/master/usage/extensions/extlinks.html
-.. _intersphinx: http://www.sphinx-doc.org/en/stable/ext/intersphinx.html
-.. _sphinx config: http://www.sphinx-doc.org/en/stable/config.html
-.. _Sphinx's autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html
+.. _intersphinx: https://www.sphinx-doc.org/en/stable/ext/intersphinx.html
+.. _sphinx config: https://www.sphinx-doc.org/en/stable/config.html
+.. _Sphinx's autodoc: https://www.sphinx-doc.org/en/stable/ext/autodoc.html
 .. _Sphinx's Python domain:
-    http://www.sphinx-doc.org/en/stable/domains.html#the-python-domain
+    https://www.sphinx-doc.org/en/stable/domains.html#the-python-domain
 .. _Sphinx's C domain:
-    http://www.sphinx-doc.org/en/stable/domains.html#cross-referencing-c-constructs
+    https://www.sphinx-doc.org/en/stable/domains.html#cross-referencing-c-constructs
 .. _doctree:
-    http://www.sphinx-doc.org/en/master/extdev/tutorial.html?highlight=doctree#build-phases
+    https://www.sphinx-doc.org/en/master/extdev/tutorial.html?highlight=doctree#build-phases
 .. _docutils: http://docutils.sourceforge.net/docs/index.html
 .. _docutils FAQ: http://docutils.sourceforge.net/FAQ.html
 .. _linuxdoc: https://return42.github.io/linuxdoc
@@ -1424,5 +1424,5 @@ The next example shows the difference of ``\tfrac`` (*textstyle*) and ``\dfrac``
 .. _ImageMagick: https://www.imagemagick.org
 .. _`Emacs Table Mode`: https://www.emacswiki.org/emacs/TableMode
-.. _`Online Tables Generator`: http://www.tablesgenerator.com/text_tables
+.. _`Online Tables Generator`: https://www.tablesgenerator.com/text_tables
 .. _`OASIS XML Exchange Table Model`: https://www.oasis-open.org/specs/tm9901.html

diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index 420b8a461..fbe634a5b 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -113,7 +113,7 @@ def searx_bang(full_query):
 
 def dbpedia(query, lang):
     # dbpedia autocompleter, no HTTPS
-    autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'
+    autocomplete_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'
 
     response = get(autocomplete_url + urlencode(dict(QueryString=query)))
 
@@ -121,8 +121,7 @@ def dbpedia(query, lang):
     if response.ok:
         dom = etree.fromstring(response.content)
-        results = dom.xpath('//a:Result/a:Label//text()',
-                            namespaces={'a': 'http://lookup.dbpedia.org/'})
+        results = dom.xpath('//Result/Label//text()')
 
     return results

diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index ddd6a7feb..b2a9b25a4 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -20,6 +20,7 @@ import sys
 import threading
 from os.path import realpath, dirname
 from babel.localedata import locale_identifiers
+from urllib.parse import urlparse
 from flask_babel import gettext
 from operator import itemgetter
 from searx import settings
@@ -280,8 +281,12 @@ def initialize_engines(engine_list):
     load_engines(engine_list)
 
     def engine_init(engine_name, init_fn):
-        init_fn(get_engine_from_settings(engine_name))
-        logger.debug('%s engine: Initialized', engine_name)
+        try:
+            init_fn(get_engine_from_settings(engine_name))
+        except Exception:
+            logger.exception('%s engine: Fail to initialize', engine_name)
+        else:
+            logger.debug('%s engine: Initialized', engine_name)
 
     for engine_name, engine in engines.items():
         if hasattr(engine, 'init'):
@@ -289,3 +294,34 @@ def initialize_engines(engine_list):
             if init_fn:
                 logger.debug('%s engine: Starting background initialization', engine_name)
                 threading.Thread(target=engine_init, args=(engine_name, init_fn)).start()
+
+        _set_https_support_for_engine(engine)
+
+
+def _set_https_support_for_engine(engine):
+    # check HTTPS support if it is not disabled
+    if not engine.offline and not hasattr(engine, 'https_support'):
+        params = engine.request('http_test', {
+            'method': 'GET',
+            'headers': {},
+            'data': {},
+            'url': '',
+            'cookies': {},
+            'verify': True,
+            'auth': None,
+            'pageno': 1,
+            'time_range': None,
+            'language': '',
+            'safesearch': False,
+            'is_test': True,
+            'category': 'files',
+            'raise_for_status': True,
+        })
+
+        if 'url' not in params:
+            return
+
+        parsed_url = urlparse(params['url'])
+        https_support = parsed_url.scheme == 'https'
+
+        setattr(engine, 'https_support', https_support)
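``_set_https_support_for_engine()`` above derives ``https_support`` from nothing but the scheme of the URL the engine's ``request()`` builds for a dummy query; the preference templates changed later in this diff use the flag to show a "No HTTPS" icon. A toy engine illustrating what the probe sees:

from urllib.parse import urlparse

def request(query, params):  # toy engine with a plain-HTTP endpoint
    params['url'] = 'http://example.org/?q=' + query
    return params

params = request('http_test', {'method': 'GET', 'headers': {}, 'data': {},
                               'url': '', 'cookies': {}, 'verify': True})
https_support = urlparse(params['url']).scheme == 'https'
print(https_support)  # False -> the engine gets flagged in the preferences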
table_fixed")]//tr[not(th)]' @@ -40,7 +40,7 @@ def response(resp): for result in eval_xpath_list(dom, xpath_results): # defaults filesize = 0 - magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" + magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce" category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[])) page_a = eval_xpath_getindex(result, xpath_title, 0) diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index c702c5987..1190de363 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -19,7 +19,7 @@ from searx.utils import eval_xpath_list, eval_xpath_getindex categories = ['science'] paging = True -base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ +base_url = 'https://export.arxiv.org/api/query?search_query=all:'\ + '{query}&start={offset}&max_results={number_of_results}' # engine dependent config diff --git a/searx/engines/command.py b/searx/engines/command.py index 08ee5da06..0268d52eb 100644 --- a/searx/engines/command.py +++ b/searx/engines/command.py @@ -80,7 +80,7 @@ def search(query, params): def _get_command_to_run(query): - params = shlex_split(query.decode('utf-8')) + params = shlex_split(query) __check_query_params(params) cmd = [] diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index f41c135b9..87e21d0af 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -9,6 +9,7 @@ url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' weight = 100 parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) +https_support = True def normalize_name(name): diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 5a1fea3cf..727eb6598 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -20,6 +20,7 @@ weight = 100 parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' +https_support = True def request(query, params): diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 5a7649173..1d1c84b4b 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -10,7 +10,7 @@ DuckDuckGo (definitions) """ import json -from urllib.parse import urlencode +from urllib.parse import urlencode, urlparse, urljoin from lxml import html from searx import logger @@ -102,6 +102,8 @@ def response(resp): # image image = search_res.get('Image') image = None if image == '' else image + if image is not None and urlparse(image).netloc == '': + image = urljoin('https://duckduckgo.com', image) # urls # Official website, Wikipedia page diff --git a/searx/engines/duden.py b/searx/engines/duden.py index 1484a21e5..1475fb846 100644 --- a/searx/engines/duden.py +++ b/searx/engines/duden.py @@ -8,11 +8,10 @@ @parse url, title, content """ -from lxml import html, etree import re from urllib.parse import quote, urljoin -from searx.utils import extract_text, eval_xpath -from searx import logger +from lxml import html +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex categories = ['general'] paging = True @@ -40,6 +39,9 @@ def request(query, params): params['url'] = search_url_fmt.format(query=quote(query)) else: params['url'] = search_url.format(offset=offset, query=quote(query)) + # after the last page of results, spelling corrections are returned after a HTTP redirect + # whatever the page number is 
+ params['soft_max_redirects'] = 1 return params @@ -51,28 +53,21 @@ def response(resp): dom = html.fromstring(resp.text) - try: - number_of_results_string =\ - re.sub('[^0-9]', '', - eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]) - + number_of_results_element =\ + eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', + 0, default=None) + if number_of_results_element is not None: + number_of_results_string = re.sub('[^0-9]', '', number_of_results_element) results.append({'number_of_results': int(number_of_results_string)}) - except: - logger.debug("Couldn't read number of results.") - - for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'): - try: - url = eval_xpath(result, './/h2/a')[0].get('href') - url = urljoin(base_url, url) - title = eval_xpath(result, 'string(.//h2/a)').strip() - content = extract_text(eval_xpath(result, './/p')) - # append result - results.append({'url': url, - 'title': title, - 'content': content}) - except: - logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) - continue + for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'): + url = eval_xpath_getindex(result, './/h2/a', 0).get('href') + url = urljoin(base_url, url) + title = eval_xpath(result, 'string(.//h2/a)').strip() + content = extract_text(eval_xpath(result, './/p')) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) return results diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py deleted file mode 100644 index 0331e7b19..000000000 --- a/searx/engines/filecrop.py +++ /dev/null @@ -1,85 +0,0 @@ -from html.parser import HTMLParser -from urllib.parse import urlencode - - -url = 'http://www.filecrop.com/' -search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa - -paging = True - - -class FilecropResultParser(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) - - def __init__(self): - HTMLParser.__init__(self) - self.__start_processing = False - - self.results = [] - self.result = {} - - self.tr_counter = 0 - self.data_counter = 0 - - def handle_starttag(self, tag, attrs): - - if tag == 'tr': - if ('bgcolor', '#edeff5') in attrs or\ - ('bgcolor', '#ffffff') in attrs: - self.__start_processing = True - - if not self.__start_processing: - return - - if tag == 'label': - self.result['title'] = [attr[1] for attr in attrs - if attr[0] == 'title'][0] - elif tag == 'a' and ('rel', 'nofollow') in attrs\ - and ('class', 'sourcelink') in attrs: - if 'content' in self.result: - self.result['content'] += [attr[1] for attr in attrs - if attr[0] == 'title'][0] - else: - self.result['content'] = [attr[1] for attr in attrs - if attr[0] == 'title'][0] - self.result['content'] += ' ' - elif tag == 'a': - self.result['url'] = url + [attr[1] for attr in attrs - if attr[0] == 'href'][0] - - def handle_endtag(self, tag): - if self.__start_processing is False: - return - - if tag == 'tr': - self.tr_counter += 1 - - if self.tr_counter == 2: - self.__start_processing = False - self.tr_counter = 0 - self.data_counter = 0 - self.results.append(self.result) - self.result = {} - - def handle_data(self, data): - if not self.__start_processing: - return - - if 'content' in self.result: - self.result['content'] += data + ' ' - else: - self.result['content'] = data + ' ' - - self.data_counter += 1 
- - -def request(query, params): - index = 1 + (params['pageno'] - 1) * 30 - params['url'] = search_url.format(query=urlencode({'w': query}), index=index) - return params - - -def response(resp): - parser = FilecropResultParser() - parser.feed(resp.text) - - return parser.results diff --git a/searx/engines/genius.py b/searx/engines/genius.py index feb7d79d1..2bfbfddf5 100644 --- a/searx/engines/genius.py +++ b/searx/engines/genius.py @@ -36,7 +36,7 @@ def parse_lyric(hit): try: content = hit['highlights'][0]['value'] except: - content = None + content = '' timestamp = hit['result']['lyrics_updated_at'] result = {'url': hit['result']['url'], 'title': hit['result']['full_title'], @@ -51,7 +51,7 @@ def parse_lyric(hit): def parse_artist(hit): result = {'url': hit['result']['url'], 'title': hit['result']['name'], - 'content': None, + 'content': '', 'thumbnail': hit['result']['image_url'], 'template': 'videos.html'} return result @@ -61,6 +61,7 @@ def parse_album(hit): result = {'url': hit['result']['url'], 'title': hit['result']['full_title'], 'thumbnail': hit['result']['cover_art_url'], + 'content': '', # 'thumbnail': hit['result']['cover_art_thumbnail_url'], 'template': 'videos.html'} try: @@ -81,9 +82,7 @@ def response(resp): json = loads(resp.text) hits = [hit for section in json['response']['sections'] for hit in section['hits']] for hit in hits: - try: - func = parse[hit['type']] - except KeyError: - continue - results.append(func(hit)) + func = parse.get(hit['type']) + if func: + results.append(func(hit)) return results diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index c909ce11b..b785719d9 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -14,6 +14,8 @@ from datetime import datetime from json import loads from urllib.parse import urlencode from searx.utils import html_to_text, match_language +from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException +from searx.raise_for_httperror import raise_for_httperror # engine dependent config @@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region' category_to_keyword = {'general': 'web', 'images': 'images', - 'news': 'news', - 'social media': 'social'} + 'news': 'news'} # search-url url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' @@ -51,6 +52,7 @@ def request(query, params): params['url'] += '&locale=' + language.replace('-', '_').lower() params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' + params['raise_for_httperror'] = False return params @@ -58,8 +60,20 @@ def request(query, params): def response(resp): results = [] + # According to https://www.qwant.com/js/app.js + if resp.status_code == 429: + raise SearxEngineCaptchaException() + + # raise for other errors + raise_for_httperror(resp) + + # load JSON result search_results = loads(resp.text) + # check for an API error + if search_results.get('status') != 'success': + raise SearxEngineAPIException('API error ' + str(search_results.get('error', ''))) + # return empty array if there are no results if 'data' not in search_results: return [] @@ -90,15 +104,6 @@ def response(resp): 'thumbnail_src': thumbnail_src, 'img_src': img_src}) - elif category_to_keyword.get(categories[0], '') == 'social': - published_date = datetime.fromtimestamp(result['date'], None) - img_src = result.get('img', None) - results.append({'url': res_url, - 'title': title, - 'publishedDate': published_date, - 'content': content, - 'img_src': 
img_src}) - elif category_to_keyword.get(categories[0], '') == 'news': published_date = datetime.fromtimestamp(result['date'], None) media = result.get('media', []) diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py deleted file mode 100644 index 39916da6e..000000000 --- a/searx/engines/seedpeer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Seedpeer (Videos, Music, Files) -# -# @website https://seedpeer.me -# @provide-api no (nothing found) -# -# @using-api no -# @results HTML (using search portal) -# @stable yes (HTML can change) -# @parse url, title, content, seed, leech, magnetlink - -from lxml import html -from json import loads -from operator import itemgetter -from urllib.parse import quote, urljoin -from searx.utils import extract_text - - -url = 'https://seedpeer.me/' -search_url = url + 'search/{search_term}?page={page_no}' -torrent_file_url = url + 'torrent/{torrent_hash}' - -# specific xpath variables -script_xpath = '//script[@type="text/javascript"][not(@src)]' -torrent_xpath = '(//table)[2]/tbody/tr' -link_xpath = '(./td)[1]/a/@href' -age_xpath = '(./td)[2]' -size_xpath = '(./td)[3]' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(search_term=quote(query), - page_no=params['pageno']) - return params - - -# get response from search-request -def response(resp): - results = [] - dom = html.fromstring(resp.text) - result_rows = dom.xpath(torrent_xpath) - - try: - script_element = dom.xpath(script_xpath)[0] - json_string = script_element.text[script_element.text.find('{'):] - torrents_json = loads(json_string) - except: - return [] - - # parse results - for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']): - title = torrent_json['name'] - seed = int(torrent_json['seeds']) - leech = int(torrent_json['peers']) - size = int(torrent_json['size']) - torrent_hash = torrent_json['hash'] - - torrentfile = torrent_file_url.format(torrent_hash=torrent_hash) - magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash) - - age = extract_text(torrent_row.xpath(age_xpath)) - link = torrent_row.xpath(link_xpath)[0] - - href = urljoin(url, link) - - # append result - results.append({'url': href, - 'title': title, - 'content': age, - 'seed': seed, - 'leech': leech, - 'filesize': size, - 'torrentfile': torrentfile, - 'magnetlink': magnetlink, - 'template': 'torrent.html'}) - - # return results sorted by seeder - return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index b1e01759f..84ff21a88 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -91,7 +91,7 @@ def response(resp): for result in search_res.get('collection', []): if result['kind'] in ('track', 'playlist'): title = result['title'] - content = result['description'] + content = result['description'] or '' publishedDate = parser.parse(result['last_modified']) uri = quote_plus(result['uri']) embedded = embedded_url.format(uri=uri) diff --git a/searx/engines/translated.py b/searx/engines/translated.py index a50e7c830..75b8b5f42 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -15,6 +15,7 @@ categories = ['general'] url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 +https_support = True parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) api_key = '' diff --git a/searx/engines/wikidata.py 
b/searx/engines/wikidata.py index 60d0dc9a0..8d787caac 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -161,9 +161,6 @@ def request(query, params): def response(resp): results = [] - if resp.status_code != 200: - logger.debug('SPARQL endpoint error %s', resp.content.decode()) - resp.raise_for_status() jsonresponse = loads(resp.content.decode()) language = resp.search_params['language'].lower() diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 9fce170eb..54d75108e 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -14,6 +14,7 @@ from urllib.parse import quote from json import loads from lxml.html import fromstring from searx.utils import match_language, searx_useragent +from searx.raise_for_httperror import raise_for_httperror # search-url search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' @@ -37,7 +38,7 @@ def request(query, params): language=url_lang(params['language'])) params['headers']['User-Agent'] = searx_useragent() - params['raise_for_status'] = False + params['raise_for_httperror'] = False params['soft_max_redirects'] = 2 return params @@ -47,12 +48,13 @@ def request(query, params): def response(resp): if resp.status_code == 404: return [] + raise_for_httperror(resp) results = [] api_result = loads(resp.text) # skip disambiguation pages - if api_result['type'] != 'standard': + if api_result.get('type') != 'standard': return [] title = api_result['title'] diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index 8d691c852..b8f111a50 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -7,12 +7,12 @@ @using-api no @results HTML @stable no (HTML can change) - @parse url, title, thumbnail, img_src, content + @parse url, title, thumbnail """ -from lxml import html +from lxml import html, etree from urllib.parse import urlencode, urljoin -from searx.utils import extract_text +from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex # engine dependent config categories = ['images'] @@ -21,6 +21,7 @@ paging = False # search-url base_url = 'https://1x.com' search_url = base_url + '/backend/search.php?{query}' +gallery_url = 'https://gallery.1x.com/' # do search-request @@ -33,23 +34,18 @@ def request(query, params): # get response from search-request def response(resp): results = [] - - dom = html.fromstring(resp.text) - for res in dom.xpath('//div[@class="List-item MainListing"]'): - # processed start and end of link - link = res.xpath('//a')[0] - + xmldom = etree.fromstring(resp.content) + xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0) + dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div') + for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'): url = urljoin(base_url, link.attrib.get('href')) title = extract_text(link) - - thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src']) - # TODO: get image with higher resolution - img_src = thumbnail_src + thumbnail_src = urljoin(gallery_url, eval_xpath_getindex(link, './/img', 0).attrib['src']) # append result results.append({'url': url, 'title': title, - 'img_src': img_src, + 'img_src': thumbnail_src, 'content': '', 'thumbnail_src': thumbnail_src, 'template': 'images.html'}) diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index d420e250a..1507176ec 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -7,7 +7,6 @@ url_xpath = None content_xpath = None title_xpath = None thumbnail_xpath = False -categories = 
[] paging = False suggestion_xpath = '' results_xpath = '' @@ -39,7 +38,7 @@ def request(query, params): def response(resp): results = [] dom = html.fromstring(resp.text) - is_onion = True if 'onions' in categories else False + is_onion = True if 'onions' in categories else False # pylint: disable=undefined-variable if results_xpath: for result in eval_xpath_list(dom, results_xpath): diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 5f7d2ceab..36fc72e36 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -49,7 +49,7 @@ def response(resp): results = [] results_data = resp.text[resp.text.find('ytInitialData'):] - results_data = results_data[results_data.find('{'):results_data.find(';\n')] + results_data = results_data[results_data.find('{'):results_data.find(';</script>')] results_json = loads(results_data) if results_data else {} sections = results_json.get('contents', {})\ diff --git a/searx/exceptions.py b/searx/exceptions.py index 82c1d76dc..67a282da2 100644 --- a/searx/exceptions.py +++ b/searx/exceptions.py @@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException): """The website has returned an application error""" -class SearxEngineCaptchaException(SearxEngineResponseException): - """The website has returned a CAPTCHA""" +class SearxEngineAccessDeniedException(SearxEngineResponseException): + """The website is blocking the access""" + + def __init__(self, suspended_time=24 * 3600, message='Access denied'): + super().__init__(message + ', suspended_time=' + str(suspended_time)) + self.suspended_time = suspended_time + self.message = message + + +class SearxEngineCaptchaException(SearxEngineAccessDeniedException): + """The website has returned a CAPTCHA + + By default, searx stops sending requests to this engine for 1 day. + """ + + def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'): + super().__init__(message=message, suspended_time=suspended_time) + + +class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): + """The website has returned a Too Many Request status code + + By default, searx stops sending requests to this engine for 1 hour. 
+ """ + + def __init__(self, suspended_time=3600, message='Too many request'): + super().__init__(message=message, suspended_time=suspended_time) class SearxEngineXPathException(SearxEngineResponseException): diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py index 4b67235e1..fee1ef7d6 100644 --- a/searx/metrology/error_recorder.py +++ b/searx/metrology/error_recorder.py @@ -4,7 +4,8 @@ import logging from json import JSONDecodeError from urllib.parse import urlparse from requests.exceptions import RequestException -from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException +from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException, + SearxEngineAccessDeniedException) from searx import logger @@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple: return (exc.xpath_str, exc.message) if isinstance(exc, SearxEngineXPathException): return (exc.xpath_str, exc.message) + if isinstance(exc, SearxEngineAPIException): + return (str(exc.args[0]), ) + if isinstance(exc, SearxEngineAccessDeniedException): + return (exc.message, ) return () diff --git a/searx/poolrequests.py b/searx/poolrequests.py index 1eedc84b8..25a6baed9 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -7,6 +7,7 @@ import requests from searx import settings from searx import logger +from searx.raise_for_httperror import raise_for_httperror logger = logger.getChild('poolrequests') @@ -156,6 +157,12 @@ def request(method, url, **kwargs): if timeout is not None: kwargs['timeout'] = timeout + # raise_for_error + check_for_httperror = True + if 'raise_for_httperror' in kwargs: + check_for_httperror = kwargs['raise_for_httperror'] + del kwargs['raise_for_httperror'] + # do request response = session.request(method=method, url=url, **kwargs) @@ -176,6 +183,10 @@ def request(method, url, **kwargs): if hasattr(threadLocal, 'total_time'): threadLocal.total_time += time_after_request - time_before_request + # raise an exception + if check_for_httperror: + raise_for_httperror(response) + return response diff --git a/searx/raise_for_httperror.py b/searx/raise_for_httperror.py new file mode 100644 index 000000000..bd12df9a9 --- /dev/null +++ b/searx/raise_for_httperror.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +Raise exception for an HTTP response is an error. 
+""" +from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException, + SearxEngineAccessDeniedException) + + +def is_cloudflare_challenge(resp): + if resp.status_code in [429, 503]: + if ('__cf_chl_jschl_tk__=' in resp.text)\ + or ('/cdn-cgi/challenge-platform/' in resp.text + and 'orchestrate/jsch/v1' in resp.text + and 'window._cf_chl_enter(' in resp.text): + return True + if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text: + return True + return False + + +def is_cloudflare_firewall(resp): + return resp.status_code == 403 and '<span class="cf-error-code">1020</span>' in resp.text + + +def raise_for_cloudflare_captcha(resp): + if resp.headers.get('Server', '').startswith('cloudflare'): + if is_cloudflare_challenge(resp): + # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha- + # suspend for 2 weeks + raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15) + + if is_cloudflare_firewall(resp): + raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24) + + +def raise_for_recaptcha(resp): + if resp.status_code == 503 \ + and '"https://www.google.com/recaptcha/' in resp.text: + raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7) + + +def raise_for_captcha(resp): + raise_for_cloudflare_captcha(resp) + raise_for_recaptcha(resp) + + +def raise_for_httperror(resp): + """Raise exception for an HTTP response is an error. + + Args: + resp (requests.Response): Response to check + + Raises: + requests.HTTPError: raise by resp.raise_for_status() + searx.exceptions.SearxEngineAccessDeniedException: raise when the HTTP status code is 402 or 403. + searx.exceptions.SearxEngineTooManyRequestsException: raise when the HTTP status code is 429. + searx.exceptions.SearxEngineCaptchaException: raise when if CATPCHA challenge is detected. 
+ """ + if resp.status_code and resp.status_code >= 400: + raise_for_captcha(resp) + if resp.status_code in (402, 403): + raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code), + suspended_time=3600 * 24) + if resp.status_code == 429: + raise SearxEngineTooManyRequestsException() + resp.raise_for_status() diff --git a/searx/results.py b/searx/results.py index 5bf4e6b9e..fb7e816eb 100644 --- a/searx/results.py +++ b/searx/results.py @@ -309,10 +309,11 @@ class ResultContainer: for res in results: # FIXME : handle more than one category per engine - res['category'] = engines[res['engine']].categories[0] + engine = engines[res['engine']] + res['category'] = engine.categories[0] if len(engine.categories) > 0 else '' # FIXME : handle more than one category per engine - category = engines[res['engine']].categories[0]\ + category = res['category']\ + ':' + res.get('template', '')\ + ':' + ('img_src' if 'img_src' in res or 'thumbnail' in res else '') diff --git a/searx/search.py b/searx/search.py index 8c2ad8d72..220950803 100644 --- a/searx/search.py +++ b/searx/search.py @@ -32,7 +32,8 @@ from searx.utils import gen_useragent from searx.results import ResultContainer from searx import logger from searx.plugins import plugins -from searx.exceptions import SearxEngineCaptchaException +from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException, + SearxEngineTooManyRequestsException,) from searx.metrology.error_recorder import record_exception, record_error @@ -131,6 +132,9 @@ def send_http_request(engine, request_params): # soft_max_redirects soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0) + # raise_for_status + request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False) + # specific type of request (GET or POST) if request_params['method'] == 'GET': req = requests_lib.get @@ -142,10 +146,6 @@ def send_http_request(engine, request_params): # send the request response = req(request_params['url'], **request_args) - # check HTTP status - if request_params.get('raise_for_status'): - response.raise_for_status() - # check soft limit of the redirect count if len(response.history) > soft_max_redirects: # unexpected redirect : record an error @@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont # suppose everything will be alright requests_exception = False + suspended_time = None try: # send requests and parse the results @@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont elif (issubclass(e.__class__, SearxEngineCaptchaException)): result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required') logger.exception('engine {0} : CAPTCHA') + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)): + result_container.add_unresponsive_engine(engine_name, 'too many requests') + logger.exception('engine {0} : Too many requests') + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineAccessDeniedException)): + result_container.add_unresponsive_engine(engine_name, 'blocked') + logger.exception('engine {0} : Searx is blocked') + suspended_time = e.suspended_time # pylint: disable=no-member else: result_container.add_unresponsive_engine(engine_name, 'unexpected crash') # others errors @@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, 
diff --git a/searx/search.py b/searx/search.py
index 8c2ad8d72..220950803 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -32,7 +32,8 @@ from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx import logger
 from searx.plugins import plugins
-from searx.exceptions import SearxEngineCaptchaException
+from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
+                              SearxEngineTooManyRequestsException,)
 from searx.metrology.error_recorder import record_exception, record_error
 
 
@@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
     # soft_max_redirects
     soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
 
+    # raise_for_status
+    request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
+
     # specific type of request (GET or POST)
     if request_params['method'] == 'GET':
         req = requests_lib.get
@@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
     # send the request
     response = req(request_params['url'], **request_args)
 
-    # check HTTP status
-    if request_params.get('raise_for_status'):
-        response.raise_for_status()
-
     # check soft limit of the redirect count
     if len(response.history) > soft_max_redirects:
         # unexpected redirect : record an error
@@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
 
     # suppose everything will be alright
     requests_exception = False
+    suspended_time = None
 
     try:
         # send requests and parse the results
@@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
         elif (issubclass(e.__class__, SearxEngineCaptchaException)):
             result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
             logger.exception('engine {0} : CAPTCHA')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
+        elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
+            result_container.add_unresponsive_engine(engine_name, 'too many requests')
+            logger.exception('engine {0} : Too many requests')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
+        elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
+            result_container.add_unresponsive_engine(engine_name, 'blocked')
+            logger.exception('engine {0} : Searx is blocked')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
         else:
             result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
             # others errors
@@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
         if getattr(threading.current_thread(), '_timeout', False):
             record_error(engine_name, 'Timeout')
 
-    # suspend or not the engine if there are HTTP errors
+    # suspend the engine if there is an HTTP error
+    # or suspended_time is defined
     with threading.RLock():
-        if requests_exception:
+        if requests_exception or suspended_time:
             # update continuous_errors / suspend_end_time
             engine.continuous_errors += 1
-            engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
-                                                   engine.continuous_errors * settings['search']['ban_time_on_fail'])
+            if suspended_time is None:
+                suspended_time = min(settings['search']['max_ban_time_on_fail'],
+                                     engine.continuous_errors * settings['search']['ban_time_on_fail'])
+            engine.suspend_end_time = time() + suspended_time
         else:
-            # no HTTP error (perhaps an engine error)
-            # anyway, reset the suspend variables
+            # reset the suspend variables
             engine.continuous_errors = 0
             engine.suspend_end_time = 0
 
@@ -342,7 +354,7 @@ def default_request_params():
         'cookies': {},
         'verify': True,
         'auth': None,
-        'raise_for_status': True
+        'raise_for_httperror': True
     }
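The suspend logic above now prefers an exception's own ``suspended_time`` and only falls back to the escalating formula built from ``ban_time_on_fail`` and ``max_ban_time_on_fail``. A worked sketch of that fallback — the 5 s / 120 s values are assumed defaults, check your settings.yml:

BAN_TIME_ON_FAIL = 5        # seconds, assumed default
MAX_BAN_TIME_ON_FAIL = 120  # seconds, assumed default

def fallback_suspend_time(continuous_errors):
    # escalating ban used when the exception carries no suspended_time
    return min(MAX_BAN_TIME_ON_FAIL, continuous_errors * BAN_TIME_ON_FAIL)

for errors in (1, 3, 24, 50):
    print(errors, fallback_suspend_time(errors))  # 5, 15, 120, 120 (capped)

# a SearxEngineTooManyRequestsException instead suspends for its own
# suspended_time (3600 s by default), bypassing this formula entirely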
diff --git a/searx/settings.yml b/searx/settings.yml
index 486521d6d..e263e3ad4 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -289,6 +289,7 @@ engines:
   - name : 1x
     engine : www1x
     shortcut : 1x
+    timeout : 3.0
     disabled : True
 
   - name : fdroid
@@ -490,7 +491,7 @@ engines:
 
   - name : library genesis
     engine : xpath
-    search_url : https://libgen.is/search.php?req={query}
+    search_url : http://libgen.rs/search.php?req={query}
     url_xpath : //a[contains(@href,"bookfi.net")]/@href
     title_xpath : //a[contains(@href,"book/")]/text()[1]
     content_xpath : //td/a[1][contains(@href,"=author")]/text()
@@ -646,11 +647,6 @@ engines:
     shortcut : qwn
     categories : news
 
-  - name : qwant social
-    engine : qwant
-    shortcut : qws
-    categories : social media
-
 #  - name: library
 #    engine: recoll
 #    shortcut: lib
@@ -808,12 +804,13 @@ engines:
       # Or you can use the html non-stable engine, activated by default
       engine : youtube_noapi
 
-  - name : yggtorrent
-    engine : yggtorrent
-    shortcut : ygg
-    url: https://www2.yggtorrent.si/
-    disabled : True
-    timeout : 4.0
+  # tmp suspended: Cloudflare CAPTCHA
+  #- name : yggtorrent
+  #  engine : yggtorrent
+  #  shortcut : ygg
+  #  url: https://www2.yggtorrent.si/
+  #  disabled : True
+  #  timeout : 4.0
 
   - name : dailymotion
     engine : dailymotion
@@ -958,12 +955,6 @@ engines:
     page_size : 10
     disabled : True
 
-  - name : seedpeer
-    shortcut : speu
-    engine : seedpeer
-    categories: files, music, videos
-
   - name : naver
     shortcut: nvr
     engine: xpath

diff --git a/searx/templates/oscar/macros.html b/searx/templates/oscar/macros.html
index 2bc1e7805..f40eebd37 100644
--- a/searx/templates/oscar/macros.html
+++ b/searx/templates/oscar/macros.html
@@ -1,6 +1,6 @@
 <!-- Draw glyphicon icon from bootstrap-theme -->
-{% macro icon(action) -%}
-    <span class="glyphicon glyphicon-{{ action }}"></span>
+{% macro icon(action, alt) -%}
+    <span title="{{ alt }}" class="glyphicon glyphicon-{{ action }}"></span>
 {%- endmacro %}
 
 <!-- Draw favicon -->

diff --git a/searx/templates/oscar/preferences.html b/searx/templates/oscar/preferences.html
index bc688dade..fc20b8ca5 100644
--- a/searx/templates/oscar/preferences.html
+++ b/searx/templates/oscar/preferences.html
@@ -230,8 +230,8 @@
                     <td class="onoff-checkbox">
                         {{ checkbox_toggle('engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_'), (search_engine.name, categ) in disabled_engines) }}
                     </td>
-                    <th scope="row">{{ search_engine.name }}</th>
-                    <td class="name">{{ shortcuts[search_engine.name] }}</td>
+                    <th scope="row">{% if not search_engine.https_support %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif %} {{ search_engine.name }}</th>
+                    <td class="name">{{ shortcuts[search_engine.name] }}</td>
                     <td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
                     <td>{{ support_toggle(search_engine.safesearch==True) }}</td>
                     <td>{{ support_toggle(search_engine.time_range_support==True) }}</td>

diff --git a/searx/templates/simple/macros.html b/searx/templates/simple/macros.html
index cacbbec9f..1eb42667a 100644
--- a/searx/templates/simple/macros.html
+++ b/searx/templates/simple/macros.html
@@ -1,6 +1,6 @@
 <!-- Draw glyphicon icon from bootstrap-theme -->
-{% macro icon(action) -%}
-    <span class="ion-icon-big ion-{{ action }}"></span>
+{% macro icon(action, alt) -%}
+    <span title="{{ alt }}" class="ion-icon-big ion-{{ action }}"></span>
 {%- endmacro %}
 
 {% macro icon_small(action) -%}

diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html
index d68e4be5f..f091a97cf 100644
--- a/searx/templates/simple/preferences.html
+++ b/searx/templates/simple/preferences.html
@@ -1,4 +1,4 @@
-{% from 'simple/macros.html' import tabs_open, tabs_close, tab_header, tab_footer, checkbox_onoff, checkbox %}
+{% from 'simple/macros.html' import icon, tabs_open, tabs_close, tab_header, tab_footer, checkbox_onoff, checkbox %}
 
 {% extends "simple/base.html" %}
 
@@ -121,7 +121,7 @@
                 {% set engine_id = 'engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_') %}
                 <tr>
                     <td class="engine_checkbox">{{ checkbox_onoff(engine_id, (search_engine.name, categ) in disabled_engines) }}</td>
-                    <th class="name">{{ search_engine.name }}</th>
+                    <th class="name">{% if not search_engine.https_support %}{{ icon('warning', 'No HTTPS') }}{% endif %} {{ search_engine.name }}</th>
                     <td class="shortcut">{{ shortcuts[search_engine.name] }}</td>
                     <td>{{ checkbox(engine_id + '_supported_languages', current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages, true, true) }}</td>
                     <td>{{ checkbox(engine_id + '_safesearch', search_engine.safesearch==True, true, true) }}</td>

diff --git a/utils/makefile.python b/utils/makefile.python
index 6c6696964..668b0894b 100644
--- a/utils/makefile.python
+++ b/utils/makefile.python
@@ -252,7 +252,7 @@ pyenv-python: pyenv-install
 # PyPi is required and since uploads via setuptools is not recommended, we have
 # to imstall / use twine ... its really a mess.
 #
-# [1] http://python-packaging.readthedocs.io/en/latest/dependencies.html#packages-not-on-pypi
+# [1] https://python-packaging.readthedocs.io/en/latest/dependencies.html#packages-not-on-pypi
 # [2] https://github.com/pypa/pip/pull/1519
 #     https://github.com/pypa/twine

diff --git a/utils/searx.sh b/utils/searx.sh
index 386b2861f..06b3c2dfc 100755
--- a/utils/searx.sh
+++ b/utils/searx.sh
@@ -36,6 +36,7 @@ GIT_BRANCH="${GIT_BRANCH:-master}"
 SEARX_PYENV="${SERVICE_HOME}/searx-pyenv"
 SEARX_SRC="${SERVICE_HOME}/searx-src"
 SEARX_SETTINGS_PATH="/etc/searx/settings.yml"
+SEARX_SETTINGS_TEMPLATE="${REPO_ROOT}/utils/templates/etc/searx/use_default_settings.yml"
 SEARX_UWSGI_APP="searx.ini"
 # shellcheck disable=SC2034
 SEARX_UWSGI_SOCKET="/run/uwsgi/app/searx/socket"
@@ -139,7 +140,7 @@ usage() {
     cat <<EOF
 usage::
   $(basename "$0") shell
-  $(basename "$0") install    [all|user|searx-src|pyenv|uwsgi|packages|buildhost]
+  $(basename "$0") install    [all|user|searx-src|pyenv|uwsgi|packages|settings|buildhost]
   $(basename "$0") update     [searx]
   $(basename "$0") remove     [all|user|pyenv|searx-src]
  $(basename "$0") activate   [service]
@@ -413,14 +414,14 @@ install_settings() {
     if [[ ! -f ${SEARX_SETTINGS_PATH} ]]; then
         info_msg "install settings ${REPO_ROOT}/searx/settings.yml"
         info_msg "  --> ${SEARX_SETTINGS_PATH}"
-        cp "${REPO_ROOT}/searx/settings.yml" "${SEARX_SETTINGS_PATH}"
+        cp "${SEARX_SETTINGS_TEMPLATE}" "${SEARX_SETTINGS_PATH}"
         configure_searx
         return
     fi
 
     rst_para "Diff between origin's setting file (+) and current (-):"
-    echo
-    $DIFF_CMD "${SEARX_SETTINGS_PATH}" "${SEARX_SRC}/searx/settings.yml"
+    echo "${SEARX_SETTINGS_PATH}" "${SEARX_SETTINGS_TEMPLATE}"
+    $DIFF_CMD "${SEARX_SETTINGS_PATH}" "${SEARX_SETTINGS_TEMPLATE}"
 
     local action
     choose_one action "What should happen to the settings file? " \
@@ -434,7 +435,7 @@ install_settings() {
         "use origin settings")
             backup_file "${SEARX_SETTINGS_PATH}"
             info_msg "install origin settings"
-            cp "${SEARX_SRC}/searx/settings.yml" "${SEARX_SETTINGS_PATH}"
+            cp "${SEARX_SETTINGS_TEMPLATE}" "${SEARX_SETTINGS_PATH}"
             ;;
         "start interactiv shell")
             backup_file "${SEARX_SETTINGS_PATH}"
@@ -442,7 +443,7 @@ install_settings() {
             sudo -H -i
             rst_para 'Diff between new setting file (-) and current (+):'
             echo
-            $DIFF_CMD "${SEARX_SRC}/searx/settings.yml" "${SEARX_SETTINGS_PATH}"
+            $DIFF_CMD "${SEARX_SETTINGS_TEMPLATE}" "${SEARX_SETTINGS_PATH}"
             wait_key
             ;;
     esac

diff --git a/utils/templates/etc/searx/use_default_settings.yml b/utils/templates/etc/searx/use_default_settings.yml
new file mode 100644
index 000000000..e019a25bb
--- /dev/null
+++ b/utils/templates/etc/searx/use_default_settings.yml
@@ -0,0 +1,22 @@
+use_default_settings: True
+
+general:
+    debug : False # Debug mode, only for development
+    instance_name : "searx" # displayed name
+
+search:
+    safe_search : 0 # Filter results. 0: None, 1: Moderate, 2: Strict
+    autocomplete : "" # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "startpage", "swisscows", "qwant", "wikipedia" - leave blank to turn it off by default
+    default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py'
+
+server:
+    port : 8888
+    bind_address : "127.0.0.1" # address to listen on
+    secret_key : "ultrasecretkey" # change this!
+    base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/"
+    image_proxy : False # Proxying image results through searx
+
+# uncomment below section if you have a running morty proxy
+#result_proxy:
+#    url : http://127.0.0.1:3000/
+#    key : !!binary "your_morty_proxy_key"