summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/admin/engines/settings.rst2
-rw-r--r--docs/src/searx.botdetection.rst45
-rw-r--r--docs/src/searx.plugins.limiter.rst13
-rw-r--r--requirements.txt1
-rw-r--r--searx/botdetection/__init__.py27
-rw-r--r--searx/botdetection/_helpers.py121
-rw-r--r--searx/botdetection/http_accept.py39
-rw-r--r--searx/botdetection/http_accept_encoding.py41
-rw-r--r--searx/botdetection/http_accept_language.py35
-rw-r--r--searx/botdetection/http_connection.py37
-rw-r--r--searx/botdetection/http_user_agent.py67
-rw-r--r--searx/botdetection/ip_limit.py146
-rw-r--r--searx/botdetection/limiter.py118
-rw-r--r--searx/botdetection/limiter.toml22
-rw-r--r--searx/botdetection/link_token.py156
-rw-r--r--searx/plugins/limiter.py103
-rw-r--r--searx/plugins/self_info.py31
-rw-r--r--searx/templates/simple/base.html3
-rw-r--r--searx/tools/__init__.py8
-rw-r--r--searx/tools/config.py376
-rwxr-xr-xsearx/webapp.py8
-rw-r--r--tests/unit/test_plugins.py12
22 files changed, 1273 insertions, 138 deletions
diff --git a/docs/admin/engines/settings.rst b/docs/admin/engines/settings.rst
index f9a1dad4f..63478f441 100644
--- a/docs/admin/engines/settings.rst
+++ b/docs/admin/engines/settings.rst
@@ -235,7 +235,7 @@ Global Settings
``limiter`` :
Rate limit the number of request on the instance, block some bots. The
- :ref:`limiter plugin` requires a :ref:`settings redis` database.
+ :ref:`limiter src` requires a :ref:`settings redis` database.
.. _image_proxy:
diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst
new file mode 100644
index 000000000..85e0ce4cd
--- /dev/null
+++ b/docs/src/searx.botdetection.rst
@@ -0,0 +1,45 @@
+.. _botdetection:
+
+=============
+Bot Detection
+=============
+
+.. contents:: Contents
+ :depth: 2
+ :local:
+ :backlinks: entry
+
+.. automodule:: searx.botdetection
+ :members:
+
+.. automodule:: searx.botdetection.limiter
+ :members:
+
+
+Rate limit
+==========
+
+.. automodule:: searx.botdetection.ip_limit
+ :members:
+
+.. automodule:: searx.botdetection.link_token
+ :members:
+
+
+Probe HTTP headers
+==================
+
+.. automodule:: searx.botdetection.http_accept
+ :members:
+
+.. automodule:: searx.botdetection.http_accept_encoding
+ :members:
+
+.. automodule:: searx.botdetection.http_accept_language
+ :members:
+
+.. automodule:: searx.botdetection.http_connection
+ :members:
+
+.. automodule:: searx.botdetection.http_user_agent
+ :members:
diff --git a/docs/src/searx.plugins.limiter.rst b/docs/src/searx.plugins.limiter.rst
deleted file mode 100644
index 75d06f5c2..000000000
--- a/docs/src/searx.plugins.limiter.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. _limiter plugin:
-
-==============
-Limiter Plugin
-==============
-
-.. sidebar:: info
-
- The :ref:`limiter plugin` requires a :ref:`Redis <settings redis>` database.
-
-.. automodule:: searx.plugins.limiter
- :members:
-
diff --git a/requirements.txt b/requirements.txt
index 19318f53b..12926f616 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ redis==4.5.5
markdown-it-py==2.2.0
typing_extensions==4.6.3
fasttext-predict==0.9.2.1
+pytomlpp==1.0.13
diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py
new file mode 100644
index 000000000..fcd8e5630
--- /dev/null
+++ b/searx/botdetection/__init__.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection src:
+
+X-Forwarded-For
+===============
+
+.. attention::
+
+ A correct setup of the HTTP request headers ``X-Forwarded-For`` and
+ ``X-Real-IP`` is essential to be able to assign a request to an IP correctly:
+
+ - `NGINX RequestHeader`_
+ - `Apache RequestHeader`_
+
+.. _NGINX RequestHeader:
+ https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site
+.. _Apache RequestHeader:
+ https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site
+
+.. autofunction:: searx.botdetection.get_real_ip
+
+"""
+
+from ._helpers import dump_request
+from ._helpers import get_real_ip
+from ._helpers import too_many_requests
diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py
new file mode 100644
index 000000000..8e0156d6e
--- /dev/null
+++ b/searx/botdetection/_helpers.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, invalid-name
+from __future__ import annotations
+
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+ IPv6Address,
+ ip_address,
+ ip_network,
+)
+import flask
+import werkzeug
+
+from searx.tools import config
+from searx import logger
+
+logger = logger.getChild('botdetection')
+
+
+def dump_request(request: flask.Request):
+    """Return a one-line, ``||`` separated summary of the request.
+
+    The summary starts with the request path, followed by the form data and
+    the HTTP headers that are of interest for the bot detection.  A header
+    that is not set is rendered as ``None``.
+    """
+    headers = request.headers
+    fields = (
+        ('X-Forwarded-For', headers.get('X-Forwarded-For')),
+        ('X-Real-IP', headers.get('X-Real-IP')),
+        ('form', request.form),
+        ('Accept', headers.get('Accept')),
+        ('Accept-Language', headers.get('Accept-Language')),
+        ('Accept-Encoding', headers.get('Accept-Encoding')),
+        ('Content-Type', headers.get('Content-Type')),
+        ('Content-Length', headers.get('Content-Length')),
+        ('Connection', headers.get('Connection')),
+        ('User-Agent', headers.get('User-Agent')),
+    )
+    return request.path + ''.join(f" || {label}: {value}" for label, value in fields)
+
+
+def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
+    """Build the default ``Too Many Requests`` (HTTP 429) response.
+
+    Before the response object is created, a ``BLOCK`` message with the
+    (client) network and ``log_msg`` is written at DEBUG level to the
+    'botdetection' logger.  The filter methods use this function to reject a
+    request.
+    """
+    logger.debug("BLOCK %s: %s", network.compressed, log_msg)
+    block_response = flask.make_response(('Too Many Requests', 429))
+    return block_response
+
+
+def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
+    """Returns the (client) network the ``real_ip`` is part of.
+
+    The prefix length is read from ``cfg``: ``real_ip.ipv6_prefix`` for an
+    IPv6 address, ``real_ip.ipv4_prefix`` otherwise.  The host bits of
+    ``real_ip`` are masked out (``strict=False``).
+    """
+    is_ipv6 = isinstance(ip_address(real_ip), IPv6Address)
+    prefix_key = 'real_ip.ipv6_prefix' if is_ipv6 else 'real_ip.ipv4_prefix'
+    return ip_network(f"{real_ip}/{cfg[prefix_key]}", strict=False)
+
+
+def get_real_ip(request: flask.Request) -> str:
+    """Returns real IP of the request.  Since not all proxies set all the HTTP
+    headers and incoming headers can be faked it may happen that the IP cannot
+    be determined correctly.
+
+    .. sidebar:: :py:obj:`flask.Request.remote_addr`
+
+       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).
+
+    This function tries to get the remote IP in the order listed below,
+    additionally some tests are done and if inconsistencies or errors are
+    detected, they are logged.
+
+    The remote IP of the request is taken from (first match):
+
+    - X-Forwarded-For_ header
+    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
+    - :py:obj:`flask.Request.remote_addr`
+
+    From the X-Forwarded-For_ header the ``real_ip.x_for``-th value counted
+    from the right is used (or the left-most value if the header has fewer
+    values).  A configured ``real_ip.x_for`` lower than 1 is treated as 1.  If
+    none of the sources provides an IP, ``0.0.0.0`` is returned.
+
+    .. _ProxyFix:
+       https://werkzeug.palletsprojects.com/middleware/proxy_fix/
+
+    .. _X-Forwarded-For:
+       https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+    """
+
+    forwarded_for = request.headers.get("X-Forwarded-For")
+    real_ip = request.headers.get('X-Real-IP')
+    remote_addr = request.remote_addr
+
+    if not forwarded_for:
+        logger.error("X-Forwarded-For header is not set!")
+    else:
+        from .limiter import get_cfg  # pylint: disable=import-outside-toplevel, cyclic-import
+
+        forwarded_for = [x.strip() for x in forwarded_for.split(',')]
+        # Trust at least one value: with x_for == 0 the index below would be
+        # -0 == 0 and select the left-most (client controlled) value, a
+        # negative x_for would result in a positive index (IndexError).
+        x_for: int = max(1, get_cfg()['real_ip.x_for'])
+        forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
+
+    if not real_ip:
+        logger.error("X-Real-IP header is not set!")
+
+    # the three sources are compared pairwise, inconsistencies are only logged
+    if forwarded_for and real_ip and forwarded_for != real_ip:
+        logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)
+
+    if forwarded_for and remote_addr and forwarded_for != remote_addr:
+        logger.warning(
+            "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
+        )
+
+    if real_ip and remote_addr and real_ip != remote_addr:
+        logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
+
+    request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
+    return request_ip
diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py
new file mode 100644
index 000000000..b78a86278
--- /dev/null
+++ b/searx/botdetection/http_accept.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_accept``
+----------------------
+
+The ``http_accept`` method evaluates a request as the request of a bot if the
+Accept_ header ..
+
+- did not contain ``text/html``
+
+.. _Accept:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+    """Reject the request (HTTP 429) if ``text/html`` is not one of the MIME
+    types accepted by the client, otherwise return ``None``."""
+
+    accepts_html = 'text/html' in request.accept_mimetypes
+    if accepts_html:
+        return None
+    return too_many_requests(network, "HTTP header Accept did not contain text/html")
diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py
new file mode 100644
index 000000000..60718a4ca
--- /dev/null
+++ b/searx/botdetection/http_accept_encoding.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_accept_encoding``
+-------------------------------
+
+The ``http_accept_encoding`` method evaluates a request as the request of a
+bot if the Accept-Encoding_ header ..
+
+- did not contain ``gzip`` AND did not contain ``deflate`` (the request is only
+ rated as a bot request if both values are missing)
+
+.. _Accept-Encoding:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+    """Reject the request (HTTP 429) if the ``Accept-Encoding`` header lists
+    neither ``gzip`` nor ``deflate``, otherwise return ``None``."""
+
+    # Content codings are case-insensitive and may carry a weight (e.g.
+    # ``gzip;q=1.0``): only the lower-cased name of the coding is compared.
+    codings = {
+        item.split(';', 1)[0].strip().lower() for item in request.headers.get('Accept-Encoding', '').split(',')
+    }
+    if 'gzip' not in codings and 'deflate' not in codings:
+        return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
+    return None
diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py
new file mode 100644
index 000000000..395d28bfd
--- /dev/null
+++ b/searx/botdetection/http_accept_language.py
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_accept_language``
+-------------------------------
+
+The ``http_accept_language`` method evaluates a request as the request of a bot
+if the Accept-Language_ header is unset.
+
+.. _Accept-Language:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language
+
+"""
+# pylint: disable=unused-argument
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+    """Reject the request (HTTP 429) if the ``Accept-Language`` header is
+    missing or blank, otherwise return ``None``."""
+    accept_language = request.headers.get('Accept-Language', '').strip()
+    if not accept_language:
+        return too_many_requests(network, "missing HTTP header Accept-Language")
+    return None
diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py
new file mode 100644
index 000000000..ee0d80a23
--- /dev/null
+++ b/searx/botdetection/http_connection.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_connection``
+--------------------------
+
+The ``http_connection`` method evaluates a request as the request of a bot if
+the Connection_ header is set to ``close``.
+
+.. _Connection:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+    """Reject the request (HTTP 429) if the ``Connection`` header is set to
+    ``close``, otherwise return ``None``."""
+
+    # connection options are case-insensitive: ``Close`` is the same as ``close``
+    if request.headers.get('Connection', '').strip().lower() == 'close':
+        return too_many_requests(network, "HTTP header 'Connection=close'")
+    return None
diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py
new file mode 100644
index 000000000..17025f68b
--- /dev/null
+++ b/searx/botdetection/http_user_agent.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_user_agent``
+--------------------------
+
+The ``http_user_agent`` method evaluates a request as the request of a bot if
+the User-Agent_ header is unset or matches the regular expression
+:py:obj:`USER_AGENT`.
+
+.. _User-Agent:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+import re
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+
+from searx.tools import config
+from ._helpers import too_many_requests
+
+
+_BOT_PATTERNS = (
+    # fmt: off
+    r'unknown',
+    r'[Cc][Uu][Rr][Ll]', r'[wW]get', r'Scrapy', r'splash', r'JavaFX', r'FeedFetcher', r'python-requests',
+    r'Go-http-client', r'Java', r'Jakarta', r'okhttp',
+    r'HttpClient', r'Jersey', r'Python', r'libwww-perl', r'Ruby', r'SynHttpClient', r'UniversalFeedParser',
+    r'Googlebot', r'GoogleImageProxy',
+    r'bingbot', r'Baiduspider', r'yacybot', r'YandexMobileBot', r'YandexBot', r'Yahoo! Slurp', r'MJ12bot',
+    r'AhrefsBot', r'archive.org_bot', r'msnbot',
+    r'MJ12bot', r'SeznamBot', r'linkdexbot', r'Netvibes', r'SMTBot', r'zgrab', r'James BOT', r'Sogou',
+    r'Abonti', r'Pixray', r'Spinn3r', r'SemrushBot', r'Exabot',
+    r'ZmEu', r'BLEXBot', r'bitlybot',
+    # unmaintained Farside instances
+    re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)'),
+    # other bots and client to block
+    '.*PetalBot.*',
+    # fmt: on
+)
+
+USER_AGENT = '(' + '|'.join(_BOT_PATTERNS) + ')'
+"""Regular expression that matches to User-Agent_ from known *bots*"""
+
+_regexp = None
+
+
+def regexp_user_agent():
+    """Return the compiled :py:obj:`USER_AGENT` pattern.  The pattern is
+    compiled on the first call and cached at module level."""
+    global _regexp  # pylint: disable=global-statement
+    if _regexp is None:
+        _regexp = re.compile(USER_AGENT)
+    return _regexp
+
+
+def filter_request(
+    network: IPv4Network | IPv6Network,
+    request: flask.Request,
+    cfg: config.Config,
+) -> werkzeug.Response | None:
+    """Reject the request (HTTP 429) if the ``User-Agent`` header is unset or
+    matches :py:obj:`USER_AGENT`, otherwise return ``None``."""
+
+    # a missing header is handled like the User-Agent 'unknown'
+    user_agent = request.headers.get('User-Agent', 'unknown')
+    if regexp_user_agent().match(user_agent) is None:
+        return None
+    return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py
new file mode 100644
index 000000000..bb4229f0e
--- /dev/null
+++ b/searx/botdetection/ip_limit.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection.ip_limit:
+
+Method ``ip_limit``
+-------------------
+
+The ``ip_limit`` method counts request from an IP in *sliding windows*. If
+there are too many requests in a sliding window, the request is evaluated as a
+bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_
+header. To take privacy only the hash value of an IP is stored in the redis DB
+and at least for a maximum of 10 minutes.
+
+The :py:obj:`.link_token` method can be used to investigate whether a request is
+*suspicious*. To activate the :py:obj:`.link_token` method in the
+:py:obj:`.ip_limit` method add the following to your
+``/etc/searxng/limiter.toml``:
+
+.. code:: toml
+
+ [botdetection.ip_limit]
+ link_token = true
+
+If the :py:obj:`.link_token` method is activated and a request is *suspicious*
+the request rates are reduced:
+
+- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
+- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
+
+To intercept bots that get their IPs from a range of IPs, there is a
+:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
+for a longer time. IPs stored in this sliding window have a maximum of
+:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
+makes a request that is not suspicious, the sliding window for this IP is
+dropped.
+
+.. _X-Forwarded-For:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+"""
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import flask
+import werkzeug
+from searx.tools import config
+
+from searx import redisdb
+from searx import logger
+from searx.redislib import incr_sliding_window, drop_counter
+
+from . import link_token
+from ._helpers import too_many_requests
+
+
+logger = logger.getChild('botdetection.ip_limit')
+
+BURST_WINDOW = 20
+"""Time (sec) before sliding window for *burst* requests expires."""
+
+BURST_MAX = 15
+"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""
+
+BURST_MAX_SUSPICIOUS = 2
+"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""
+
+LONG_WINDOW = 600
+"""Time (sec) before the longer sliding window expires."""
+
+LONG_MAX = 150
+"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""
+
+LONG_MAX_SUSPICIOUS = 10
+"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
+
+API_WONDOW = 3600
+"""Time (sec) before sliding window for API requests (format != html) expires."""
+
+API_MAX = 4
+"""Maximum requests from one IP in the :py:obj:`API_WONDOW`"""
+
+SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
+"""Time (sec) before sliding window for one suspicious IP expires."""
+
+SUSPICIOUS_IP_MAX = 3
+"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
+
+
+def filter_request(
+ network: IPv4Network | IPv6Network,
+ request: flask.Request,
+ cfg: config.Config,
+) -> werkzeug.Response | None:
+ """Count the request of the (client) ``network`` in redis sliding windows.
+
+ Returns ``None`` if the request passes. If a limit is exceeded the
+ response of ``too_many_requests`` is returned -- except for the
+ ``SUSPICIOUS_IP_WINDOW``, where a redirect (302) to the ``index`` route is
+ returned.
+ """
+
+ # pylint: disable=too-many-return-statements
+ redis_client = redisdb.client()
+
+ # link-local networks are only counted if 'filter_link_local' is enabled
+ if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
+ logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
+ return None
+
+ # requests with an output format other than html are counted in API_WONDOW
+ # first; if they stay within API_MAX the checks below are applied as well
+ if request.args.get('format', 'html') != 'html':
+ c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW)
+ if c > API_MAX:
+ return too_many_requests(network, "too many request in API_WINDOW")
+
+ if cfg['botdetection.ip_limit.link_token']:
+
+ # renew=True: the ping of the client is renewed by link_token if it exists
+ suspicious = link_token.is_suspicious(network, request, True)
+
+ if not suspicious:
+ # this IP is no longer suspicious: release ip again / delete the counter of this IP
+ drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
+ # NOTE(review): the BURST and LONG windows are not counted for a
+ # request that is not suspicious -- confirm this is intended
+ return None
+
+ # this IP is suspicious: count requests from this IP
+ c = incr_sliding_window(
+ redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
+ )
+ if c > SUSPICIOUS_IP_MAX:
+ logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
+ return flask.redirect(flask.url_for('index'), code=302)
+
+ # suspicious requests: the reduced limits (*_SUSPICIOUS) are applied
+ c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
+ if c > BURST_MAX_SUSPICIOUS:
+ return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
+
+ c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
+ if c > LONG_MAX_SUSPICIOUS:
+ return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
+
+ return None
+
+ # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
+ c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
+ if c > BURST_MAX:
+ return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)")
+
+ c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
+ if c > LONG_MAX:
+ return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)")
+
+ return None
diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py
new file mode 100644
index 000000000..18ffc8407
--- /dev/null
+++ b/searx/botdetection/limiter.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _limiter src:
+
+Limiter
+=======
+
+.. sidebar:: info
+
+ The limiter requires a :ref:`Redis <settings redis>` database.
+
+Bot protection / IP rate limitation. The intention of rate limitation is to
+limit suspicious requests from an IP. The motivation behind this is the fact
+that SearXNG passes through requests from bots and is thus classified as a bot
+itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked
+by the search engine (the origin) in some other way.
+
+To avoid blocking, the requests from bots to SearXNG must also be blocked, this
+is the task of the limiter. To perform this task, the limiter uses the methods
+from the :py:obj:`searx.botdetection`.
+
+To enable the limiter activate:
+
+.. code:: yaml
+
+ server:
+ ...
+ limiter: true # rate limit the number of request on the instance, block some bots
+
+and set the redis-url connection. Check the value, it depends on your redis DB
+(see :ref:`settings redis`), by example:
+
+.. code:: yaml
+
+ redis:
+ url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+import flask
+import werkzeug
+
+from searx.tools import config
+from searx import logger
+
+from . import (
+ http_accept,
+ http_accept_encoding,
+ http_accept_language,
+ http_connection,
+ http_user_agent,
+ ip_limit,
+)
+
+from ._helpers import (
+ get_network,
+ get_real_ip,
+ dump_request,
+)
+
+logger = logger.getChild('botdetection.limiter')
+
+CFG: config.Config = None # type: ignore
+
+LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
+"""Base configuration (schema) of the botdetection."""
+
+LIMITER_CFG = Path('/etc/searxng/limiter.toml')
+"""Local limiter configuration."""
+
+CFG_DEPRECATED = {
+ # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
+}
+
+
+def get_cfg() -> config.Config:
+    """Return the limiter configuration.
+
+    On the first call the configuration is built from
+    :py:obj:`LIMITER_CFG_SCHEMA`, :py:obj:`LIMITER_CFG` and
+    :py:obj:`CFG_DEPRECATED`; the result is cached in the module level ``CFG``.
+    """
+    global CFG  # pylint: disable=global-statement
+    if CFG is not None:
+        return CFG
+    CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
+    return CFG
+
+
+def filter_request(request: flask.Request) -> werkzeug.Response | None:
+    """Apply the bot detection methods to the request.
+
+    Returns ``None`` if the request passes, otherwise the response object of
+    the first method that rejects the request.
+
+    - Requests from a link-local network and requests to ``/healthz`` are
+      never filtered.
+    - The ``http_user_agent`` method is applied to every other request.
+    - The remaining ``http_*`` methods and the ``ip_limit`` method are only
+      applied to requests of the ``/search`` path.
+    """
+
+    cfg = get_cfg()
+    real_ip = get_real_ip(request)
+    network = get_network(real_ip, cfg)
+    # NOTE(review): link-local networks pass before any method is called, the
+    # 'filter_link_local' option of the ip_limit method can't take effect
+    # when it is called from here -- confirm this is intended.
+    if network.is_link_local:
+        return None
+
+    if request.path == '/healthz':
+        return None
+
+    methods = [http_user_agent]
+    if request.path == '/search':
+        # http_user_agent is not listed again, it is already the first method
+        methods += [
+            http_accept,
+            http_accept_encoding,
+            http_accept_language,
+            http_connection,
+            ip_limit,
+        ]
+
+    for method in methods:
+        val = method.filter_request(network, request, cfg)
+        if val is not None:
+            return val
+
+    # lazy %-style arguments only (no f-string mix), log the request passed in
+    logger.debug("OK %s: %s", network, dump_request(request))
+    return None
diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml
new file mode 100644
index 000000000..71a231e8f
--- /dev/null
+++ b/searx/botdetection/limiter.toml
@@ -0,0 +1,22 @@
+[real_ip]
+
+# Number of values to trust for X-Forwarded-For.
+
+x_for = 1
+
+# The prefix defines the number of leading bits in an address that are compared
+# to determine whether or not an address is part of a (client) network.
+
+ipv4_prefix = 32
+ipv6_prefix = 48
+
+[botdetection.ip_limit]
+
+# To get unlimited access in a local network, by default link-local addresses
+# (networks) are not monitored by the ip_limit
+filter_link_local = false
+
+# activate link_token method in the ip_limit method
+link_token = false
+
+
diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py
new file mode 100644
index 000000000..11a6a56b5
--- /dev/null
+++ b/searx/botdetection/link_token.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``link_token``
+---------------------
+
+The ``link_token`` method evaluates a request as :py:obj:`suspicious
+<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
+client. By adding a random component (the token) in the URL, a bot can not send
+a ping by requesting a static URL.
+
+.. note::
+
+ This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
+
+To make use of this method a flask URL route needs to be added:
+
+.. code:: python
+
+ @app.route('/client<token>.css', methods=['GET', 'POST'])
+ def client_token(token=None):
+ link_token.ping(request, token)
+ return Response('', mimetype='text/css')
+
+And in the HTML template from flask a stylesheet link is needed (the value of
+``link_token`` comes from :py:obj:`get_token`):
+
+.. code:: html
+
+ <link rel="stylesheet"
+ href="{{ url_for('client_token', token=link_token) }}"
+ type="text/css" />
+
+.. _X-Forwarded-For:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+"""
+from __future__ import annotations
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+)
+
+import string
+import random
+import flask
+
+from searx import logger
+from searx import redisdb
+from searx.redislib import secret_hash
+
+from ._helpers import (
+ get_network,
+ get_real_ip,
+)
+
+TOKEN_LIVE_TIME = 600
+"""Lifetime (sec) of limiter's CSS token."""
+
+PING_LIVE_TIME = 3600
+"""Lifetime (sec) of the ping-key from a client (request)"""
+
+PING_KEY = 'SearXNG_limiter.ping'
+"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
+
+TOKEN_KEY = 'SearXNG_limiter.token'
+"""Key for which the current token is stored in the DB"""
+
+logger = logger.getChild('botdetection.link_token')
+
+
+def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
+    """Checks whether a valid ping exists for this (client) network, if not
+    this request is rated as *suspicious*.  If a valid ping exists and argument
+    ``renew`` is ``True`` the expire time of this ping is reset to
+    :py:obj:`PING_LIVE_TIME`.
+
+    Without a redis DB no request is rated as *suspicious*.
+
+    """
+    db = redisdb.client()
+    if not db:
+        return False
+
+    key = get_ping_key(network, request)
+    if db.get(key):
+        if renew:
+            db.set(key, 1, ex=PING_LIVE_TIME)
+        logger.debug("found ping for (client) network %s -> %s", network.compressed, key)
+        return False
+
+    logger.warning("missing ping (IP: %s) / request: %s", network.compressed, key)
+    return True
+
+
+def ping(request: flask.Request, token: str):
+    """This function is called by a request to URL ``/client<token>.css``.  If
+    ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
+    The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
+
+    Nothing is stored if there is no redis DB or if the token is not valid.
+
+    """
+    from . import limiter  # pylint: disable=import-outside-toplevel, cyclic-import
+
+    db = redisdb.client()
+    if not (db and token_is_valid(token)):
+        return
+
+    cfg = limiter.get_cfg()
+    client_ip = get_real_ip(request)
+    network = get_network(client_ip, cfg)
+
+    key = get_ping_key(network, request)
+    logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, client_ip, key)
+    db.set(key, 1, ex=PING_LIVE_TIME)
+
+
+def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str:
+    """Generates a hashed key that fits (more or less) to a *WEB-browser
+    session* in a network.
+
+    The key is :py:obj:`PING_KEY` followed by the hash (in square brackets) of
+    the network and the ``Accept-Language`` and ``User-Agent`` headers."""
+    headers = request.headers
+    session_id = network.compressed + headers.get('Accept-Language', '') + headers.get('User-Agent', '')
+    return ''.join((PING_KEY, "[", secret_hash(session_id), "]"))
+
+
+def token_is_valid(token) -> bool:
+    """Return ``True`` if ``token`` is equal to the current token (see
+    :py:obj:`get_token`).  The result is also written to the debug log."""
+    is_current = token == get_token()
+    logger.debug("token is valid --> %s", is_current)
+    return is_current
+
+
+def get_token() -> str:
+    """Returns current token.  If there is no currently active token a new token
+    is generated randomly and stored in the redis DB.
+
+    - :py:obj:`TOKEN_LIVE_TIME`
+    - :py:obj:`TOKEN_KEY`
+
+    Without a redis DB the constant token ``12345678`` is returned.
+
+    """
+    redis_client = redisdb.client()
+    if not redis_client:
+        # This function is also called when limiter is inactive / no redis DB
+        # (see render function in webapp.py)
+        return '12345678'
+
+    token = redis_client.get(TOKEN_KEY)
+    if token:
+        return token.decode('UTF-8')
+
+    # The token should not be predictable for a bot: use the random source of
+    # the operating system instead of the default (Mersenne Twister) generator.
+    rng = random.SystemRandom()
+    token = ''.join(rng.choice(string.ascii_lowercase + string.digits) for _ in range(16))
+    redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
+    return token
diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py
index 46c82f588..a8beb5e88 100644
--- a/searx/plugins/limiter.py
+++ b/searx/plugins/limiter.py
@@ -1,119 +1,32 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pyright: basic
-"""Some bot protection / rate limitation
+"""see :ref:`limiter src`"""
-To monitor rate limits and protect privacy the IP addresses are getting stored
-with a hash so the limiter plugin knows who to block. A redis database is
-needed to store the hash values.
-
-Enable the plugin in ``settings.yml``:
-
-- ``server.limiter: true``
-- ``redis.url: ...`` check the value, see :ref:`settings redis`
-"""
-
-import re
-from flask import request
+import flask
from searx import redisdb
from searx.plugins import logger
-from searx.redislib import incr_sliding_window
+from searx.botdetection import limiter
name = "Request limiter"
description = "Limit the number of request"
default_on = False
preference_section = 'service'
-logger = logger.getChild('limiter')
-
-block_user_agent = re.compile(
- r'('
- + r'unknown'
- + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
- + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
- + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
- + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
- + r'|ZmEu|BLEXBot|bitlybot'
- # unmaintained Farside instances
- + r'|'
- + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
- + '|.*PetalBot.*'
- + r')'
-)
-
-
-def is_accepted_request() -> bool:
- # pylint: disable=too-many-return-statements
- redis_client = redisdb.client()
- user_agent = request.headers.get('User-Agent', 'unknown')
- x_forwarded_for = request.headers.get('X-Forwarded-For', '')
-
- if request.path == '/healthz':
- return True
-
- if block_user_agent.match(user_agent):
- logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent))
- return False
-
- if request.path == '/search':
- c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
- c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
- if c_burst > 15 or c_10min > 150:
- logger.debug("BLOCK %s: to many request", x_forwarded_for)
- return False
-
- if len(request.headers.get('Accept-Language', '').strip()) == '':
- logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
- return False
-
- if request.headers.get('Connection') == 'close':
- logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
- return False
-
- accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
- if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
- logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
- return False
-
- if 'text/html' not in request.accept_mimetypes:
- logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for)
- return False
-
- if request.args.get('format', 'html') != 'html':
- c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
- if c > 4:
- logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
- return False
-
- logger.debug(
- "OK %s: '%s'" % (x_forwarded_for, request.path)
- + " || form: %s" % request.form
- + " || Accept: %s" % request.headers.get('Accept', '')
- + " || Accept-Language: %s" % request.headers.get('Accept-Language', '')
- + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '')
- + " || Content-Type: %s" % request.headers.get('Content-Type', '')
- + " || Content-Length: %s" % request.headers.get('Content-Length', '')
- + " || Connection: %s" % request.headers.get('Connection', '')
- + " || User-Agent: %s" % user_agent
- )
-
- return True
+logger = logger.getChild('limiter')
def pre_request():
- if not is_accepted_request():
- return 'Too Many Requests', 429
- return None
+ """See :ref:`flask.Flask.before_request`"""
+ return limiter.filter_request(flask.request)
-def init(app, settings):
+def init(app: flask.Flask, settings) -> bool:
if not settings['server']['limiter']:
return False
-
if not redisdb.client():
- logger.error("The limiter requires Redis") # pylint: disable=undefined-variable
+ logger.error("The limiter requires Redis")
return False
-
app.before_request(pre_request)
return True
diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py
index fbe4518b5..8079ee0d4 100644
--- a/searx/plugins/self_info.py
+++ b/searx/plugins/self_info.py
@@ -1,21 +1,11 @@
-'''
-searx is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring,invalid-name
-searx is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with searx. If not, see < http://www.gnu.org/licenses/ >.
-
-(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
-'''
-from flask_babel import gettext
import re
+from flask_babel import gettext
+
+from searx.botdetection._helpers import get_real_ip
name = gettext('Self Information')
description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".')
@@ -28,18 +18,11 @@ query_examples = ''
p = re.compile('.*user[ -]agent.*', re.IGNORECASE)
-# attach callback to the post search hook
-# request: flask request object
-# ctx: the whole local context of the pre search hook
def post_search(request, search):
if search.search_query.pageno > 1:
return True
if search.search_query.query == 'ip':
- x_forwarded_for = request.headers.getlist("X-Forwarded-For")
- if x_forwarded_for:
- ip = x_forwarded_for[0]
- else:
- ip = request.remote_addr
+ ip = get_real_ip(request)
search.result_container.answers['ip'] = {'answer': ip}
elif p.match(search.search_query.query):
ua = request.user_agent
diff --git a/searx/templates/simple/base.html b/searx/templates/simple/base.html
index a31ff07ee..3c6ed11c7 100644
--- a/searx/templates/simple/base.html
+++ b/searx/templates/simple/base.html
@@ -17,6 +17,9 @@
{% else %}
<link rel="stylesheet" href="{{ url_for('static', filename='css/searxng.min.css') }}" type="text/css" media="screen" />
{% endif %}
+ {% if get_setting('server.limiter') %}
+ <link rel="stylesheet" href="{{ url_for('client_token', token=link_token) }}" type="text/css" />
+ {% endif %}
{% block styles %}{% endblock %}
<!--[if gte IE 9]>-->
<script src="{{ url_for('static', filename='js/searxng.head.min.js') }}" client_settings="{{ client_settings }}"></script>
diff --git a/searx/tools/__init__.py b/searx/tools/__init__.py
new file mode 100644
index 000000000..08e6d982f
--- /dev/null
+++ b/searx/tools/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _tools src:
+
+A collection of *utilities* used by SearXNG, but without SearXNG specific
+peculiarities.
+
+"""
diff --git a/searx/tools/config.py b/searx/tools/config.py
new file mode 100644
index 000000000..f998031ba
--- /dev/null
+++ b/searx/tools/config.py
@@ -0,0 +1,376 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Configuration class :py:class:`Config` with deep-update, schema validation
+and deprecated names.
+
+The :py:class:`Config` class implements a configuration that is based on
+structured dictionaries. The configuration schema is defined in a dictionary
+structure and the configuration data is given in a dictionary structure.
+"""
+from __future__ import annotations
+
+import copy
+import typing
+import logging
+import pathlib
+import pytomlpp as toml
+
+__all__ = ['Config', 'UNSET', 'SchemaIssue']
+
+log = logging.getLogger(__name__)
+
+
+class FALSE:
+ """Class of ``False`` singleton"""
+
+ # pylint: disable=multiple-statements
+ def __init__(self, msg):
+ self.msg = msg
+
+ def __bool__(self):
+ return False
+
+ def __str__(self):
+ return self.msg
+
+ __repr__ = __str__
+
+
+UNSET = FALSE('<UNSET>')
+
+
+class SchemaIssue(ValueError):
+ """Exception to store and/or raise a message from a schema issue."""
+
+ def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
+ self.level = level
+ super().__init__(msg)
+
+ def __str__(self):
+ return f"[cfg schema {self.level}] {self.args[0]}"
+
+
+class Config:
+ """Base class used for configuration"""
+
+ UNSET = UNSET
+
+ @classmethod
+ def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
+
+ # init schema
+
+ log.debug("load schema file: %s", schema_file)
+ cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
+ if not cfg_file.exists():
+ log.warning("missing config file: %s", cfg_file)
+ return cfg
+
+ # load configuration
+
+ log.debug("load config file: %s", cfg_file)
+ try:
+ upd_cfg = toml.load(cfg_file)
+ except toml.DecodeError as exc:
+ msg = str(exc).replace('\t', '').replace('\n', ' ')
+ log.error("%s: %s", cfg_file, msg)
+ raise
+
+ is_valid, issue_list = cfg.validate(upd_cfg)
+ for msg in issue_list:
+ log.error(str(msg))
+ if not is_valid:
+ raise TypeError(f"schema of {cfg_file} is invalid!")
+ cfg.update(upd_cfg)
+ return cfg
+
+ def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
+ """Constructor of class Config.
+
+ :param cfg_schema: Schema of the configuration
+ :param deprecated: dictionary that maps deprecated configuration names to messages
+
+ These values are needed for validation, see :py:obj:`validate`.
+
+ """
+ self.cfg_schema = cfg_schema
+ self.deprecated = deprecated
+ self.cfg = copy.deepcopy(cfg_schema)
+
+ def __getitem__(self, key: str):
+ return self.get(key)
+
+ def validate(self, cfg: dict):
+ """Validation of dictionary ``cfg`` against the schema in ``self.cfg_schema``.
+ Validation is done by :py:obj:`validate`."""
+
+ return validate(self.cfg_schema, cfg, self.deprecated)
+
+ def update(self, upd_cfg: dict):
+ """Update this configuration by ``upd_cfg``."""
+
+ dict_deepupdate(self.cfg, upd_cfg)
+
+ def default(self, name: str):
+ """Returns default value of field ``name`` in ``self.cfg_schema``."""
+ return value(name, self.cfg_schema)
+
+ def get(self, name: str, default=UNSET, replace=True):
+ """Returns the value to which ``name`` points in the configuration.
+
+ If there is no such ``name`` in the config and the ``default`` is
+ :py:obj:`UNSET`, a :py:obj:`KeyError` is raised.
+ """
+
+ parent = self._get_parent_dict(name)
+ val = parent.get(name.split('.')[-1], UNSET)
+ if val is UNSET:
+ if default is UNSET:
+ raise KeyError(name)
+ val = default
+
+ if replace and isinstance(val, str):
+ val = val % self
+ return val
+
+ def set(self, name: str, val):
+ """Set the value to which ``name`` points in the configuration.
+
+ If there is no such ``name`` in the config, a :py:obj:`KeyError` is
+ raised.
+ """
+ parent = self._get_parent_dict(name)
+ parent[name.split('.')[-1]] = val
+
+ def _get_parent_dict(self, name):
+ parent_name = '.'.join(name.split('.')[:-1])
+ if parent_name:
+ parent = value(parent_name, self.cfg)
+ else:
+ parent = self.cfg
+ if (parent is UNSET) or (not isinstance(parent, dict)):
+ raise KeyError(parent_name)
+ return parent
+
+ def path(self, name: str, default=UNSET):
+ """Get a :py:class:`pathlib.Path` object from a config string."""
+
+ val = self.get(name, default)
+ if val is UNSET:
+ if default is UNSET:
+ raise KeyError(name)
+ return default
+ return pathlib.Path(str(val))
+
+ def pyobj(self, name, default=UNSET):
+ """Get the Python object referred to by the fully qualified name (FQN)
+ in the config string."""
+
+ fqn = self.get(name, default)
+ if fqn is UNSET:
+ if default is UNSET:
+ raise KeyError(name)
+ return default
+ (modulename, name) = str(fqn).rsplit('.', 1)
+ m = __import__(modulename, {}, {}, [name], 0)
+ return getattr(m, name)
+
+
+# working with dictionaries
+
+
+def value(name: str, data_dict: dict):
+ """Returns the value to which ``name`` points in the ``data_dict``.
+
+ .. code:: python
+
+ >>> data_dict = {
+ "foo": {"bar": 1 },
+ "bar": {"foo": 2 },
+ "foobar": [1, 2, 3],
+ }
+ >>> value('foobar', data_dict)
+ [1, 2, 3]
+ >>> value('foo.bar', data_dict)
+ 1
+ >>> value('foo.bar.xxx', data_dict)
+ <UNSET>
+
+ """
+
+ ret_val = data_dict
+ for part in name.split('.'):
+ if isinstance(ret_val, dict):
+ ret_val = ret_val.get(part, UNSET)
+ if ret_val is UNSET:
+ break
+ return ret_val
+
+
+def validate(
+ schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
+) -> typing.Tuple[bool, list]:
+
+ """Deep validation of dictionary in ``data_dict`` against dictionary in
+ ``schema_dict``. Argument deprecated is a dictionary that maps deprecated
+ configuration names to messages::
+
+ deprecated = {
+ "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
+ "..." : "..."
+ }
+
+ The function returns a python tuple ``(is_valid, issue_list)``:
+
+ ``is_valid``:
+ A bool value indicating ``data_dict`` is valid or not.
+
+ ``issue_list``:
+ A list of messages (:py:obj:`SchemaIssue`) from the validation::
+
+ [cfg schema warn] data_dict 'fontlib.foo': deprecated - <DEPRECATED['foo.bar']>
+ [cfg schema invalid] data_dict 'fontlib.foo': key unknown in schema_dict
+ [cfg schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is: ...
+
+ If ``schema_dict`` or ``data_dict`` is not a dictionary type a
+ :py:obj:`SchemaIssue` is raised.
+
+ """
+ names = []
+ is_valid = True
+ issue_list = []
+
+ if not isinstance(schema_dict, dict):
+ raise SchemaIssue('invalid', "schema_dict is not a dict type")
+ if not isinstance(data_dict, dict):
+ raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type")
+
+ is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated)
+ return is_valid, issue_list
+
+
+def _validate(
+ names: typing.List,
+ issue_list: typing.List,
+ schema_dict: typing.Dict,
+ data_dict: typing.Dict,
+ deprecated: typing.Dict[str, str],
+) -> typing.Tuple[bool, typing.List]:
+
+ is_valid = True
+
+ for key, data_value in data_dict.items():
+
+ names.append(key)
+ name = '.'.join(names)
+
+ deprecated_msg = deprecated.get(name)
+ # print("XXX %s: key %s // data_value: %s" % (name, key, data_value))
+ if deprecated_msg:
+ issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))
+
+ schema_value = value(name, schema_dict)
+ # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value))
+ if schema_value is UNSET:
+ if not deprecated_msg:
+ issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
+ is_valid = False
+
+ elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck
+ issue_list.append(
+ SchemaIssue(
+ 'invalid',
+ (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
+ )
+ )
+ is_valid = False
+
+ elif isinstance(data_value, dict):
+ _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
+ is_valid = is_valid and _valid
+ names.pop()
+
+ return is_valid, issue_list
+
+
+def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
+ """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.
+
+ For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:
+
+ 0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a
+ :py:obj:`TypeError`.
+
+ 1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.
+
+ 2. If ``base_dict[upd_key]`` does not exist: set ``base_dict[upd_key]`` from a
+ (deep-) copy of ``upd_val``.
+
+ 3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
+ list in ``upd_val``.
+
+ 4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
+ ``upd_val``.
+ """
+ # pylint: disable=too-many-branches
+ if not isinstance(base_dict, dict):
+ raise TypeError("argument 'base_dict' is not a ditionary type")
+ if not isinstance(upd_dict, dict):
+ raise TypeError("argument 'upd_dict' is not a ditionary type")
+
+ if names is None:
+ names = []
+
+ for upd_key, upd_val in upd_dict.items():
+ # For each upd_key & upd_val pair in upd_dict:
+
+ if isinstance(upd_val, dict):
+
+ if upd_key in base_dict:
+ # if base_dict[upd_key] exists, recursively deep-update it
+ if not isinstance(base_dict[upd_key], dict):
+ raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict")
+ dict_deepupdate(
+ base_dict[upd_key],
+ upd_val,
+ names
+ + [
+ upd_key,
+ ],
+ )
+
+ else:
+ # if base_dict[upd_key] does not exist, set base_dict[upd_key] from deepcopy of upd_val
+ base_dict[upd_key] = copy.deepcopy(upd_val)
+
+ elif isinstance(upd_val, list):
+
+ if upd_key in base_dict:
+ # if base_dict[upd_key] exists, base_dict[upd_key] is extended by
+ # the list from upd_val
+ if not isinstance(base_dict[upd_key], list):
+ raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict")
+ base_dict[upd_key].extend(upd_val)
+
+ else:
+ # if base_dict[upd_key] doesn't exist, set base_dict[upd_key] from a deepcopy of the
+ # list in upd_val.
+ base_dict[upd_key] = copy.deepcopy(upd_val)
+
+ elif isinstance(upd_val, set):
+
+ if upd_key in base_dict:
+ # if base_dict[upd_key] exists, base_dict[upd_key] is updated by the set in upd_val
+ if not isinstance(base_dict[upd_key], set):
+ raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict")
+ base_dict[upd_key].update(upd_val.copy())
+
+ else:
+ # if base_dict[upd_key] doesn't exist, set base_dict[upd_key] from a copy of the
+ # set in upd_val
+ base_dict[upd_key] = upd_val.copy()
+
+ else:
+ # for any other type of upd_val replace or add base_dict[upd_key] by a copy
+ # of upd_val
+ base_dict[upd_key] = copy.copy(upd_val)
diff --git a/searx/webapp.py b/searx/webapp.py
index 79255652f..d6322447a 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -94,6 +94,7 @@ from searx.utils import (
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery
from searx.plugins import Plugin, plugins, initialize as plugin_initialize
+from searx.botdetection import link_token
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import (
Preferences,
@@ -416,6 +417,7 @@ def render(template_name: str, **kwargs):
kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint
kwargs['cookies'] = request.cookies
kwargs['errors'] = request.errors
+ kwargs['link_token'] = link_token.get_token()
# values from the preferences
kwargs['preferences'] = request.preferences
@@ -642,6 +644,12 @@ def health():
return Response('OK', mimetype='text/plain')
+@app.route('/client<token>.css', methods=['GET', 'POST'])
+def client_token(token=None):
+ link_token.ping(request, token)
+ return Response('', mimetype='text/css')
+
+
@app.route('/search', methods=['GET', 'POST'])
def search():
"""Search query in q and return results.
diff --git a/tests/unit/test_plugins.py b/tests/unit/test_plugins.py
index 28df835e5..0d555fdc0 100644
--- a/tests/unit/test_plugins.py
+++ b/tests/unit/test_plugins.py
@@ -50,9 +50,13 @@ class SelfIPTest(SearxTestCase):
self.assertTrue(len(store.plugins) == 1)
# IP test
- request = Mock(remote_addr='127.0.0.1')
- request.headers.getlist.return_value = []
- search = get_search_mock(query='ip', pageno=1)
+ request = Mock()
+ request.remote_addr = '127.0.0.1'
+ request.headers = {'X-Forwarded-For': '1.2.3.4, 127.0.0.1', 'X-Real-IP': '127.0.0.1'}
+ search = get_search_mock(
+ query='ip',
+ pageno=1,
+ )
store.call(store.plugins, 'post_search', request, search)
self.assertTrue('127.0.0.1' in search.result_container.answers["ip"]["answer"])
@@ -62,7 +66,6 @@ class SelfIPTest(SearxTestCase):
# User agent test
request = Mock(user_agent='Mock')
- request.headers.getlist.return_value = []
search = get_search_mock(query='user-agent', pageno=1)
store.call(store.plugins, 'post_search', request, search)
@@ -98,7 +101,6 @@ class HashPluginTest(SearxTestCase):
self.assertTrue(len(store.plugins) == 1)
request = Mock(remote_addr='127.0.0.1')
- request.headers.getlist.return_value = []
# MD5
search = get_search_mock(query='md5 test', pageno=1)