diff options
-rw-r--r-- | docs/admin/engines/settings.rst | 2 | ||||
-rw-r--r-- | docs/src/searx.botdetection.rst | 45 | ||||
-rw-r--r-- | docs/src/searx.plugins.limiter.rst | 13 | ||||
-rw-r--r-- | requirements.txt | 1 | ||||
-rw-r--r-- | searx/botdetection/__init__.py | 27 | ||||
-rw-r--r-- | searx/botdetection/_helpers.py | 121 | ||||
-rw-r--r-- | searx/botdetection/http_accept.py | 39 | ||||
-rw-r--r-- | searx/botdetection/http_accept_encoding.py | 41 | ||||
-rw-r--r-- | searx/botdetection/http_accept_language.py | 35 | ||||
-rw-r--r-- | searx/botdetection/http_connection.py | 37 | ||||
-rw-r--r-- | searx/botdetection/http_user_agent.py | 67 | ||||
-rw-r--r-- | searx/botdetection/ip_limit.py | 146 | ||||
-rw-r--r-- | searx/botdetection/limiter.py | 118 | ||||
-rw-r--r-- | searx/botdetection/limiter.toml | 22 | ||||
-rw-r--r-- | searx/botdetection/link_token.py | 156 | ||||
-rw-r--r-- | searx/plugins/limiter.py | 103 | ||||
-rw-r--r-- | searx/plugins/self_info.py | 31 | ||||
-rw-r--r-- | searx/templates/simple/base.html | 3 | ||||
-rw-r--r-- | searx/tools/__init__.py | 8 | ||||
-rw-r--r-- | searx/tools/config.py | 376 | ||||
-rwxr-xr-x | searx/webapp.py | 8 | ||||
-rw-r--r-- | tests/unit/test_plugins.py | 12 |
22 files changed, 1273 insertions, 138 deletions
diff --git a/docs/admin/engines/settings.rst b/docs/admin/engines/settings.rst index f9a1dad4f..63478f441 100644 --- a/docs/admin/engines/settings.rst +++ b/docs/admin/engines/settings.rst @@ -235,7 +235,7 @@ Global Settings ``limiter`` : Rate limit the number of request on the instance, block some bots. The - :ref:`limiter plugin` requires a :ref:`settings redis` database. + :ref:`limiter src` requires a :ref:`settings redis` database. .. _image_proxy: diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst new file mode 100644 index 000000000..85e0ce4cd --- /dev/null +++ b/docs/src/searx.botdetection.rst @@ -0,0 +1,45 @@ +.. _botdetection: + +============= +Bot Detection +============= + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.botdetection + :members: + +.. automodule:: searx.botdetection.limiter + :members: + + +Rate limit +========== + +.. automodule:: searx.botdetection.ip_limit + :members: + +.. automodule:: searx.botdetection.link_token + :members: + + +Probe HTTP headers +================== + +.. automodule:: searx.botdetection.http_accept + :members: + +.. automodule:: searx.botdetection.http_accept_encoding + :members: + +.. automodule:: searx.botdetection.http_accept_language + :members: + +.. automodule:: searx.botdetection.http_connection + :members: + +.. automodule:: searx.botdetection.http_user_agent + :members: diff --git a/docs/src/searx.plugins.limiter.rst b/docs/src/searx.plugins.limiter.rst deleted file mode 100644 index 75d06f5c2..000000000 --- a/docs/src/searx.plugins.limiter.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _limiter plugin: - -============== -Limiter Plugin -============== - -.. sidebar:: info - - The :ref:`limiter plugin` requires a :ref:`Redis <settings redis>` database. - -.. 
automodule:: searx.plugins.limiter - :members: - diff --git a/requirements.txt b/requirements.txt index 19318f53b..12926f616 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ redis==4.5.5 markdown-it-py==2.2.0 typing_extensions==4.6.3 fasttext-predict==0.9.2.1 +pytomlpp==1.0.13 diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py new file mode 100644 index 000000000..fcd8e5630 --- /dev/null +++ b/searx/botdetection/__init__.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection src: + +X-Forwarded-For +=============== + +.. attention:: + + A correct setup of the HTTP request headers ``X-Forwarded-For`` and + ``X-Real-IP`` is essential to be able to assign a request to an IP correctly: + + - `NGINX RequestHeader`_ + - `Apache RequestHeader`_ + +.. _NGINX RequestHeader: + https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site +.. _Apache RequestHeader: + https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site + +.. 
autofunction:: searx.botdetection.get_real_ip + +""" + +from ._helpers import dump_request +from ._helpers import get_real_ip +from ._helpers import too_many_requests diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py new file mode 100644 index 000000000..8e0156d6e --- /dev/null +++ b/searx/botdetection/_helpers.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, invalid-name +from __future__ import annotations + +from ipaddress import ( + IPv4Network, + IPv6Network, + IPv6Address, + ip_address, + ip_network, +) +import flask +import werkzeug + +from searx.tools import config +from searx import logger + +logger = logger.getChild('botdetection') + + +def dump_request(request: flask.Request): + return ( + request.path + + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) + + +def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: + """Returns a HTTP 429 response object and writes a ERROR message to the + 'botdetection' logger. This function is used in part by the filter methods + to return the default ``Too Many Requests`` response. 
+ + """ + + logger.debug("BLOCK %s: %s", network.compressed, log_msg) + return flask.make_response(('Too Many Requests', 429)) + + +def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network: + """Returns the (client) network that the real_ip is part of.""" + + ip = ip_address(real_ip) + if isinstance(ip, IPv6Address): + prefix = cfg['real_ip.ipv6_prefix'] + else: + prefix = cfg['real_ip.ipv4_prefix'] + network = ip_network(f"{real_ip}/{prefix}", strict=False) + # logger.debug("get_network(): %s", network.compressed) + return network + + +def get_real_ip(request: flask.Request) -> str: + """Returns real IP of the request. Since not all proxies set all the HTTP + headers and incoming headers can be faked it may happen that the IP cannot + be determined correctly. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``). + + This function tries to get the remote IP in the order listed below, + additionally some tests are done and if inconsistencies or errors are + detected, they are logged. + + The remote IP of the request is taken from (first match): + + - X-Forwarded-For_ header + - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__ + - :py:obj:`flask.Request.remote_addr` + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + """ + + forwarded_for = request.headers.get("X-Forwarded-For") + real_ip = request.headers.get('X-Real-IP') + remote_addr = request.remote_addr + # logger.debug( + # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr + # ) + + if not forwarded_for: + logger.error("X-Forwarded-For header is not set!") + else: + from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import + + forwarded_for = [x.strip() for x in forwarded_for.split(',')] + x_for: int = get_cfg()['real_ip.x_for'] + forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] + + if not real_ip: + logger.error("X-Real-IP header is not set!") + + if forwarded_for and real_ip and forwarded_for != real_ip: + logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) + + if forwarded_for and remote_addr and forwarded_for != remote_addr: + logger.warning( + "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for + ) + + if real_ip and remote_addr and real_ip != remote_addr: + logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) + + request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' + # logger.debug("get_real_ip() -> %s", request_ip) + return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py new file mode 100644 index 000000000..b78a86278 --- /dev/null +++ b/searx/botdetection/http_accept.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept`` +---------------------- + +The ``http_accept`` method evaluates a request as the request of a bot if the +Accept_ header .. + +- did not contain ``text/html`` + +.. 
_Accept: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + if 'text/html' not in request.accept_mimetypes: + return too_many_requests(network, "HTTP header Accept did not contain text/html") + return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py new file mode 100644 index 000000000..60718a4ca --- /dev/null +++ b/searx/botdetection/http_accept_encoding.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_encoding`` +------------------------------- + +The ``http_accept_encoding`` method evaluates a request as the request of a +bot if the Accept-Encoding_ header .. + +- did not contain ``gzip`` AND ``deflate`` (if both values are missed) +- did not contain ``text/html`` + +.. 
_Accept-Encoding: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if not ('gzip' in accept_list or 'deflate' in accept_list): + return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") + return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py new file mode 100644 index 000000000..395d28bfd --- /dev/null +++ b/searx/botdetection/http_accept_language.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_language`` +------------------------------- + +The ``http_accept_language`` method evaluates a request as the request of a bot +if the Accept-Language_ header is unset. + +.. 
_Accept-Language: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language + +""" +# pylint: disable=unused-argument +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if request.headers.get('Accept-Language', '').strip() == '': + return too_many_requests(network, "missing HTTP header Accept-Language") + return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py new file mode 100644 index 000000000..ee0d80a23 --- /dev/null +++ b/searx/botdetection/http_connection.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_connection`` +-------------------------- + +The ``http_connection`` method evaluates a request as the request of a bot if +the Connection_ header is set to ``close``. + +.. 
_Connection: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + if request.headers.get('Connection', '').strip() == 'close': + return too_many_requests(network, "HTTP header 'Connection=close") + return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..17025f68b --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. _User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +import re +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug + +from searx.tools import config +from ._helpers import too_many_requests + + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! 
Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") + return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py new file mode 100644 index 000000000..bb4229f0e --- /dev/null +++ b/searx/botdetection/ip_limit.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection.ip_limit: + +Method ``ip_limit`` +------------------- + +The ``ip_limit`` method counts request from an IP in *sliding windows*. If +there are to many requests in a sliding window, the request is evaluated as a +bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ +header. To take privacy only the hash value of an IP is stored in the redis DB +and at least for a maximum of 10 minutes. + +The :py:obj:`.link_token` method can be used to investigate whether a request is +*suspicious*. To activate the :py:obj:`.link_token` method in the +:py:obj:`.ip_limit` method add the following to your +``/etc/searxng/limiter.toml``: + +.. 
code:: toml + + [botdetection.ip_limit] + link_token = true + +If the :py:obj:`.link_token` method is activated and a request is *suspicious* +the request rates are reduced: + +- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` +- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` + +To intercept bots that get their IPs from a range of IPs, there is a +:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored +for a longer time. IPs stored in this sliding window have a maximum of +:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP +makes a request that is not suspicious, the sliding window for this IP is +dropped. + +.. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import flask +import werkzeug +from searx.tools import config + +from searx import redisdb +from searx import logger +from searx.redislib import incr_sliding_window, drop_counter + +from . 
import link_token +from ._helpers import too_many_requests + + +logger = logger.getChild('botdetection.ip_limit') + +BURST_WINDOW = 20 +"""Time (sec) before sliding window for *burst* requests expires.""" + +BURST_MAX = 15 +"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" + +BURST_MAX_SUSPICIOUS = 2 +"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" + +LONG_WINDOW = 600 +"""Time (sec) before the longer sliding window expires.""" + +LONG_MAX = 150 +"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" + +LONG_MAX_SUSPICIOUS = 10 +"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" + +API_WONDOW = 3600 +"""Time (sec) before sliding window for API requests (format != html) expires.""" + +API_MAX = 4 +"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" + +SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 +"""Time (sec) before sliding window for one suspicious IP expires.""" + +SUSPICIOUS_IP_MAX = 3 +"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" + + +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + + # pylint: disable=too-many-return-statements + redis_client = redisdb.client() + + if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: + logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) + return None + + if request.args.get('format', 'html') != 'html': + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW) + if c > API_MAX: + return too_many_requests(network, "too many request in API_WINDOW") + + if cfg['botdetection.ip_limit.link_token']: + + suspicious = link_token.is_suspicious(network, request, True) + + if not suspicious: + # this IP is no longer suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, 
'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) + return None + + # this IP is suspicious: count requests from this IP + c = incr_sliding_window( + redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW + ) + if c > SUSPICIOUS_IP_MAX: + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) + return flask.redirect(flask.url_for('index'), code=302) + + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) + if c > BURST_MAX_SUSPICIOUS: + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") + + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) + if c > LONG_MAX_SUSPICIOUS: + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") + + return None + + # vanilla limiter without extensions counts BURST_MAX and LONG_MAX + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) + if c > BURST_MAX: + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") + + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) + if c > LONG_MAX: + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") + + return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py new file mode 100644 index 000000000..18ffc8407 --- /dev/null +++ b/searx/botdetection/limiter.py @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _limiter src: + +Limiter +======= + +.. sidebar:: info + + The limiter requires a :ref:`Redis <settings redis>` database. + +Bot protection / IP rate limitation. The intention of rate limitation is to +limit suspicious requests from an IP. 
The motivation behind this is the fact +that SearXNG passes through requests from bots and is thus classified as a bot +itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked +by the search engine (the origin) in some other way. + +To avoid blocking, the requests from bots to SearXNG must also be blocked; this +is the task of the limiter. To perform this task, the limiter uses the methods +from the :py:obj:`searx.botdetection`. + +To enable the limiter activate: + +.. code:: yaml + + server: + ... + limiter: true # rate limit the number of requests on the instance, block some bots + +and set the redis-url connection. Check the value, it depends on your redis DB +(see :ref:`settings redis`), for example: + +.. code:: yaml + + redis: + url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 + +""" + +from __future__ import annotations + +from pathlib import Path +import flask +import werkzeug + +from searx.tools import config +from searx import logger + +from . import ( + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, +) + +from ._helpers import ( + get_network, + get_real_ip, + dump_request, +) + +logger = logger.getChild('botdetection.limiter') + +CFG: config.Config = None # type: ignore + +LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" +"""Base configuration (schema) of the botdetection.""" + +LIMITER_CFG = Path('/etc/searxng/limiter.toml') +"""Local limiter configuration.""" + +CFG_DEPRECATED = { + # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
+} + + +def get_cfg() -> config.Config: + global CFG # pylint: disable=global-statement + if CFG is None: + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) + return CFG + + +def filter_request(request: flask.Request) -> werkzeug.Response | None: + + cfg = get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + if network.is_link_local: + return None + + if request.path == '/healthz': + return None + + for func in [ + http_user_agent, + ]: + val = func.filter_request(network, request, cfg) + if val is not None: + return val + + if request.path == '/search': + + for func in [ + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, + ]: + val = func.filter_request(network, request, cfg) + if val is not None: + return val + logger.debug(f"OK {network}: %s", dump_request(flask.request)) + return None diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml new file mode 100644 index 000000000..71a231e8f --- /dev/null +++ b/searx/botdetection/limiter.toml @@ -0,0 +1,22 @@ +[real_ip] + +# Number of values to trust for X-Forwarded-For. + +x_for = 1 + +# The prefix defines the number of leading bits in an address that are compared +# to determine whether or not an address is part of a (client) network. 
+ +ipv4_prefix = 32 +ipv6_prefix = 48 + +[botdetection.ip_limit] + +# To get unlimited access in a local network, by default link-local addresses +# (networks) are not monitored by the ip_limit +filter_link_local = false + +# activate link_token method in the ip_limit method +link_token = false + + diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py new file mode 100644 index 000000000..11a6a56b5 --- /dev/null +++ b/searx/botdetection/link_token.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``link_token`` +--------------------- + +The ``link_token`` method evaluates a request as :py:obj:`suspicious +<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the +client. By adding a random component (the token) in the URL, a bot can not send +a ping by requesting a static URL. + +.. note:: + + This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. + +To make use of this method a flask URL route needs to be added: + +.. code:: python + + @app.route('/client<token>.css', methods=['GET', 'POST']) + def client_token(token=None): + link_token.ping(request, token) + return Response('', mimetype='text/css') + +And in the HTML template from flask a stylesheet link is needed (the value of +``link_token`` comes from :py:obj:`get_token`): + +.. code:: html + + <link rel="stylesheet" + href="{{ url_for('client_token', token=link_token) }}" + type="text/css" /> + +.. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + +import string +import random +import flask + +from searx import logger +from searx import redisdb +from searx.redislib import secret_hash + +from ._helpers import ( + get_network, + get_real_ip, +) + +TOKEN_LIVE_TIME = 600 +"""Livetime (sec) of limiter's CSS token.""" + +PING_LIVE_TIME = 3600 +"""Livetime (sec) of the ping-key from a client (request)""" + +PING_KEY = 'SearXNG_limiter.ping' +"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" + +TOKEN_KEY = 'SearXNG_limiter.token' +"""Key for which the current token is stored in the DB""" + +logger = logger.getChild('botdetection.link_token') + + +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): + """Checks whether a valid ping is exists for this (client) network, if not + this request is rated as *suspicious*. If a valid ping exists and argument + ``renew`` is ``True`` the expire time of this ping is reset to + :py:obj:`PING_LIVE_TIME`. + + """ + redis_client = redisdb.client() + if not redis_client: + return False + + ping_key = get_ping_key(network, request) + if not redis_client.get(ping_key): + logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) + return True + + if renew: + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) + return False + + +def ping(request: flask.Request, token: str): + """This function is called by a request to URL ``/client<token>.css``. If + ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. + The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. + + """ + from . 
import limiter # pylint: disable=import-outside-toplevel, cyclic-import + + redis_client = redisdb.client() + if not redis_client: + return + if not token_is_valid(token): + return + + cfg = limiter.get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + + ping_key = get_ping_key(network, request) + logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: + """Generates a hashed key that fits (more or less) to a *WEB-browser + session* in a network.""" + return ( + PING_KEY + + "[" + + secret_hash( + network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') + ) + + "]" + ) + + +def token_is_valid(token) -> bool: + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + +def get_token() -> str: + """Returns current token. If there is no currently active token a new token + is generated randomly and stored in the redis DB. 
+ + - :py:obj:`TOKEN_LIVE_TIME` + - :py:obj:`TOKEN_KEY` + + """ + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) + return token diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 46c82f588..a8beb5e88 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -1,119 +1,32 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pyright: basic -"""Some bot protection / rate limitation +"""see :ref:`limiter src`""" -To monitor rate limits and protect privacy the IP addresses are getting stored -with a hash so the limiter plugin knows who to block. A redis database is -needed to store the hash values. - -Enable the plugin in ``settings.yml``: - -- ``server.limiter: true`` -- ``redis.url: ...`` check the value, see :ref:`settings redis` -""" - -import re -from flask import request +import flask from searx import redisdb from searx.plugins import logger -from searx.redislib import incr_sliding_window +from searx.botdetection import limiter name = "Request limiter" description = "Limit the number of request" default_on = False preference_section = 'service' -logger = logger.getChild('limiter') - -block_user_agent = re.compile( - r'(' - + r'unknown' - + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' - + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' - + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! 
Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' - + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' - + r'|ZmEu|BLEXBot|bitlybot' - # unmaintained Farside instances - + r'|' - + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') - + '|.*PetalBot.*' - + r')' -) - - -def is_accepted_request() -> bool: - # pylint: disable=too-many-return-statements - redis_client = redisdb.client() - user_agent = request.headers.get('User-Agent', 'unknown') - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - - if request.path == '/healthz': - return True - - if block_user_agent.match(user_agent): - logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent)) - return False - - if request.path == '/search': - c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) - c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) - if c_burst > 15 or c_10min > 150: - logger.debug("BLOCK %s: to many request", x_forwarded_for) - return False - - if len(request.headers.get('Accept-Language', '').strip()) == '': - logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for) - return False - - if request.headers.get('Connection') == 'close': - logger.debug("BLOCK %s: got Connection=close", x_forwarded_for) - return False - - accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] - if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list: - logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for) - return False - - if 'text/html' not in request.accept_mimetypes: - logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for) - return False - - if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600) - if c > 4: - 
logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for) - return False - - logger.debug( - "OK %s: '%s'" % (x_forwarded_for, request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept', '') - + " || Accept-Language: %s" % request.headers.get('Accept-Language', '') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '') - + " || Content-Type: %s" % request.headers.get('Content-Type', '') - + " || Content-Length: %s" % request.headers.get('Content-Length', '') - + " || Connection: %s" % request.headers.get('Connection', '') - + " || User-Agent: %s" % user_agent - ) - - return True +logger = logger.getChild('limiter') def pre_request(): - if not is_accepted_request(): - return 'Too Many Requests', 429 - return None + """See :ref:`flask.Flask.before_request`""" + return limiter.filter_request(flask.request) -def init(app, settings): +def init(app: flask.Flask, settings) -> bool: if not settings['server']['limiter']: return False - if not redisdb.client(): - logger.error("The limiter requires Redis") # pylint: disable=undefined-variable + logger.error("The limiter requires Redis") return False - app.before_request(pre_request) return True diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py index fbe4518b5..8079ee0d4 100644 --- a/searx/plugins/self_info.py +++ b/searx/plugins/self_info.py @@ -1,21 +1,11 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring,invalid-name -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. - -(C) 2015 by Adam Tauber, <asciimoo@gmail.com> -''' -from flask_babel import gettext import re +from flask_babel import gettext + +from searx.botdetection._helpers import get_real_ip name = gettext('Self Information') description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') @@ -28,18 +18,11 @@ query_examples = '' p = re.compile('.*user[ -]agent.*', re.IGNORECASE) -# attach callback to the post search hook -# request: flask request object -# ctx: the whole local context of the pre search hook def post_search(request, search): if search.search_query.pageno > 1: return True if search.search_query.query == 'ip': - x_forwarded_for = request.headers.getlist("X-Forwarded-For") - if x_forwarded_for: - ip = x_forwarded_for[0] - else: - ip = request.remote_addr + ip = get_real_ip(request) search.result_container.answers['ip'] = {'answer': ip} elif p.match(search.search_query.query): ua = request.user_agent diff --git a/searx/templates/simple/base.html b/searx/templates/simple/base.html index a31ff07ee..3c6ed11c7 100644 --- a/searx/templates/simple/base.html +++ b/searx/templates/simple/base.html @@ -17,6 +17,9 @@ {% else %} <link rel="stylesheet" href="{{ url_for('static', filename='css/searxng.min.css') }}" type="text/css" media="screen" /> {% endif %} + {% if get_setting('server.limiter') %} + <link rel="stylesheet" href="{{ url_for('client_token', token=link_token) }}" type="text/css" /> + {% endif %} {% block styles %}{% endblock %} <!--[if gte IE 9]>--> <script src="{{ url_for('static', filename='js/searxng.head.min.js') }}" client_settings="{{ client_settings }}"></script> diff --git a/searx/tools/__init__.py b/searx/tools/__init__.py new file mode 100644 index 000000000..08e6d982f --- /dev/null +++ 
b/searx/tools/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _tools src: + +A collection of *utilities* used by SearXNG, but without SearXNG specific +peculiarities. + +""" diff --git a/searx/tools/config.py b/searx/tools/config.py new file mode 100644 index 000000000..f998031ba --- /dev/null +++ b/searx/tools/config.py @@ -0,0 +1,376 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Configuration class :py:class:`Config` with deep-update, schema validation +and deprecated names. + +The :py:class:`Config` class implements a configuration that is based on +structured dictionaries. The configuration schema is defined in a dictionary +structure and the configuration data is given in a dictionary structure. +""" +from __future__ import annotations + +import copy +import typing +import logging +import pathlib +import pytomlpp as toml + +__all__ = ['Config', 'UNSET', 'SchemaIssue'] + +log = logging.getLogger(__name__) + + +class FALSE: + """Class of ``False`` singelton""" + + # pylint: disable=multiple-statements + def __init__(self, msg): + self.msg = msg + + def __bool__(self): + return False + + def __str__(self): + return self.msg + + __repr__ = __str__ + + +UNSET = FALSE('<UNSET>') + + +class SchemaIssue(ValueError): + """Exception to store and/or raise a message from a schema issue.""" + + def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str): + self.level = level + super().__init__(msg) + + def __str__(self): + return f"[cfg schema {self.level}] {self.args[0]}" + + +class Config: + """Base class used for configuration""" + + UNSET = UNSET + + @classmethod + def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config: + + # init schema + + log.debug("load schema file: %s", schema_file) + cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated) + if not cfg_file.exists(): + log.warning("missing config file: %s", cfg_file) + return 
cfg + + # load configuration + + log.debug("load config file: %s", cfg_file) + try: + upd_cfg = toml.load(cfg_file) + except toml.DecodeError as exc: + msg = str(exc).replace('\t', '').replace('\n', ' ') + log.error("%s: %s", cfg_file, msg) + raise + + is_valid, issue_list = cfg.validate(upd_cfg) + for msg in issue_list: + log.error(str(msg)) + if not is_valid: + raise TypeError(f"schema of {cfg_file} is invalid!") + cfg.update(upd_cfg) + return cfg + + def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]): + """Construtor of class Config. + + :param cfg_schema: Schema of the configuration + :param deprecated: dictionary that maps deprecated configuration names to a messages + + These values are needed for validation, see :py:obj:`validate`. + + """ + self.cfg_schema = cfg_schema + self.deprecated = deprecated + self.cfg = copy.deepcopy(cfg_schema) + + def __getitem__(self, key: str): + return self.get(key) + + def validate(self, cfg: dict): + """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`. + Validation is done by :py:obj:`validate`.""" + + return validate(self.cfg_schema, cfg, self.deprecated) + + def update(self, upd_cfg: dict): + """Update this configuration by ``upd_cfg``.""" + + dict_deepupdate(self.cfg, upd_cfg) + + def default(self, name: str): + """Returns default value of field ``name`` in ``self.cfg_schema``.""" + return value(name, self.cfg_schema) + + def get(self, name: str, default=UNSET, replace=True): + """Returns the value to which ``name`` points in the configuration. + + If there is no such ``name`` in the config and the ``default`` is + :py:obj:`UNSET`, a :py:obj:`KeyError` is raised. 
+ """ + + parent = self._get_parent_dict(name) + val = parent.get(name.split('.')[-1], UNSET) + if val is UNSET: + if default is UNSET: + raise KeyError(name) + val = default + + if replace and isinstance(val, str): + val = val % self + return val + + def set(self, name: str, val): + """Set the value to which ``name`` points in the configuration. + + If there is no such ``name`` in the config, a :py:obj:`KeyError` is + raised. + """ + parent = self._get_parent_dict(name) + parent[name.split('.')[-1]] = val + + def _get_parent_dict(self, name): + parent_name = '.'.join(name.split('.')[:-1]) + if parent_name: + parent = value(parent_name, self.cfg) + else: + parent = self.cfg + if (parent is UNSET) or (not isinstance(parent, dict)): + raise KeyError(parent_name) + return parent + + def path(self, name: str, default=UNSET): + """Get a :py:class:`pathlib.Path` object from a config string.""" + + val = self.get(name, default) + if val is UNSET: + if default is UNSET: + raise KeyError(name) + return default + return pathlib.Path(str(val)) + + def pyobj(self, name, default=UNSET): + """Get python object refered by full qualiffied name (FQN) in the config + string.""" + + fqn = self.get(name, default) + if fqn is UNSET: + if default is UNSET: + raise KeyError(name) + return default + (modulename, name) = str(fqn).rsplit('.', 1) + m = __import__(modulename, {}, {}, [name], 0) + return getattr(m, name) + + +# working with dictionaries + + +def value(name: str, data_dict: dict): + """Returns the value to which ``name`` points in the ``dat_dict``. + + .. 
code: python + + >>> data_dict = { + "foo": {"bar": 1 }, + "bar": {"foo": 2 }, + "foobar": [1, 2, 3], + } + >>> value('foobar', data_dict) + [1, 2, 3] + >>> value('foo.bar', data_dict) + 1 + >>> value('foo.bar.xxx', data_dict) + <UNSET> + + """ + + ret_val = data_dict + for part in name.split('.'): + if isinstance(ret_val, dict): + ret_val = ret_val.get(part, UNSET) + if ret_val is UNSET: + break + return ret_val + + +def validate( + schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str] +) -> typing.Tuple[bool, list]: + + """Deep validation of dictionary in ``data_dict`` against dictionary in + ``schema_dict``. Argument deprecated is a dictionary that maps deprecated + configuration names to a messages:: + + deprecated = { + "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'", + "..." : "..." + } + + The function returns a python tuple ``(is_valid, issue_list)``: + + ``is_valid``: + A bool value indicating ``data_dict`` is valid or not. + + ``issue_list``: + A list of messages (:py:obj:`SchemaIssue`) from the validation:: + + [schema warn] data_dict: deprecated 'fontlib.foo': <DEPRECATED['foo.bar']> + [schema invalid] data_dict: key unknown 'fontlib.foo' + [schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ... + + If ``schema_dict`` or ``data_dict`` is not a dictionary type a + :py:obj:`SchemaIssue` is raised. 
+ + """ + names = [] + is_valid = True + issue_list = [] + + if not isinstance(schema_dict, dict): + raise SchemaIssue('invalid', "schema_dict is not a dict type") + if not isinstance(data_dict, dict): + raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type") + + is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated) + return is_valid, issue_list + + +def _validate( + names: typing.List, + issue_list: typing.List, + schema_dict: typing.Dict, + data_dict: typing.Dict, + deprecated: typing.Dict[str, str], +) -> typing.Tuple[bool, typing.List]: + + is_valid = True + + for key, data_value in data_dict.items(): + + names.append(key) + name = '.'.join(names) + + deprecated_msg = deprecated.get(name) + # print("XXX %s: key %s // data_value: %s" % (name, key, data_value)) + if deprecated_msg: + issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}")) + + schema_value = value(name, schema_dict) + # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value)) + if schema_value is UNSET: + if not deprecated_msg: + issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict")) + is_valid = False + + elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck + issue_list.append( + SchemaIssue( + 'invalid', + (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"), + ) + ) + is_valid = False + + elif isinstance(data_value, dict): + _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated) + is_valid = is_valid and _valid + names.pop() + + return is_valid, issue_list + + +def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None): + """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``. + + For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``: + + 0. 
If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a + :py:obj:`TypeError`. + + 1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``. + + 2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a + (deep-) copy of ``upd_val``. + + 3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the + list in ``upd_val``. + + 4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in + ``upd_val``. + """ + # pylint: disable=too-many-branches + if not isinstance(base_dict, dict): + raise TypeError("argument 'base_dict' is not a ditionary type") + if not isinstance(upd_dict, dict): + raise TypeError("argument 'upd_dict' is not a ditionary type") + + if names is None: + names = [] + + for upd_key, upd_val in upd_dict.items(): + # For each upd_key & upd_val pair in upd_dict: + + if isinstance(upd_val, dict): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, recursively deep-update it + if not isinstance(base_dict[upd_key], dict): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict") + dict_deepupdate( + base_dict[upd_key], + upd_val, + names + + [ + upd_key, + ], + ) + + else: + # if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val + base_dict[upd_key] = copy.deepcopy(upd_val) + + elif isinstance(upd_val, list): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, base_dict[up_key] is extended by + # the list from upd_val + if not isinstance(base_dict[upd_key], list): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict") + base_dict[upd_key].extend(upd_val) + + else: + # if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the + # list in upd_val. 
+ base_dict[upd_key] = copy.deepcopy(upd_val) + + elif isinstance(upd_val, set): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val + if not isinstance(base_dict[upd_key], set): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict") + base_dict[upd_key].update(upd_val.copy()) + + else: + # if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the + # set in upd_val + base_dict[upd_key] = upd_val.copy() + + else: + # for any other type of upd_val replace or add base_dict[upd_key] by a copy + # of upd_val + base_dict[upd_key] = copy.copy(upd_val) diff --git a/searx/webapp.py b/searx/webapp.py index 79255652f..d6322447a 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -94,6 +94,7 @@ from searx.utils import ( from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.query import RawTextQuery from searx.plugins import Plugin, plugins, initialize as plugin_initialize +from searx.botdetection import link_token from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.preferences import ( Preferences, @@ -416,6 +417,7 @@ def render(template_name: str, **kwargs): kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint kwargs['cookies'] = request.cookies kwargs['errors'] = request.errors + kwargs['link_token'] = link_token.get_token() # values from the preferences kwargs['preferences'] = request.preferences @@ -642,6 +644,12 @@ def health(): return Response('OK', mimetype='text/plain') +@app.route('/client<token>.css', methods=['GET', 'POST']) +def client_token(token=None): + link_token.ping(request, token) + return Response('', mimetype='text/css') + + @app.route('/search', methods=['GET', 'POST']) def search(): """Search query in q and return results. 
diff --git a/tests/unit/test_plugins.py b/tests/unit/test_plugins.py index 28df835e5..0d555fdc0 100644 --- a/tests/unit/test_plugins.py +++ b/tests/unit/test_plugins.py @@ -50,9 +50,13 @@ class SelfIPTest(SearxTestCase): self.assertTrue(len(store.plugins) == 1) # IP test - request = Mock(remote_addr='127.0.0.1') - request.headers.getlist.return_value = [] - search = get_search_mock(query='ip', pageno=1) + request = Mock() + request.remote_addr = '127.0.0.1' + request.headers = {'X-Forwarded-For': '1.2.3.4, 127.0.0.1', 'X-Real-IP': '127.0.0.1'} + search = get_search_mock( + query='ip', + pageno=1, + ) store.call(store.plugins, 'post_search', request, search) self.assertTrue('127.0.0.1' in search.result_container.answers["ip"]["answer"]) @@ -62,7 +66,6 @@ class SelfIPTest(SearxTestCase): # User agent test request = Mock(user_agent='Mock') - request.headers.getlist.return_value = [] search = get_search_mock(query='user-agent', pageno=1) store.call(store.plugins, 'post_search', request, search) @@ -98,7 +101,6 @@ class HashPluginTest(SearxTestCase): self.assertTrue(len(store.plugins) == 1) request = Mock(remote_addr='127.0.0.1') - request.headers.getlist.return_value = [] # MD5 search = get_search_mock(query='md5 test', pageno=1) |