diff options
-rw-r--r-- | docs/src/searx.botdetection.rst | 3 | ||||
-rw-r--r-- | searx/botdetection/_helpers.py | 9 | ||||
-rw-r--r-- | searx/botdetection/ip_limit.py | 8 | ||||
-rw-r--r-- | searx/botdetection/ip_lists.py | 85 | ||||
-rw-r--r-- | searx/botdetection/limiter.py | 33 | ||||
-rw-r--r-- | searx/botdetection/limiter.toml | 20 | ||||
-rw-r--r-- | searx/botdetection/link_token.py | 3 | ||||
-rw-r--r-- | searx/tools/config.py | 5 |
8 files changed, 152 insertions, 14 deletions
diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst index 85e0ce4cd..093414ec8 100644 --- a/docs/src/searx.botdetection.rst +++ b/docs/src/searx.botdetection.rst @@ -15,6 +15,9 @@ Bot Detection .. automodule:: searx.botdetection.limiter :members: +.. automodule:: searx.botdetection.ip_lists + :members: + Rate limit ========== diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index 8e0156d6e..19905fd15 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -6,8 +6,8 @@ from __future__ import annotations from ipaddress import ( IPv4Network, IPv6Network, + IPv4Address, IPv6Address, - ip_address, ip_network, ) import flask @@ -46,11 +46,10 @@ def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkz return flask.make_response(('Too Many Requests', 429)) -def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network: +def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: """Returns the (client) network of whether the real_ip is part of.""" - ip = ip_address(real_ip) - if isinstance(ip, IPv6Address): + if real_ip.version == 6: prefix = cfg['real_ip.ipv6_prefix'] else: prefix = cfg['real_ip.ipv4_prefix'] @@ -99,7 +98,7 @@ def get_real_ip(request: flask.Request) -> str: from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import forwarded_for = [x.strip() for x in forwarded_for.split(',')] - x_for: int = get_cfg()['real_ip.x_for'] + x_for: int = get_cfg()['real_ip.x_for'] # type: ignore forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] if not real_ip: diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index bb4229f0e..d0605dcfb 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -49,14 +49,16 @@ import werkzeug from searx.tools import config from searx import redisdb -from searx import logger from 
searx.redislib import incr_sliding_window, drop_counter from . import link_token -from ._helpers import too_many_requests +from ._helpers import ( + too_many_requests, + logger, +) -logger = logger.getChild('botdetection.ip_limit') +logger = logger.getChild('ip_limit') BURST_WINDOW = 20 """Time (sec) before sliding window for *burst* requests expires.""" diff --git a/searx/botdetection/ip_lists.py b/searx/botdetection/ip_lists.py new file mode 100644 index 000000000..456ef4365 --- /dev/null +++ b/searx/botdetection/ip_lists.py @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection.ip_lists: + +Method ``ip_lists`` +------------------- + +The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and +:py:obj:`pass-lists <pass_ip>`. + +.. code:: toml + + [botdetection.ip_lists] + + pass_ip = [ + '140.238.172.132', # IPv4 of check.searx.space + '192.168.0.0/16', # IPv4 private network + 'fe80::/10' # IPv6 linklocal + ] + block_ip = [ + '93.184.216.34', # IPv4 of example.org + '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class + ] + +""" +# pylint: disable=unused-argument + +from __future__ import annotations +from typing import Tuple +from ipaddress import ( + ip_network, + IPv4Address, + IPv6Address, +) + +from searx.tools import config +from ._helpers import logger + +logger = logger.getChild('ip_limit') + +SEARXNG_ORG = [ + # https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195 + '140.238.172.132', # IPv4 check.searx.space + '2603:c022:0:4900::/56', # IPv6 check.searx.space +] +"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`.""" + + +def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: + """Checks if the IP on the subnet is in one of the members of the + ``botdetection.ip_lists.pass_ip`` list. 
+ """ + + if cfg.get('botdetection.ip_lists.pass_searxng_org', default=True): + for net in SEARXNG_ORG: + net = ip_network(net, strict=False) + if real_ip.version == net.version and real_ip in net: + return True, f"IP matches {net.compressed} in SEARXNG_ORG list." + return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg) + + +def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: + """Checks if the IP on the subnet is in one of the members of the + ``botdetection.ip_lists.block_ip`` list. + """ + + block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg) + if block: + msg += " To remove IP from list, please contact the maintainer of the service." + return block, msg + + +def ip_is_subnet_of_member_in_list( + real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config +) -> Tuple[bool, str]: + + for net in cfg.get(list_name, default=[]): + try: + net = ip_network(net, strict=False) + except ValueError: + logger.error("invalid IP %s in %s", net, list_name) + continue + if real_ip.version == net.version and real_ip in net: + return True, f"IP matches {net.compressed} in {list_name}." + return False, f"IP is not a member of an item in the {list_name} list" diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 18ffc8407..366665854 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -40,6 +40,7 @@ and set the redis-url connection. Check the value, it depends on your redis DB from __future__ import annotations from pathlib import Path +from ipaddress import ip_address import flask import werkzeug @@ -53,6 +54,7 @@ from . 
import ( http_connection, http_user_agent, ip_limit, + ip_lists, ) from ._helpers import ( @@ -84,16 +86,41 @@ def get_cfg() -> config.Config: def filter_request(request: flask.Request) -> werkzeug.Response | None: + # pylint: disable=too-many-return-statements cfg = get_cfg() - real_ip = get_real_ip(request) + real_ip = ip_address(get_real_ip(request)) network = get_network(real_ip, cfg) + + if request.path == '/healthz': + return None + + # link-local + if network.is_link_local: return None - if request.path == '/healthz': + # block- & pass- lists + # + # 1. The IP of the request is first checked against the pass-list; if the IP + # matches an entry in the list, the request is not blocked. + # 2. If no matching entry is found in the pass-list, then a check is made against + # the block list; if the IP matches an entry in the list, the request is + # blocked. + # 3. If the IP is not in either list, the request is not blocked. + + match, msg = ip_lists.pass_ip(real_ip, cfg) + if match: + logger.warning("PASS %s: matched PASSLIST - %s", network.compressed, msg) return None + match, msg = ip_lists.block_ip(real_ip, cfg) + if match: + logger.error("BLOCK %s: matched BLOCKLIST - %s", network.compressed, msg) + return flask.make_response(('IP is on BLOCKLIST - %s' % msg, 429)) + + # methods applied on / + for func in [ http_user_agent, ]: @@ -101,6 +128,8 @@ def filter_request(request: flask.Request) -> werkzeug.Response | None: if val is not None: return val + # methods applied on /search + if request.path == '/search': for func in [ diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index 71a231e8f..9560ec8f6 100644 --- a/searx/botdetection/limiter.toml +++ b/searx/botdetection/limiter.toml @@ -16,7 +16,25 @@ ipv6_prefix = 48 # (networks) are not monitored by the ip_limit filter_link_local = false -# acrivate link_token method in the ip_limit method +# activate link_token method in the ip_limit method link_token = false 
+[botdetection.ip_lists] +# In the limiter, the ip_lists method has priority over all other methods -> if +# an IP is in the pass_ip list, it has unrestricted access and it is also not +# checked if e.g. the "user agent" suggests a bot (e.g. curl). + +block_ip = [ + # '93.184.216.34', # IPv4 of example.org + # '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class +] + +pass_ip = [ + # '192.168.0.0/16', # IPv4 private network + # 'fe80::/10' # IPv6 linklocal / wins over botdetection.ip_limit.filter_link_local +] + +# Activate passlist of (hardcoded) IPs from the SearXNG organization, +# e.g. `check.searx.space`. +pass_searxng_org = true
\ No newline at end of file diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 11a6a56b5..d86fa86c3 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -39,6 +39,7 @@ from __future__ import annotations from ipaddress import ( IPv4Network, IPv6Network, + ip_address, ) import string @@ -107,7 +108,7 @@ def ping(request: flask.Request, token: str): return cfg = limiter.get_cfg() - real_ip = get_real_ip(request) + real_ip = ip_address(get_real_ip(request)) network = get_network(real_ip, cfg) ping_key = get_ping_key(network, request) diff --git a/searx/tools/config.py b/searx/tools/config.py index f998031ba..d2710456f 100644 --- a/searx/tools/config.py +++ b/searx/tools/config.py @@ -8,6 +8,7 @@ structured dictionaries. The configuration schema is defined in a dictionary structure and the configuration data is given in a dictionary structure. """ from __future__ import annotations +from typing import Any import copy import typing @@ -97,7 +98,7 @@ class Config: self.deprecated = deprecated self.cfg = copy.deepcopy(cfg_schema) - def __getitem__(self, key: str): + def __getitem__(self, key: str) -> Any: return self.get(key) def validate(self, cfg: dict): @@ -115,7 +116,7 @@ class Config: """Returns default value of field ``name`` in ``self.cfg_schema``.""" return value(name, self.cfg_schema) - def get(self, name: str, default=UNSET, replace=True): + def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any: """Returns the value to which ``name`` points in the configuration. If there is no such ``name`` in the config and the ``default`` is |