summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- docs/src/searx.botdetection.rst 3
-rw-r--r-- searx/botdetection/_helpers.py 9
-rw-r--r-- searx/botdetection/ip_limit.py 8
-rw-r--r-- searx/botdetection/ip_lists.py 85
-rw-r--r-- searx/botdetection/limiter.py 33
-rw-r--r-- searx/botdetection/limiter.toml 20
-rw-r--r-- searx/botdetection/link_token.py 3
-rw-r--r-- searx/tools/config.py 5
8 files changed, 152 insertions, 14 deletions
diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst
index 85e0ce4cd..093414ec8 100644
--- a/docs/src/searx.botdetection.rst
+++ b/docs/src/searx.botdetection.rst
@@ -15,6 +15,9 @@ Bot Detection
.. automodule:: searx.botdetection.limiter
:members:
+.. automodule:: searx.botdetection.ip_lists
+ :members:
+
Rate limit
==========
diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py
index 8e0156d6e..19905fd15 100644
--- a/searx/botdetection/_helpers.py
+++ b/searx/botdetection/_helpers.py
@@ -6,8 +6,8 @@ from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
+ IPv4Address,
IPv6Address,
- ip_address,
ip_network,
)
import flask
@@ -46,11 +46,10 @@ def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkz
return flask.make_response(('Too Many Requests', 429))
-def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
+def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network:
"""Returns the (client) network of whether the real_ip is part of."""
- ip = ip_address(real_ip)
- if isinstance(ip, IPv6Address):
+ if real_ip.version == 6:
prefix = cfg['real_ip.ipv6_prefix']
else:
prefix = cfg['real_ip.ipv4_prefix']
@@ -99,7 +98,7 @@ def get_real_ip(request: flask.Request) -> str:
from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import
forwarded_for = [x.strip() for x in forwarded_for.split(',')]
- x_for: int = get_cfg()['real_ip.x_for']
+ x_for: int = get_cfg()['real_ip.x_for'] # type: ignore
forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
if not real_ip:
diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py
index bb4229f0e..d0605dcfb 100644
--- a/searx/botdetection/ip_limit.py
+++ b/searx/botdetection/ip_limit.py
@@ -49,14 +49,16 @@ import werkzeug
from searx.tools import config
from searx import redisdb
-from searx import logger
from searx.redislib import incr_sliding_window, drop_counter
from . import link_token
-from ._helpers import too_many_requests
+from ._helpers import (
+ too_many_requests,
+ logger,
+)
-logger = logger.getChild('botdetection.ip_limit')
+logger = logger.getChild('ip_limit')
BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""
diff --git a/searx/botdetection/ip_lists.py b/searx/botdetection/ip_lists.py
new file mode 100644
index 000000000..456ef4365
--- /dev/null
+++ b/searx/botdetection/ip_lists.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+""".. _botdetection.ip_lists:
+
+Method ``ip_lists``
+-------------------
+
+The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and
+:py:obj:`pass-lists <pass_ip>`.
+
+.. code:: toml
+
+ [botdetection.ip_lists]
+
+ pass_ip = [
+ '140.238.172.132', # IPv4 of check.searx.space
+ '192.168.0.0/16', # IPv4 private network
+ 'fe80::/10' # IPv6 linklocal
+ ]
+ block_ip = [
+ '93.184.216.34', # IPv4 of example.org
+ '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
+ ]
+
+"""
+# pylint: disable=unused-argument
+
+from __future__ import annotations
+from typing import Tuple
+from ipaddress import (
+ ip_network,
+ IPv4Address,
+ IPv6Address,
+)
+
+from searx.tools import config
+from ._helpers import logger
+
+logger = logger.getChild('ip_lists')
+
+SEARXNG_ORG = [
+    # https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195
+    '140.238.172.132',  # IPv4 check.searx.space
+    '2603:c022:0:4900::/56',  # IPv6 check.searx.space
+]
+"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`."""
+
+
+def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
+    """Return ``(True, reason)`` when ``real_ip`` is covered by the built-in
+    ``SEARXNG_ORG`` list (unless ``botdetection.ip_lists.pass_searxng_org`` is
+    off), otherwise the result for the ``botdetection.ip_lists.pass_ip`` list."""
+    use_builtin = cfg.get('botdetection.ip_lists.pass_searxng_org', default=True)
+    for entry in SEARXNG_ORG if use_builtin else []:
+        trusted = ip_network(entry, strict=False)
+        # only test membership when address and network share the IP version
+        if trusted.version == real_ip.version and real_ip in trusted:
+            return True, f"IP matches {trusted.compressed} in SEARXNG_ORG list."
+    return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg)
+
+
+def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
+    """Return ``(matched, reason)`` for ``real_ip`` against the
+    ``botdetection.ip_lists.block_ip`` list; on a match the reason is extended
+    by a hint on how to get the IP removed from the list."""
+    listed, reason = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg)
+    if not listed:
+        return listed, reason
+    hint = " To remove IP from list, please contact the maintainer of the service."
+    return listed, reason + hint
+
+
+def ip_is_subnet_of_member_in_list(
+    real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config
+) -> Tuple[bool, str]:
+    """Return ``(matched, reason)`` for ``real_ip`` against the networks listed under ``list_name``."""
+    for item in cfg.get(list_name, default=[]):
+        try:
+            net = ip_network(item, strict=False)
+        except ValueError:
+            logger.error("invalid IP %s in %s", item, list_name)  # entry is skipped, not fatal
+            continue
+        if real_ip.version == net.version and real_ip in net:
+            return True, f"IP matches {net.compressed} in {list_name}."
+    return False, f"IP is not a member of an item in the {list_name} list"
diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py
index 18ffc8407..366665854 100644
--- a/searx/botdetection/limiter.py
+++ b/searx/botdetection/limiter.py
@@ -40,6 +40,7 @@ and set the redis-url connection. Check the value, it depends on your redis DB
from __future__ import annotations
from pathlib import Path
+from ipaddress import ip_address
import flask
import werkzeug
@@ -53,6 +54,7 @@ from . import (
http_connection,
http_user_agent,
ip_limit,
+ ip_lists,
)
from ._helpers import (
@@ -84,16 +86,41 @@ def get_cfg() -> config.Config:
def filter_request(request: flask.Request) -> werkzeug.Response | None:
+ # pylint: disable=too-many-return-statements
cfg = get_cfg()
- real_ip = get_real_ip(request)
+ real_ip = ip_address(get_real_ip(request))
network = get_network(real_ip, cfg)
+
+ if request.path == '/healthz':
+ return None
+
+ # link-local
+
if network.is_link_local:
return None
- if request.path == '/healthz':
+ # block- & pass- lists
+ #
+ # 1. The IP of the request is first checked against the pass-list; if the IP
+ # matches an entry in the list, the request is not blocked.
+ # 2. If no matching entry is found in the pass-list, then a check is made against
+ # the block list; if the IP matches an entry in the list, the request is
+ # blocked.
+ # 3. If the IP is not in either list, the request is not blocked.
+
+ match, msg = ip_lists.pass_ip(real_ip, cfg)
+ if match:
+ logger.warning("PASS %s: matched PASSLIST - %s", network.compressed, msg)
return None
+ match, msg = ip_lists.block_ip(real_ip, cfg)
+ if match:
+ logger.error("BLOCK %s: matched BLOCKLIST - %s", network.compressed, msg)
+ return flask.make_response(('IP is on BLOCKLIST - %s' % msg, 429))
+
+ # methods applied on /
+
for func in [
http_user_agent,
]:
@@ -101,6 +128,8 @@ def filter_request(request: flask.Request) -> werkzeug.Response | None:
if val is not None:
return val
+ # methods applied on /search
+
if request.path == '/search':
for func in [
diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml
index 71a231e8f..9560ec8f6 100644
--- a/searx/botdetection/limiter.toml
+++ b/searx/botdetection/limiter.toml
@@ -16,7 +16,25 @@ ipv6_prefix = 48
# (networks) are not monitored by the ip_limit
filter_link_local = false
-# acrivate link_token method in the ip_limit method
+# activate link_token method in the ip_limit method
link_token = false
+[botdetection.ip_lists]
+# In the limiter, the ip_lists method has priority over all other methods -> if
+# an IP is in the pass_ip list, it has unrestricted access and it is also not
+# checked if e.g. the "user agent" suggests a bot (e.g. curl).
+
+block_ip = [
+ # '93.184.216.34', # IPv4 of example.org
+ # '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
+]
+
+pass_ip = [
+ # '192.168.0.0/16', # IPv4 private network
+ # 'fe80::/10' # IPv6 linklocal / wins over botdetection.ip_limit.filter_link_local
+]
+
+# Activate passlist of (hardcoded) IPs from the SearXNG organization,
+# e.g. `check.searx.space`.
+pass_searxng_org = true
\ No newline at end of file
diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py
index 11a6a56b5..d86fa86c3 100644
--- a/searx/botdetection/link_token.py
+++ b/searx/botdetection/link_token.py
@@ -39,6 +39,7 @@ from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
+ ip_address,
)
import string
@@ -107,7 +108,7 @@ def ping(request: flask.Request, token: str):
return
cfg = limiter.get_cfg()
- real_ip = get_real_ip(request)
+ real_ip = ip_address(get_real_ip(request))
network = get_network(real_ip, cfg)
ping_key = get_ping_key(network, request)
diff --git a/searx/tools/config.py b/searx/tools/config.py
index f998031ba..d2710456f 100644
--- a/searx/tools/config.py
+++ b/searx/tools/config.py
@@ -8,6 +8,7 @@ structured dictionaries. The configuration schema is defined in a dictionary
structure and the configuration data is given in a dictionary structure.
"""
from __future__ import annotations
+from typing import Any
import copy
import typing
@@ -97,7 +98,7 @@ class Config:
self.deprecated = deprecated
self.cfg = copy.deepcopy(cfg_schema)
- def __getitem__(self, key: str):
+ def __getitem__(self, key: str) -> Any:
return self.get(key)
def validate(self, cfg: dict):
@@ -115,7 +116,7 @@ class Config:
"""Returns default value of field ``name`` in ``self.cfg_schema``."""
return value(name, self.cfg_schema)
- def get(self, name: str, default=UNSET, replace=True):
+ def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any:
"""Returns the value to which ``name`` points in the configuration.
If there is no such ``name`` in the config and the ``default`` is