summaryrefslogtreecommitdiff
path: root/searx/botdetection
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-05-28 18:58:31 +0200
committerMarkus Heiser <markus.heiser@darmarit.de>2023-06-01 14:38:53 +0200
commitb8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 (patch)
treee91de3d7005d6f55cce0f9b2f4ec69623fc196e7 /searx/botdetection
parent52f1452c09ab2ec74aa5898d9ea749f33a71a814 (diff)
downloadsearxng-b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4.tar.gz
searxng-b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4.zip
[mod] botdetection - improve ip_limit and link_token methods
- Counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1].
- Renew the ping-key on validation [2]; this is needed for infinite scrolling, where no new token (CSS) is loaded. Note: this does not fix the BURST_MAX issue in the vanilla limiter.
- Normalize the counter names of the ip_limit method to 'ip_limit.*'.
- Integrate the ip_limit method directly into the limiter plugin, without intermediate code: ip_limit now returns None or a werkzeug.Response object that the plugin can pass to the flask application, replacing the intermediate code that returned a tuple.

[1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277
[2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206
[3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/botdetection')
-rw-r--r--searx/botdetection/__init__.py16
-rw-r--r--searx/botdetection/_helpers.py93
-rw-r--r--searx/botdetection/http_accept.py8
-rw-r--r--searx/botdetection/http_accept_encoding.py8
-rw-r--r--searx/botdetection/http_accept_language.py8
-rw-r--r--searx/botdetection/http_connection.py8
-rw-r--r--searx/botdetection/http_user_agent.py11
-rw-r--r--searx/botdetection/ip_limit.py61
-rw-r--r--searx/botdetection/limiter.py11
-rw-r--r--searx/botdetection/link_token.py43
10 files changed, 193 insertions, 74 deletions
diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py
index 78a7d30f3..b4de0f9c8 100644
--- a/searx/botdetection/__init__.py
+++ b/searx/botdetection/__init__.py
@@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src`
"""
-import flask
-
-
-def dump_request(request: flask.Request):
- return (
- "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path)
- + " || form: %s" % request.form
- + " || Accept: %s" % request.headers.get('Accept')
- + " || Accept-Language: %s" % request.headers.get('Accept-Language')
- + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
- + " || Content-Type: %s" % request.headers.get('Content-Type')
- + " || Content-Length: %s" % request.headers.get('Content-Length')
- + " || Connection: %s" % request.headers.get('Connection')
- + " || User-Agent: %s" % request.headers.get('User-Agent')
- )
+from ._helpers import dump_request
diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py
new file mode 100644
index 000000000..b034b980b
--- /dev/null
+++ b/searx/botdetection/_helpers.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, invalid-name
+
+from typing import Optional
+import flask
+import werkzeug
+
+from searx import logger
+
+logger = logger.getChild('botdetection')
+
+
+def dump_request(request: flask.Request):
+ return (
+ "%s: %s" % (get_real_ip(request), request.path)
+ + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
+ + " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
+ + " || form: %s" % request.form
+ + " || Accept: %s" % request.headers.get('Accept')
+ + " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ + " || Content-Type: %s" % request.headers.get('Content-Type')
+ + " || Content-Length: %s" % request.headers.get('Content-Length')
+ + " || Connection: %s" % request.headers.get('Connection')
+ + " || User-Agent: %s" % request.headers.get('User-Agent')
+ )
+
+
+def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]:
+ log_prefix = 'BLOCK %s: ' % get_real_ip(request)
+ logger.debug(log_prefix + log_msg)
+ return flask.make_response(('Too Many Requests', 429))
+
+
+def get_real_ip(request: flask.Request) -> str:
+ """Returns real IP of the request. Since not all proxies set all the HTTP
+ headers and incoming headers can be faked it may happen that the IP cannot
+ be determined correctly.
+
+ .. sidebar:: :py:obj:`flask.Request.remote_addr`
+
+ SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).
+
+ This function tries to get the remote IP in the order listed below;
+ additionally, some tests are done and, if inconsistencies or errors are
+ detected, they are logged.
+
+ The remote IP of the request is taken from (first match):
+
+ - X-Forwarded-For_ header
+ - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
+ - :py:obj:`flask.Request.remote_addr`
+
+ .. _ProxyFix:
+ https://werkzeug.palletsprojects.com/middleware/proxy_fix/
+
+ .. _X-Forwarded-For:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
+
+ """
+
+ forwarded_for = request.headers.get("X-Forwarded-For")
+ real_ip = request.headers.get('X-Real-IP')
+ remote_addr = request.remote_addr
+ logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr)
+
+ if not forwarded_for:
+ logger.error("X-Forwarded-For header is not set!")
+ else:
+ from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import
+
+ forwarded_for = [x.strip() for x in forwarded_for.split(',')]
+ x_for: int = get_cfg()['real_ip.x_for']
+ forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
+
+ if not real_ip:
+ logger.error("X-Real-IP header is not set!")
+
+ if forwarded_for and real_ip and forwarded_for != real_ip:
+ logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)
+
+ if forwarded_for and remote_addr and forwarded_for != remote_addr:
+ logger.warning(
+ "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
+ )
+
+ if real_ip and remote_addr and real_ip != remote_addr:
+ logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
+
+ request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
+ logger.debug("get_real_ip() -> %s", request_ip)
+ return request_ip
diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py
index 23670a283..60e2330ae 100644
--- a/searx/botdetection/http_accept.py
+++ b/searx/botdetection/http_accept.py
@@ -15,13 +15,15 @@ Accept_ header ..
"""
# pylint: disable=unused-argument
-from typing import Optional, Tuple
+from typing import Optional
import flask
+import werkzeug
from searx.tools import config
+from ._helpers import too_many_requests
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if 'text/html' not in request.accept_mimetypes:
- return 429, "bot detected, HTTP header Accept did not contain text/html"
+ return too_many_requests(request, "HTTP header Accept did not contain text/html")
return None
diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py
index 191249711..5301c5d9d 100644
--- a/searx/botdetection/http_accept_encoding.py
+++ b/searx/botdetection/http_accept_encoding.py
@@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header ..
"""
# pylint: disable=unused-argument
-from typing import Optional, Tuple
+from typing import Optional
import flask
+import werkzeug
from searx.tools import config
+from ._helpers import too_many_requests
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
- return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"
+ return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate")
return None
diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py
index 558a216cf..060f67ec0 100644
--- a/searx/botdetection/http_accept_language.py
+++ b/searx/botdetection/http_accept_language.py
@@ -13,13 +13,15 @@ if the Accept-Language_ header is unset.
"""
# pylint: disable=unused-argument
-from typing import Optional, Tuple
+from typing import Optional
import flask
+import werkzeug
from searx.tools import config
+from ._helpers import too_many_requests
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if request.headers.get('Accept-Language', '').strip() == '':
- return 429, "bot detected, missing HTTP header Accept-Language"
+ return too_many_requests(request, "missing HTTP header Accept-Language")
return None
diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py
index 0ef24a7b8..e718dfe3f 100644
--- a/searx/botdetection/http_connection.py
+++ b/searx/botdetection/http_connection.py
@@ -13,13 +13,15 @@ the Connection_ header is set to ``close``.
"""
# pylint: disable=unused-argument
-from typing import Optional, Tuple
+from typing import Optional
import flask
+import werkzeug
from searx.tools import config
+from ._helpers import too_many_requests
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if request.headers.get('Connection', '').strip() == 'close':
- return 429, "bot detected, HTTP header 'Connection=close'"
+ return too_many_requests(request, "HTTP header 'Connection=close")
return None
diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py
index 3d1ec9173..70309e975 100644
--- a/searx/botdetection/http_user_agent.py
+++ b/searx/botdetection/http_user_agent.py
@@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression
"""
# pylint: disable=unused-argument
-from typing import Optional, Tuple
+from typing import Optional
import re
import flask
+import werkzeug
from searx.tools import config
+from ._helpers import too_many_requests
USER_AGENT = (
@@ -48,11 +50,8 @@ def regexp_user_agent():
return _regexp
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent):
- return (
- 429,
- f"bot detected, HTTP header User-Agent: {user_agent}",
- )
+ return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}")
return None
diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py
index 9cffff7f0..e7fa57187 100644
--- a/searx/botdetection/ip_limit.py
+++ b/searx/botdetection/ip_limit.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
""".. _botdetection.ip_limit:
Method ``ip_limit``
@@ -37,16 +39,18 @@ droped.
"""
-from typing import Optional, Tuple
+from typing import Optional
import flask
+import werkzeug
from searx.tools import config
-
from searx import redisdb
from searx import logger
from searx.redislib import incr_sliding_window, drop_counter
from . import link_token
+from ._helpers import too_many_requests
+
logger = logger.getChild('botdetection.ip_limit')
@@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
-def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
+def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
+ # pylint: disable=too-many-return-statements
redis_client = redisdb.client()
- x_forwarded_for = request.headers.get('X-Forwarded-For', '')
- if not x_forwarded_for:
+ client_ip = request.headers.get('X-Forwarded-For', '')
+ if not client_ip:
logger.error("missing HTTP header X-Forwarded-For")
if request.args.get('format', 'html') != 'html':
- c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW)
+ c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW)
if c > API_MAX:
- return 429, "BLOCK %s: API limit exceeded"
-
- suspicious = False
- suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for
+ return too_many_requests(request, "too many request in API_WINDOW")
if cfg['botdetection.ip_limit.link_token']:
- suspicious = link_token.is_suspicious(request)
- if suspicious:
+ suspicious = link_token.is_suspicious(request, True)
+
+ if not suspicious:
+ # this IP is no longer suspicious: release ip again / delete the counter of this IP
+ drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip)
+ return None
# this IP is suspicious: count requests from this IP
- c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW)
+ c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW)
if c > SUSPICIOUS_IP_MAX:
- return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW"
+ logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip)
+ return flask.redirect(flask.url_for('index'), code=302)
- c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
+ c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
if c > BURST_MAX_SUSPICIOUS:
- return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS"
+ return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
- c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
+ c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
if c > LONG_MAX_SUSPICIOUS:
- return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS"
+ return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
- else:
+ return None
- if cfg['botdetection.ip_limit.link_token']:
- # this IP is no longer suspicious: release ip again / delete the counter of this IP
- drop_counter(redis_client, suspicious_ip_counter)
+ # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
+ c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
+ if c > BURST_MAX:
+ return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)")
- c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
- if c > BURST_MAX:
- return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX"
+ c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
+ if c > LONG_MAX:
+ return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)")
- c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
- if c > LONG_MAX:
- return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX"
return None
diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py
index cc1e00b3c..93826684f 100644
--- a/searx/botdetection/limiter.py
+++ b/searx/botdetection/limiter.py
@@ -42,6 +42,7 @@ from pathlib import Path
import flask
import pytomlpp as toml
+from searx import logger
from searx.tools import config
from searx.botdetection import (
http_accept,
@@ -62,7 +63,13 @@ CFG_DEPRECATED = {
# "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
-CFG = config.Config({}, {})
+CFG = None
+
+
+def get_cfg() -> config.Config:
+ if CFG is None:
+ init_cfg(logger)
+ return CFG
def init_cfg(log):
@@ -73,7 +80,7 @@ def init_cfg(log):
log.warning("missing config file: %s", LIMITER_CFG)
return
- log.warning("load config file: %s", LIMITER_CFG)
+ log.info("load config file: %s", LIMITER_CFG)
try:
upd_cfg = toml.load(LIMITER_CFG)
except toml.DecodeError as exc:
diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py
index 8ef215f6c..376d06d61 100644
--- a/searx/botdetection/link_token.py
+++ b/searx/botdetection/link_token.py
@@ -47,15 +47,24 @@ from searx.redislib import secret_hash
TOKEN_LIVE_TIME = 600
"""Livetime (sec) of limiter's CSS token."""
+PING_LIVE_TIME = 3600
+"""Livetime (sec) of the ping-key from a client (request)"""
+
PING_KEY = 'SearXNG_limiter.ping'
+"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
+
TOKEN_KEY = 'SearXNG_limiter.token'
+"""Key for which the current token is stored in the DB"""
logger = logger.getChild('botdetection.link_token')
-def is_suspicious(request: flask.Request):
+def is_suspicious(request: flask.Request, renew: bool = False):
"""Checks if there is a valid ping for this request, if not this request is
- rated as *suspicious*"""
+ rated as *suspicious*. If a valid ping exists and argument ``renew`` is
+ ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`.
+
+ """
redis_client = redisdb.client()
if not redis_client:
return False
@@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request):
)
return True
- logger.debug("found ping for this request: %s", ping_key)
+ if renew:
+ redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
+
+ logger.debug("found ping for client request: %s", ping_key)
return False
def ping(request: flask.Request, token: str):
- """This function is called by a request to URL ``/client<token>.css``"""
+ """This function is called by a request to URL ``/client<token>.css``. If
+ ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
+ The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
+
+ """
redis_client = redisdb.client()
if not redis_client:
return
@@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str):
return
ping_key = get_ping_key(request)
logger.debug("store ping for: %s", ping_key)
- redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME)
+ redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
def get_ping_key(request: flask.Request):
- """Generates a hashed key that fits (more or less) to a request. At least
- X-Forwarded-For_ is needed to be able to assign the request to an IP.
+ """Generates a hashed key that fits (more or less) to a client (request).
+ At least X-Forwarded-For_ is needed to be able to assign the request to an
+ IP.
"""
- return secret_hash(
+ return (
PING_KEY
- + request.headers.get('X-Forwarded-For', '')
- + request.headers.get('Accept-Language', '')
- + request.headers.get('User-Agent', '')
+ + "["
+ + secret_hash(
+ request.headers.get('X-Forwarded-For', '')
+ + request.headers.get('Accept-Language', '')
+ + request.headers.get('User-Agent', '')
+ )
+ + "]"
)