diff options
Diffstat (limited to 'searx/botdetection')
-rw-r--r-- | searx/botdetection/__init__.py | 26 | ||||
-rw-r--r-- | searx/botdetection/http_accept.py | 24 | ||||
-rw-r--r-- | searx/botdetection/http_accept_encoding.py | 26 | ||||
-rw-r--r-- | searx/botdetection/http_accept_language.py | 23 | ||||
-rw-r--r-- | searx/botdetection/http_connection.py | 23 | ||||
-rw-r--r-- | searx/botdetection/http_user_agent.py | 54 | ||||
-rw-r--r-- | searx/botdetection/ip_limit.py | 90 | ||||
-rw-r--r-- | searx/botdetection/limiter.py | 79 | ||||
-rw-r--r-- | searx/botdetection/link_token.py | 126 |
9 files changed, 471 insertions, 0 deletions
diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py new file mode 100644 index 000000000..78a7d30f3 --- /dev/null +++ b/searx/botdetection/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection src: + +Bot detection methods +--------------------- + +The methods implemented in this python package are used by the :ref:`limiter src`. + +""" + +import flask + + +def dump_request(request: flask.Request): + return ( + "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path) + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py new file mode 100644 index 000000000..1ab7cb4c1 --- /dev/null +++ b/searx/botdetection/http_accept.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept`` +---------------------- + +The ``http_accept`` method evaluates a request as the request of a bot if the +Accept_ header .. + +- did not contain ``text/html`` + +.. 
_Accept: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if 'text/html' not in request.accept_mimetypes: + return 429, "bot detected, HTTP header Accept did not contain text/html" + return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py new file mode 100644 index 000000000..ae630fd68 --- /dev/null +++ b/searx/botdetection/http_accept_encoding.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_encoding`` +------------------------------- + +The ``http_accept_encoding`` method evaluates a request as the request of a +bot if the Accept-Encoding_ header .. + +- did not contain ``gzip`` AND ``deflate`` (if both values are missed) +- did not contain ``text/html`` + +.. _Accept-Encoding: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if not ('gzip' in accept_list or 'deflate' in accept_list): + return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py new file mode 100644 index 000000000..06743802e --- /dev/null +++ b/searx/botdetection/http_accept_language.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_language`` +------------------------------- + +The ``http_accept_language`` method evaluates a request as the request of a bot +if the Accept-Language_ header is unset. + +.. 
_Accept-Language: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Accept-Language', '').strip() == '': + return 429, "bot detected, missing HTTP header Accept-Language" + return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py new file mode 100644 index 000000000..f61f5e48c --- /dev/null +++ b/searx/botdetection/http_connection.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_connection`` +-------------------------- + +The ``http_connection`` method evaluates a request as the request of a bot if +the Connection_ header is set to ``close``. + +.. _Connection: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Connection', '').strip() == 'close': + return 429, "bot detected, HTTP header 'Connection=close'" + return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..892ae0bd9 --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. 
_User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + +from typing import Optional, Tuple +import re +import flask + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return ( + 429, + f"bot detected, HTTP header User-Agent: {user_agent}", + ) + return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py new file mode 100644 index 000000000..fce3f8b67 --- /dev/null +++ b/searx/botdetection/ip_limit.py @@ -0,0 +1,90 @@ +""" +Method ``ip_limit`` +------------------- + +The ``ip_limit`` method counts requests from an IP in *sliding windows*. If +there are too many requests in a sliding window, the request is evaluated as a +bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ +header. To preserve privacy, only the hash value of an IP is stored in the redis DB, +and for at most 10 minutes. 
+ +The :py:obj:`link_token` method is used to investigate whether a request is +*suspicious*. If the :py:obj:`link_token` method is activated and a request is +*suspicious* the request rates are reduced: + +- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` +- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` + +.. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +from typing import Optional, Tuple +import flask + +from searx import redisdb +from searx import logger +from searx.redislib import incr_sliding_window + +from . import link_token + +logger = logger.getChild('botdetection.ip_limit') + +BURST_WINDOW = 20 +"""Time (sec) before sliding window for *burst* requests expires.""" + +BURST_MAX = 15 +"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" + +BURST_MAX_SUSPICIOUS = 2 +"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" + +LONG_WINDOW = 600 +"""Time (sec) before the longer sliding window expires.""" + +LONG_MAX = 150 +"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" + +LONG_MAX_SUSPICIOUS = 10 +"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" + +API_WONDOW = 3600 +"""Time (sec) before sliding window for API requests (format != html) expires.""" + +API_MAX = 4 +"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + redis_client = redisdb.client() + + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + if not x_forwarded_for: + logger.error("missing HTTP header X-Forwarded-For") + + if request.args.get('format', 'html') != 'html': + c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + if c > API_MAX: + return 429, "BLOCK %s: API limit exceeded" + + suspicious = link_token.is_suspicious(request) + + if suspicious: + c = incr_sliding_window(redis_client, 'IP limit - 
BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX_SUSPICIOUS: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX_SUSPICIOUS: + return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + + else: + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" + return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py new file mode 100644 index 000000000..71044c312 --- /dev/null +++ b/searx/botdetection/limiter.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _limiter src: + +Limiter +======= + +.. sidebar:: info + + The limiter requires a :ref:`Redis <settings redis>` database. + +Bot protection / IP rate limitation. The intention of rate limitation is to +limit suspicious requests from an IP. The motivation behind this is the fact +that SearXNG passes through requests from bots and is thus classified as a bot +itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked +by the search engine (the origin) in some other way. + +To avoid blocking, the requests from bots to SearXNG must also be blocked, this +is the task of the limiter. To perform this task, the limiter uses the methods +from the :py:obj:`searx.botdetection`. + +To enable the limiter activate: + +.. code:: yaml + + server: + ... + limiter: true # rate limit the number of request on the instance, block some bots + +and set the redis-url connection. 
Check the value, it depends on your redis DB +(see :ref:`settings redis`), for example: + +.. code:: yaml + + redis: + url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 + +""" + +from typing import Optional, Tuple +import flask + +from searx.botdetection import ( + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, +) + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + + if request.path == '/healthz': + return None + + for func in [ + http_user_agent, + ]: + val = func.filter_request(request) + if val is not None: + return val + + if request.path == '/search': + + for func in [ + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, + ]: + val = func.filter_request(request) + if val is not None: + return val + + return None diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py new file mode 100644 index 000000000..8ef215f6c --- /dev/null +++ b/searx/botdetection/link_token.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``link_token`` +--------------------- + +The ``link_token`` method evaluates a request as :py:obj:`suspicious +<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the +client. By adding a random component (the token) in the URL a bot can not send +a ping by requesting a static URL. + +.. note:: + + This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. + +To make use of this method, a flask URL route needs to be added: + +.. code:: python + + @app.route('/client<token>.css', methods=['GET', 'POST']) + def client_token(token=None): + link_token.ping(request, token) + return Response('', mimetype='text/css') + +And in the HTML template from flask a stylesheet link is needed (the value of +``link_token`` comes from :py:obj:`get_token`): + +.. 
code:: html + + <link rel="stylesheet" + href="{{ url_for('client_token', token=link_token) }}" + type="text/css" /> + +.. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +import string +import random +import flask + +from searx import logger +from searx import redisdb +from searx.redislib import secret_hash + +TOKEN_LIVE_TIME = 600 +"""Lifetime (sec) of limiter's CSS token.""" + +PING_KEY = 'SearXNG_limiter.ping' +TOKEN_KEY = 'SearXNG_limiter.token' + +logger = logger.getChild('botdetection.link_token') + + +def is_suspicious(request: flask.Request): + """Checks if there is a valid ping for this request, if not this request is + rated as *suspicious*""" + redis_client = redisdb.client() + if not redis_client: + return False + + ping_key = get_ping_key(request) + if not redis_client.get(ping_key): + logger.warning( + "missing ping (IP: %s) / request: %s", + request.headers.get('X-Forwarded-For', ''), + ping_key, + ) + return True + + logger.debug("found ping for this request: %s", ping_key) + return False + + +def ping(request: flask.Request, token: str): + """This function is called by a request to URL ``/client<token>.css``""" + redis_client = redisdb.client() + if not redis_client: + return + if not token_is_valid(token): + return + ping_key = get_ping_key(request) + logger.debug("store ping for: %s", ping_key) + redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + + +def get_ping_key(request: flask.Request): + """Generates a hashed key that fits (more or less) to a request. At least + X-Forwarded-For_ is needed to be able to assign the request to an IP. 
+ + """ + return secret_hash( + PING_KEY + + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + +def token_is_valid(token) -> bool: + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + +def get_token() -> str: + """Returns current token. If there is no currently active token a new token + is generated randomly and stored in the redis DB. + + - :py:obj:`TOKEN_LIVE_TIME` + - :py:obj:`TOKEN_KEY` + + """ + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) + return token |