diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-05-23 18:16:37 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-05-29 14:54:56 +0200 |
commit | 1ec325adccc427fe05cf08da9a2d9d63da7365f4 (patch) | |
tree | 476f5efc3169db1f1a1c474968a1148d20728cca /searx/botdetection/http_user_agent.py | |
parent | 5226044c13817688a5ca3461743844dca4ed3d2b (diff) | |
download | searxng-1ec325adccc427fe05cf08da9a2d9d63da7365f4.tar.gz searxng-1ec325adccc427fe05cf08da9a2d9d63da7365f4.zip |
[mod] limiter -> botdetection: modularization and documentation
In order to be able to meet the outstanding requirements, the implementation is
modularized and supplemented with documentation.
This patch does not contain functional change, except it fixes issue #2455
----
Aktivate limiter in the settings.yml and simulate a bot request by::
curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \
-H 'Accept: text/html'
-H 'User-Agent: xyz' \
-H 'Accept-Encoding: gzip' \
'http://127.0.0.1:8888/search?q=foo'
In the LOG:
DEBUG searx.botdetection.link_token : missing ping for this request: .....
Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two time
before you get a "Too Many Requests" response.
Closes: https://github.com/searxng/searxng/issues/2455
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/botdetection/http_user_agent.py')
-rw-r--r-- | searx/botdetection/http_user_agent.py | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..892ae0bd9 --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. _User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + +from typing import Optional, Tuple +import re +import flask + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return ( + 429, + f"bot detected, HTTP header User-Agent: {user_agent}", + ) + return None |