summaryrefslogtreecommitdiff
path: root/searx/plugins/limiter.py
blob: c7d74248b9573d54266b6af7ca8988165cfa3ec4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pyright: basic
"""Some bot protection / rate limitation

To monitor rate limits and protect privacy the IP addresses are getting stored
with a hash so the limiter plugin knows who to block.  A redis database is
needed to store the hash values.

Enable the plugin in ``settings.yml``:

- ``server.limiter: true``
- ``redis.url: ...`` check the value, see :ref:`settings redis`
"""

import re
from flask import request

from searx import redisdb
from searx.plugins import logger
from searx.redislib import incr_sliding_window, secret_hash

# Plugin metadata exposed as module-level attributes.
# NOTE(review): presumably read by the plugin loader in ``searx.plugins`` --
# confirm there before renaming any of these.
name = "Request limiter"
description = "Limit the number of request"
default_on = False
preference_section = 'service'
# Rebinds the ``logger`` imported from ``searx.plugins`` to its 'limiter'
# child, so every log call below in this module goes through the child logger.
logger = logger.getChild('limiter')

# User-Agent values that get rejected.  The pattern is one capture group of
# '|'-separated alternatives.  It is applied with ``.match()`` in
# is_accepted_request(), so an alternative only hits when the User-Agent
# *starts* with it; the PetalBot entry carries its own ``.*`` and therefore
# hits anywhere in the string.  The order of the alternatives (and the repeated
# MJ12bot entry) is kept as-is so the compiled pattern does not change.
block_user_agent = re.compile(
    '('
    + '|'.join(
        (
            r'unknown',
            r'[Cc][Uu][Rr][Ll]',
            r'[wW]get',
            r'Scrapy',
            r'splash',
            r'JavaFX',
            r'FeedFetcher',
            r'python-requests',
            r'Go-http-client',
            r'Java',
            r'Jakarta',
            r'okhttp',
            r'HttpClient',
            r'Jersey',
            r'Python',
            r'libwww-perl',
            r'Ruby',
            r'SynHttpClient',
            r'UniversalFeedParser',
            r'Googlebot',
            r'GoogleImageProxy',
            r'bingbot',
            r'Baiduspider',
            r'yacybot',
            r'YandexMobileBot',
            r'YandexBot',
            r'Yahoo! Slurp',
            r'MJ12bot',
            r'AhrefsBot',
            r'archive.org_bot',
            r'msnbot',
            r'MJ12bot',
            r'SeznamBot',
            r'linkdexbot',
            r'Netvibes',
            r'SMTBot',
            r'zgrab',
            r'James BOT',
            r'Sogou',
            r'Abonti',
            r'Pixray',
            r'Spinn3r',
            r'SemrushBot',
            r'Exabot',
            r'ZmEu',
            r'BLEXBot',
            r'bitlybot',
            # unmaintained Farside instances
            re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)'),
            r'.*PetalBot.*',
        )
    )
    + ')'
)

# Prefix of the redis key written by ping() and looked up by
# is_accepted_request(): the User-Agent and X-Forwarded-For header values are
# appended to it and the result is passed through secret_hash().
PING_KEY = 'SearXNG_limiter.ping'
# Not referenced by any code visible in this module.
# NOTE(review): presumably consumed elsewhere -- confirm before removing.
TOKEN_KEY = 'SearXNG_limiter.token'


def ping():
    """Record in redis that the current client has sent a ping.

    The key is the :py:func:`secret_hash` of :py:obj:`PING_KEY` followed by
    the request's ``User-Agent`` (``'unknown'`` when the header is missing)
    and ``X-Forwarded-For`` (empty string when missing) values.  The value
    ``1`` is stored with ``ex=600``.  is_accepted_request() looks up the same
    key to grant higher rate limits.
    """
    client = redisdb.client()
    headers = request.headers
    client_id = headers.get('User-Agent', 'unknown') + headers.get('X-Forwarded-For', '')
    client.set(secret_hash(PING_KEY + client_id), 1, ex=600)


def is_accepted_request() -> bool:
    """Decide whether the current flask request passes the limiter.

    The checks run in this order:

    1. ``/healthz`` is always accepted.
    2. A ``User-Agent`` matched by :py:obj:`block_user_agent` is rejected
       (a missing header counts as ``'unknown'``, which is in the block list).
    3. Only for ``/search``:

       - two :py:func:`incr_sliding_window` counters keyed on the
         ``X-Forwarded-For`` value (windows ``20`` and ``600``) are compared
         against a burst / 10 minute maximum; the maxima are raised when the
         key written by :py:func:`ping` exists in redis,
       - header heuristics: ``Accept-Language`` must not be empty,
         ``Connection`` must not be ``close``, ``Accept-Encoding`` must list
         ``gzip`` or ``deflate`` and ``Accept`` must cover ``text/html``,
       - a request whose ``format`` argument is not ``html`` is additionally
         counted in an "API limit" window of ``3600`` and rejected once that
         counter exceeds 4.

    :return: ``True`` when the request is accepted, ``False`` when it has to
        be blocked.
    """
    # pylint: disable=too-many-return-statements
    redis_client = redisdb.client()
    user_agent = request.headers.get('User-Agent', 'unknown')
    # NOTE(review): when the header is absent every such client shares the
    # same (empty) rate-limit key -- presumably a reverse proxy always sets it.
    x_forwarded_for = request.headers.get('X-Forwarded-For', '')

    if request.path == '/healthz':
        return True

    # .match() anchors at the start of the User-Agent string
    if block_user_agent.match(user_agent):
        logger.debug("BLOCK %s: %s --> detected User-Agent: %s", x_forwarded_for, request.path, user_agent)
        return False

    if request.path == '/search':

        c_burst_max = 2
        c_10min_max = 10

        # clients that called ping() within the key's lifetime get higher limits
        ping_key = PING_KEY + user_agent + x_forwarded_for
        if redis_client.get(secret_hash(ping_key)):
            logger.debug('got a ping')
            c_burst_max = 15
            c_10min_max = 150
        else:
            logger.debug('missing a ping')

        # both counters are incremented before the header checks below, so
        # requests rejected by those checks still count towards the limits
        c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
        c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
        if c_burst > c_burst_max or c_10min > c_10min_max:
            logger.debug("BLOCK %s: to many request", x_forwarded_for)
            return False

        # The previous check compared ``len(...)`` with ``''`` which is never
        # true; test the stripped header value for emptiness instead.
        if not request.headers.get('Accept-Language', '').strip():
            logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
            return False

        if request.headers.get('Connection') == 'close':
            logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
            return False

        accept_encoding_list = [enc.strip() for enc in request.headers.get('Accept-Encoding', '').split(',')]
        if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
            logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
            return False

        # this check is about the Accept header (mimetypes), not Accept-Encoding
        if 'text/html' not in request.accept_mimetypes:
            logger.debug("BLOCK %s: Accept misses text/html", x_forwarded_for)
            return False

        if request.args.get('format', 'html') != 'html':
            c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
            if c > 4:
                logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
                return False

    # arguments are passed to the logger so the message is only rendered when
    # debug logging is enabled; the rendered text is the same as before
    logger.debug(
        "OK %s: '%s'"
        " || form: %s"
        " || Accept: %s"
        " || Accept-Language: %s"
        " || Accept-Encoding: %s"
        " || Content-Type: %s"
        " || Content-Length: %s"
        " || Connection: %s"
        " || User-Agent: %s",
        x_forwarded_for,
        request.path,
        request.form,
        request.headers.get('Accept', ''),
        request.headers.get('Accept-Language', ''),
        request.headers.get('Accept-Encoding', ''),
        request.headers.get('Content-Type', ''),
        request.headers.get('Content-Length', ''),
        request.headers.get('Connection', ''),
        user_agent,
    )

    return True


def pre_request():
    """Request hook that init() registers through ``app.before_request``.

    :return: ``None`` when is_accepted_request() accepts the request,
        otherwise the tuple ``('Too Many Requests', 429)``.
    """
    if is_accepted_request():
        return None
    return 'Too Many Requests', 429


def init(app, settings):
    """Activate the limiter on ``app`` if it is enabled and redis is available.

    :param app: object offering ``before_request(func)``; pre_request() is
        registered on it.
    :param settings: mapping; ``settings['server']['limiter']`` switches the
        limiter on (truthy) or off (falsy).
    :return: ``True`` when the hook has been registered, ``False`` when the
        limiter is switched off or ``redisdb.client()`` yields nothing usable.
    """
    limiter_enabled = settings['server']['limiter']
    if not limiter_enabled:
        return False

    if redisdb.client():
        app.before_request(pre_request)
        return True

    logger.error("The limiter requires Redis")  # pylint: disable=undefined-variable
    return False