summaryrefslogtreecommitdiff
path: root/searx/botdetection/_helpers.py
blob: 365067c24d723687b22b81e4e342c123ff7870de (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring, invalid-name
from __future__ import annotations

from ipaddress import (
    IPv4Network,
    IPv6Network,
    IPv4Address,
    IPv6Address,
    ip_network,
)
import flask
import werkzeug

from searx import logger
from . import config

logger = logger.getChild('botdetection')


def dump_request(request: flask.Request):
    """Return a one-line, ``||``-separated summary of ``request`` for log messages.

    The summary starts with the request path, followed by the proxy headers
    (``X-Forwarded-For``, ``X-Real-IP``), the form data and a fixed selection
    of further request headers.  A header that is missing from the request is
    rendered as ``None``.
    """
    path = request.path
    headers = request.headers

    # (label, value) pairs in the order they appear in the summary
    fields = [
        ('X-Forwarded-For', headers.get('X-Forwarded-For')),
        ('X-Real-IP', headers.get('X-Real-IP')),
        ('form', request.form),
    ]
    for header_name in (
        'Accept',
        'Accept-Language',
        'Accept-Encoding',
        'Content-Type',
        'Content-Length',
        'Connection',
        'User-Agent',
    ):
        fields.append((header_name, headers.get(header_name)))

    return path + "".join(f" || {label}: {value!s}" for label, value in fields)


def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
    """Build the default ``Too Many Requests`` (HTTP 429) response.

    Before the response is created, a ``BLOCK <network>: <log_msg>`` record is
    written at DEBUG level to the 'botdetection' logger.

    :param network: network being blocked; its compressed notation is logged
    :param log_msg: reason for the block, appended to the log record
    :return: response object with body ``Too Many Requests`` and status 429
    """
    blocked_net = network.compressed
    logger.debug("BLOCK %s: %s", blocked_net, log_msg)

    response = flask.make_response(('Too Many Requests', 429))
    return response


def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network:
    """Return the (client) network that ``real_ip`` is part of.

    The prefix length is read from ``cfg``: ``real_ip.ipv6_prefix`` for an
    IPv6 address, ``real_ip.ipv4_prefix`` otherwise.

    :param real_ip: address of the client
    :param cfg: configuration providing the two prefix-length settings
    :return: network built from ``real_ip`` and the configured prefix length
    """
    prefix_key = 'real_ip.ipv6_prefix' if real_ip.version == 6 else 'real_ip.ipv4_prefix'
    prefix_len = cfg[prefix_key]

    # strict=False: real_ip is a host address, its host bits are masked out
    return ip_network(f"{real_ip}/{prefix_len}", strict=False)


# Error messages already emitted by _log_error_only_once()
_logged_errors = []


def _log_error_only_once(err_msg):
    """Log ``err_msg`` at ERROR level, unless the same message was logged before."""
    if err_msg in _logged_errors:
        return
    logger.error(err_msg)
    _logged_errors.append(err_msg)


def get_real_ip(request: flask.Request) -> str:
    """Return the real IP of the request.

    Not every proxy sets all of the HTTP headers and incoming headers can be
    faked, so it may happen that the IP cannot be determined correctly.

    .. sidebar:: :py:obj:`flask.Request.remote_addr`

       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).

    The candidates are evaluated in the order listed below.  In addition, some
    consistency checks are done: a missing header is logged as an error (once
    per message) and every pair of candidates that disagree is logged as a
    warning.

    The remote IP of the request is taken from (first match):

    - X-Forwarded-For_ header
    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
    - :py:obj:`flask.Request.remote_addr`

    If none of them yields a value, ``'0.0.0.0'`` is returned.

    .. _ProxyFix:
       https://werkzeug.palletsprojects.com/middleware/proxy_fix/

    .. _X-Forwarded-For:
      https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For

    """

    xff_header = request.headers.get("X-Forwarded-For")
    x_real_ip = request.headers.get('X-Real-IP')
    wsgi_addr = request.remote_addr

    # Stays falsy when the header is missing or empty
    forwarded_ip = xff_header
    if xff_header:
        from . import cfg  # pylint: disable=import-outside-toplevel, cyclic-import

        hops = [hop.strip() for hop in xff_header.split(',')]
        x_for: int = cfg['real_ip.x_for']  # type: ignore
        # Pick the x_for-th entry counted from the right, or the left-most one
        # if the header has fewer entries.
        # NOTE(review): x_for == 0 yields index 0, i.e. the left-most entry --
        # confirm that the config schema rules this value out.
        forwarded_ip = hops[-min(len(hops), x_for)]
    else:
        _log_error_only_once("X-Forwarded-For header is not set!")

    if not x_real_ip:
        _log_error_only_once("X-Real-IP header is not set!")

    # (log message, first IP, second IP): warn about each pair that disagrees
    mismatch_checks = (
        ("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", x_real_ip, forwarded_ip),
        ("IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", wsgi_addr, forwarded_ip),
        ("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", wsgi_addr, x_real_ip),
    )
    for msg, first_ip, second_ip in mismatch_checks:
        if first_ip and second_ip and first_ip != second_ip:
            logger.warning(msg, first_ip, second_ip)

    return forwarded_ip or x_real_ip or wsgi_addr or '0.0.0.0'