# cgit web view navigation: summary | refs | log | tree | commit | diff
# path: root/qutebrowser/utils/urlmatch.py
# blob: f57e7d793b4f619b3ac99e4f9f50892ecc32f3e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# Copyright 2018-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.

"""A Chromium-like URL matching pattern.

See:
https://developer.chrome.com/apps/match_patterns
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h

Based on the following commit in Chromium:
https://chromium.googlesource.com/chromium/src/+/6f4a6681eae01c2036336c18b06303e16a304a7c
(October 10 2020, newest commit as of October 28th 2020)
"""

import ipaddress
import fnmatch
import urllib.parse
from typing import Any, Optional, Tuple

from qutebrowser.qt.core import QUrl

from qutebrowser.utils import utils, qtutils


class ParseError(Exception):

    """Error signalling that a URL pattern string is not parseable."""


class UrlPattern:

    """A Chromium-like URL matching pattern.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        host: The host to match to, or None for any host.
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern: str) -> None:
        """Parse the given pattern string.

        Args:
            pattern: The pattern to parse, e.g. '*://*.example.com/*' or the
                     special value '<all_urls>'.

        Raises:
            ParseError: If the pattern contains a NUL byte, can't be parsed
                        by urllib.parse, or one of its components is invalid.
        """
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains: bool = False
        self._scheme: Optional[str] = None
        self.host: Optional[str] = None
        self._path: Optional[str] = None
        self._port: Optional[int] = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            # Chain explicitly so the original parser error stays attached as
            # the cause of the ParseError.
            raise ParseError(str(e)) from e

        assert parsed is not None

        # Order matters: _init_host, _init_path and _init_port all read
        # self._scheme, which is set by _init_scheme.
        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self) -> Tuple[
        bool,  # _match_all
        bool,  # _match_subdomains
        Optional[str],  # _scheme
        Optional[str],  # host
        Optional[str],  # _path
        Optional[int],  # _port
    ]:
        """Get a tuple with the information used for __eq__/__hash__.

        The original pattern string is deliberately not part of the tuple, so
        two differently spelled patterns which parse to the same components
        compare equal.
        """
        return (self._match_all, self._match_subdomains, self._scheme,
                self.host, self._path, self._port)

    def __hash__(self) -> int:
        """Hash based on the parsed components (see _to_tuple)."""
        return hash(self._to_tuple())

    def __eq__(self, other: Any) -> bool:
        """Compare the parsed components with another UrlPattern.

        Returns NotImplemented for objects which aren't a UrlPattern.
        """
        if not isinstance(other, UrlPattern):
            return NotImplemented
        return self._to_tuple() == other._to_tuple()

    def __repr__(self) -> str:
        """Get a debug representation, built by utils.get_repr."""
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self) -> str:
        """Get the pattern string exactly as it was passed to __init__."""
        return self._pattern

    def _fixup_pattern(self, pattern: str) -> str:
        """Make sure the given pattern is parseable by urllib.parse.

        Args:
            pattern: The raw pattern given by the user.

        Return:
            The pattern with a wildcard/missing scheme replaced by the
            placeholder scheme 'any', and file://foo rewritten to file:///foo.
        """
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparsable
            pattern = 'any:' + pattern[2:]

        # str.startswith accepts a tuple of prefixes.
        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the scheme from the given URL.

        The placeholder scheme 'any' (inserted by _fixup_pattern) is stored as
        None, i.e. "match any scheme".

        Deviation from Chromium:
        - We assume * when no scheme has been given.

        Raises:
            ParseError: If the parsed URL has no scheme.
        """
        if not parsed.scheme:
            raise ParseError("Missing scheme")

        if parsed.scheme == 'any':
            self._scheme = None
            return

        self._scheme = parsed.scheme

    def _init_path(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the path from the given URL.

        A path of '/*' or an empty path is stored as None ("any path").

        Deviation from Chromium:
        - We assume * when no path has been given.

        Raises:
            ParseError: If the scheme is 'about' and the path is empty or only
                        whitespace.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path == '/*':
            self._path = None
        elif not parsed.path:
            # When the user doesn't add a trailing slash, we assume the pattern
            # matches any path.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the host from the given URL.

        Sets self.host and self._match_subdomains. A host of '*' is stored as
        None with subdomain matching enabled; '*.foo' is stored as 'foo' with
        subdomain matching enabled. Trailing dots are stripped from the host.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        - We don't allow patterns for dot/space hosts which QUrl considers
          invalid.

        Raises:
            ParseError: If the host is missing for a scheme which needs one,
                        is rejected by QUrl (bracketed hosts), consists only
                        of dots/spaces, or has a wildcard anywhere other than
                        as a leading '*' / '*.'.
        """
        if parsed.hostname is None or not parsed.hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self.host is None
            return

        if parsed.netloc.startswith('['):
            # Using QUrl parsing to minimize ipv6 addresses
            url = QUrl()
            url.setHost(parsed.hostname)
            if not url.isValid():
                raise ParseError(url.errorString())
            self.host = url.host()
            return

        if parsed.hostname == '*':
            self._match_subdomains = True
            hostname = None
        elif parsed.hostname.startswith('*.'):
            if len(parsed.hostname) == 2:
                # We don't allow just '*.' as a host.
                raise ParseError("Pattern without host")
            self._match_subdomains = True
            hostname = parsed.hostname[2:]
        elif set(parsed.hostname) in {frozenset('.'), frozenset('. ')}:
            # The host consists of nothing but dots (and possibly spaces).
            raise ParseError("Invalid host")
        else:
            hostname = parsed.hostname

        if hostname is None:
            self.host = None
        elif '*' in hostname:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")
        else:
            self.host = hostname.rstrip('.')

    def _init_port(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.

        Raises:
            ParseError: If the port is empty, can't be read from the parsed
                        URL, or is given for a scheme which isn't listed in
                        _DEFAULT_PORTS.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                raise ParseError("Invalid port: {}".format(e)) from e

        # Ports are only meaningful for schemes with a known default port, or
        # when any scheme is matched.
        scheme_has_port = (self._scheme in self._DEFAULT_PORTS or
                           self._scheme is None)
        if self._port is not None and not scheme_has_port:
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme: str) -> bool:
        """Check whether the given scheme is matched by this pattern."""
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host: str) -> bool:
        """Check whether the given host is matched by this pattern.

        Trailing dots of the given host are ignored. With subdomain matching
        enabled, proper subdomains of self.host match as well, unless the
        given host is an IP address.
        """
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self.host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self.host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # Check if the test host is a subdomain of our host.
        # It needs at least one character plus a dot in front of self.host.
        if len(host) <= (len(self.host) + 1):
            return False

        if not host.endswith(self.host):
            return False

        # The character right before the matched suffix must be a dot, so that
        # e.g. 'notexample.com' isn't treated as subdomain of 'example.com'.
        return host[len(host) - len(self.host) - 1] == '.'

    def _matches_port(self, scheme: str, port: int) -> bool:
        """Check whether the given port is matched by this pattern.

        A port of -1 is replaced by the scheme's entry in _DEFAULT_PORTS (if
        there is one) before comparing.
        """
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path: str) -> bool:
        """Match the URL's path.

        Deviations from Chromium:
        - Chromium only matches <all_urls> with "javascript:" (pathless); but
          we also match *://*/* and friends.
        """
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl: QUrl) -> bool:
        """Check if the pattern matches the given QUrl.

        Args:
            qurl: The URL to test; it is passed to qtutils.ensure_valid first.

        Return:
            True if scheme, host, port and path all match (or the pattern is
            <all_urls>), False otherwise.
        """
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True