1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
|
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
# Copyright 2018-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <https://www.gnu.org/licenses/>.
"""A Chromium-like URL matching pattern.
See:
https://developer.chrome.com/apps/match_patterns
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h

Based on the following commit in Chromium:
https://chromium.googlesource.com/chromium/src/+/6f4a6681eae01c2036336c18b06303e16a304a7c
(October 10 2020, newest commit as of October 28th 2020)
"""
import ipaddress
import fnmatch
import urllib.parse
from typing import Any, Optional, Tuple
from qutebrowser.qt.core import QUrl
from qutebrowser.utils import utils, qtutils
class ParseError(Exception):

    """Raised when a pattern could not be parsed.

    The exception message is a human-readable description of what is wrong
    with the pattern (e.g. a missing host or an invalid port).
    """
class UrlPattern:

    """A Chromium-like URL matching pattern.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        host: The host to match to, or None for any host.
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern: str) -> None:
        """Parse the given pattern.

        Args:
            pattern: The pattern string, e.g. '*://*.example.com/*', or the
                     special value '<all_urls>'.

        Raises:
            ParseError: If the pattern contains a NUL byte, can't be parsed
                        by urllib, or any of its components is invalid.
        """
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains: bool = False
        self._scheme: Optional[str] = None
        self.host: Optional[str] = None
        self._path: Optional[str] = None
        self._port: Optional[int] = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            # Chain the original error so the cause shows up in tracebacks.
            raise ParseError(str(e)) from e

        assert parsed is not None

        # The scheme needs to be initialized first, as the host and port
        # checks below depend on self._scheme.
        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self) -> Tuple[
        bool,  # _match_all
        bool,  # _match_subdomains
        Optional[str],  # _scheme
        Optional[str],  # host
        Optional[str],  # _path
        Optional[int],  # _port
    ]:
        """Get a tuple with the information used for __eq__/__hash__.

        The original pattern string is deliberately not part of the tuple,
        so two differently spelled patterns which parse to the same
        components compare equal.
        """
        return (self._match_all, self._match_subdomains, self._scheme,
                self.host, self._path, self._port)

    def __hash__(self) -> int:
        """Hash based on the parsed components, consistent with __eq__."""
        return hash(self._to_tuple())

    def __eq__(self, other: Any) -> bool:
        """Compare the parsed components with another UrlPattern."""
        if not isinstance(other, UrlPattern):
            return NotImplemented
        return self._to_tuple() == other._to_tuple()

    def __repr__(self) -> str:
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self) -> str:
        """Get the pattern exactly as it was passed to the constructor."""
        return self._pattern

    def _fixup_pattern(self, pattern: str) -> str:
        """Make sure the given pattern is parseable by urllib.parse.

        Args:
            pattern: The pattern as given by the user.

        Return:
            The pattern with a wildcard or missing scheme replaced by the
            placeholder scheme 'any', and file://foo rewritten to
            file:///foo.
        """
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparsable
            pattern = 'any:' + pattern[2:]

        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the scheme from the given URL.

        Deviation from Chromium:
        - We assume * when no scheme has been given.

        Raises:
            ParseError: If the parsed URL has no scheme at all.
        """
        if not parsed.scheme:
            raise ParseError("Missing scheme")

        # 'any' is the placeholder inserted by _fixup_pattern for '*' or a
        # missing scheme.
        self._scheme = None if parsed.scheme == 'any' else parsed.scheme

    def _init_path(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the path from the given URL.

        Deviation from Chromium:
        - We assume * when no path has been given.

        Raises:
            ParseError: If an about: pattern has no path.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path in ('', '/*'):
            # When the user doesn't add a trailing slash (empty path), we
            # assume the pattern matches any path, just like with '/*'.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the host from the given URL.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        - We don't allow patterns for dot/space hosts which QUrl considers
          invalid.

        Raises:
            ParseError: If the host is missing (for a scheme which needs
                        one), invalid, or uses an unsupported wildcard.
        """
        hostname = parsed.hostname

        if hostname is None or not hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self.host is None
            return

        if parsed.netloc.startswith('['):
            # Using QUrl parsing to minimize ipv6 addresses
            url = QUrl()
            url.setHost(hostname)
            if not url.isValid():
                raise ParseError(url.errorString())
            self.host = url.host()
            return

        if hostname == '*':
            # Any host at all.
            self._match_subdomains = True
            self.host = None
            return

        if hostname.startswith('*.'):
            if len(hostname) == 2:
                # We don't allow just '*.' as a host.
                raise ParseError("Pattern without host")
            self._match_subdomains = True
            hostname = hostname[2:]
        elif set(hostname) <= {'.', ' '}:
            # Hosts consisting only of dots (and spaces). Empty and
            # whitespace-only hosts were already handled above.
            raise ParseError("Invalid host")

        if '*' in hostname:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")

        self.host = hostname.rstrip('.')

    def _init_port(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.

        Raises:
            ParseError: If the port is empty or invalid, or if a port was
                        given for a scheme which doesn't support ports.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                raise ParseError("Invalid port: {}".format(e)) from e

        scheme_has_port = (self._scheme is None or
                           self._scheme in self._DEFAULT_PORTS)
        if self._port is not None and not scheme_has_port:
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme: str) -> bool:
        """Check if the given scheme matches (None matches any scheme)."""
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host: str) -> bool:
        """Check if the given host matches the pattern's host.

        Trailing dots of the given host are ignored.
        """
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self.host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self.host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # Check if the test host is a subdomain of our host.
        if len(host) <= (len(self.host) + 1):
            return False

        if not host.endswith(self.host):
            return False

        # The character right before the matched suffix needs to be a dot, so
        # that e.g. "notexample.com" doesn't match "*.example.com".
        return host[len(host) - len(self.host) - 1] == '.'

    def _matches_port(self, scheme: str, port: int) -> bool:
        """Check if the given port matches (None matches any port).

        Args:
            scheme: The scheme of the URL, used to look up the default port.
            port: The port of the URL, or -1 if there is none.
        """
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path: str) -> bool:
        """Match the URL's path.

        Deviations from Chromium:
        - Chromium only matches <all_urls> with "javascript:" (pathless); but
          we also match *://*/* and friends.
        """
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        # Note that fnmatch also treats '?' and '[...]' in the pattern as
        # wildcards, not only '*'.
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl: "QUrl") -> bool:
        """Check if the pattern matches the given QUrl.

        Args:
            qurl: The URL to check; validated via qtutils.ensure_valid.

        Return:
            True if scheme, host, port and path all match (or if this is an
            <all_urls> pattern), False otherwise.
        """
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True
|