summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFlorian Bruhin <me@the-compiler.org>2021-01-04 21:28:56 +0100
committerFlorian Bruhin <me@the-compiler.org>2021-01-05 11:19:15 +0100
commit40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170 (patch)
treed87fefb806c805ab4e50096d388a94edfca992ac
parentf0486432d13cc9563a4866c6cf96af98d0862035 (diff)
downloadqutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.tar.gz
qutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.zip
Remove custom data: URL parsing
Argh. I should've known Python can do this, but I didn't notice (or forgot?) and wrote code - some 140 lines of it - which was only in this repository for a couple of hours. Oh well, good riddance, code which was replaced by a simple "mimetypes.guess_type(url.toString())". May you be resurrected if we ever need a proper data: URL parser at a later point. I guess some lessons have to be learned the hard way...
-rw-r--r--qutebrowser/utils/urlutils.py52
-rw-r--r--tests/unit/utils/test_urlutils.py99
2 files changed, 3 insertions, 148 deletions
diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py
index fa7867d4d..b5afe958c 100644
--- a/qutebrowser/utils/urlutils.py
+++ b/qutebrowser/utils/urlutils.py
@@ -394,52 +394,6 @@ def get_path_if_valid(pathstr: str,
return path
-def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]:
- """Parse a data URL.
-
- Returns a tuple with:
- 1) The media type
- 2) Media type parameters (currently without any further parsing)
- 3) The (possibly decoded) data
-
- Based on https://en.wikipedia.org/wiki/Data_URI_scheme
-
- Possible further inspiration:
- https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384
- """
- ensure_valid(url)
- if url.scheme().lower() != 'data':
- raise Error(f"URL {url.toDisplayString()} has no data: scheme")
- if ',' not in url.path():
- raise Error("Missing comma")
-
- encoded = url.toEncoded().data()
- encoded = encoded[len('data:'):] # strip off scheme
- encoded = urllib.parse.unquote_to_bytes(encoded)
- encoded_header, data = encoded.split(b',', 1)
-
- try:
- header = encoded_header.decode('ascii')
- except UnicodeDecodeError as e:
- raise Error(f"Invalid header in {url.toDisplayString()}: {e}")
-
- b64_suffix = ';base64'
- if header.endswith(b64_suffix):
- header = header[:-len(b64_suffix)]
- data = base64.b64decode(data)
-
- if ';' in header:
- media_type, params = header.split(';', 1)
- else:
- media_type = header
- params = ''
-
- if not media_type:
- media_type = 'text/plain'
-
- return media_type, params, data
-
-
def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]:
"""Get a suitable filename from a URL.
@@ -454,11 +408,11 @@ def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]:
return fallback
if url.scheme().lower() == 'data':
- media_type, _params, _data = parse_data_url(url)
- if not media_type:
+ mimetype, _encoding = mimetypes.guess_type(url.toString())
+ if not mimetype:
return fallback
- ext = mimetypes.guess_extension(media_type, strict=False) or ''
+ ext = mimetypes.guess_extension(mimetype, strict=False) or ''
return 'download' + ext
pathname = posixpath.basename(url.path())
diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py
index 3372c67a1..fda4e45d9 100644
--- a/tests/unit/utils/test_urlutils.py
+++ b/tests/unit/utils/test_urlutils.py
@@ -545,105 +545,6 @@ def test_raise_cmdexc_if_invalid(url, valid, has_err_string):
urlutils.raise_cmdexc_if_invalid(qurl)
-# Test cases inspired by scrapy's w3lib:
-# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739
-@pytest.mark.parametrize('url, media_type, params, data', [
- # Basic test
- (
- "data:,A%20brief%20note",
- "text/plain",
- "", # we don't default to "charset=US-ASCII" because params are ignored anyways
- b"A brief note",
- ),
- # Unicode URL
- (
- "data:,é",
- "text/plain",
- "",
- "é".encode("utf-8"),
- ),
- # Default media type
- (
- "data:;charset=iso-8859-7,%be%d3%be",
- "text/plain",
- "charset=iso-8859-7",
- b"\xbe\xd3\xbe",
- ),
- # Text with charset
- (
- "data:text/plain;charset=iso-8859-7,%be%d3%be",
- "text/plain",
- "charset=iso-8859-7",
- b"\xbe\xd3\xbe",
- ),
- # base64
- (
- "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- # base64 with spaces
- (
- "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- (
- "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- # case-insensitive scheme
- (
- "DATA:,A%20brief%20note",
- "text/plain",
- "",
- b"A brief note",
- ),
- # wrong base64 param - should be invalid but we don't parse the params...
- (
- "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D",
- "text/plain",
- "baes64",
- b"SGVsbG8sIHdvcmxkLg==",
- ),
- # custom media type
- (
- "data:application/pdf,",
- "application/pdf",
- "",
- b"",
- )
-])
-def test_parse_data_url_valid(url, media_type, params, data):
- assert isinstance(data, bytes)
- assert urlutils.parse_data_url(QUrl(url)) == (media_type, params, data)
-
-
-@pytest.mark.parametrize('url', [
- QUrl(), # invalid URL
- QUrl('https://example.org/'), # no data: scheme
- QUrl('data:A%20brief%20note'), # missing comma
-])
-def test_parse_data_url_invalid(url):
- with pytest.raises(urlutils.Error):
- urlutils.parse_data_url(url)
-
-
-@hypothesis.given(s=hypothesis.strategies.text())
-def test_parse_data_url_hypothesis(s):
- url = QUrl('data:' + s)
- hypothesis.assume(url.isValid())
-
- try:
- urlutils.parse_data_url(url)
- except urlutils.Error:
- pass
-
-
@pytest.mark.parametrize('qurl, output', [
(QUrl(), None),
(QUrl('http://qutebrowser.org/test.html'), 'test.html'),