summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFlorian Bruhin <me@the-compiler.org>2021-01-04 21:28:56 +0100
committerFlorian Bruhin <me@the-compiler.org>2021-01-05 11:19:15 +0100
commit40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170 (patch)
treed87fefb806c805ab4e50096d388a94edfca992ac
parentf0486432d13cc9563a4866c6cf96af98d0862035 (diff)
downloadqutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.tar.gz
qutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.zip
Remove custom data: URL parsing
Argh. I should've known Python can do this, but I didn't notice (or forgot?) and wrote code - some 140 lines of it - which was only in this repository for a couple of hours. Oh well, good riddance, code which was replaced by a simple "mimetypes.guess_type(url.toString())". May you be resurrected if we ever need a proper data: URL parser at a later point. I guess some lessons have to be learned the hard way...
-rw-r--r--qutebrowser/utils/urlutils.py52
-rw-r--r--tests/unit/utils/test_urlutils.py99
2 files changed, 3 insertions, 148 deletions
diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py
index fa7867d4d..b5afe958c 100644
--- a/qutebrowser/utils/urlutils.py
+++ b/qutebrowser/utils/urlutils.py
@@ -394,52 +394,6 @@ def get_path_if_valid(pathstr: str,
return path
-def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]:
- """Parse a data URL.
-
- Returns a tuple with:
- 1) The media type
- 2) Media type parameters (currently without any further parsing)
- 3) The (possibly decoded) data
-
- Based on https://en.wikipedia.org/wiki/Data_URI_scheme
-
- Possible further inspiration:
- https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384
- """
- ensure_valid(url)
- if url.scheme().lower() != 'data':
- raise Error(f"URL {url.toDisplayString()} has no data: scheme")
- if ',' not in url.path():
- raise Error("Missing comma")
-
- encoded = url.toEncoded().data()
- encoded = encoded[len('data:'):] # strip off scheme
- encoded = urllib.parse.unquote_to_bytes(encoded)
- encoded_header, data = encoded.split(b',', 1)
-
- try:
- header = encoded_header.decode('ascii')
- except UnicodeDecodeError as e:
- raise Error(f"Invalid header in {url.toDisplayString()}: {e}")
-
- b64_suffix = ';base64'
- if header.endswith(b64_suffix):
- header = header[:-len(b64_suffix)]
- data = base64.b64decode(data)
-
- if ';' in header:
- media_type, params = header.split(';', 1)
- else:
- media_type = header
- params = ''
-
- if not media_type:
- media_type = 'text/plain'
-
- return media_type, params, data
-
-
def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]:
"""Get a suitable filename from a URL.
@@ -454,11 +408,11 @@ def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]:
return fallback
if url.scheme().lower() == 'data':
- media_type, _params, _data = parse_data_url(url)
- if not media_type:
+ mimetype, _encoding = mimetypes.guess_type(url.toString())
+ if not mimetype:
return fallback
- ext = mimetypes.guess_extension(media_type, strict=False) or ''
+ ext = mimetypes.guess_extension(mimetype, strict=False) or ''
return 'download' + ext
pathname = posixpath.basename(url.path())
diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py
index 3372c67a1..fda4e45d9 100644
--- a/tests/unit/utils/test_urlutils.py
+++ b/tests/unit/utils/test_urlutils.py
@@ -545,105 +545,6 @@ def test_raise_cmdexc_if_invalid(url, valid, has_err_string):
urlutils.raise_cmdexc_if_invalid(qurl)
-# Test cases inspired by scrapy's w3lib:
-# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739
-@pytest.mark.parametrize('url, media_type, params, data', [
- # Basic test
- (
- "data:,A%20brief%20note",
- "text/plain",
- "", # we don't default to "charset=US-ASCII" because params are ignored anyways
- b"A brief note",
- ),
- # Unicode URL
- (
- "data:,é",
- "text/plain",
- "",
- "é".encode("utf-8"),
- ),
- # Default media type
- (
- "data:;charset=iso-8859-7,%be%d3%be",
- "text/plain",
- "charset=iso-8859-7",
- b"\xbe\xd3\xbe",
- ),
- # Text with charset
- (
- "data:text/plain;charset=iso-8859-7,%be%d3%be",
- "text/plain",
- "charset=iso-8859-7",
- b"\xbe\xd3\xbe",
- ),
- # base64
- (
- "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- # base64 with spaces
- (
- "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- (
- "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D",
- "text/plain",
- "",
- b"Hello, world.",
- ),
- # case-insensitive scheme
- (
- "DATA:,A%20brief%20note",
- "text/plain",
- "",
- b"A brief note",
- ),
- # wrong base64 param - should be invalid but we don't parse the params...
- (
- "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D",
- "text/plain",
- "baes64",
- b"SGVsbG8sIHdvcmxkLg==",
- ),
- # custom media type
- (
- "data:application/pdf,",
- "application/pdf",
- "",
- b"",
- )
-])
-def test_parse_data_url_valid(url, media_type, params, data):
- assert isinstance(data, bytes)
- assert urlutils.parse_data_url(QUrl(url)) == (media_type, params, data)
-
-
-@pytest.mark.parametrize('url', [
- QUrl(), # invalid URL
- QUrl('https://example.org/'), # no data: scheme
- QUrl('data:A%20brief%20note'), # missing comma
-])
-def test_parse_data_url_invalid(url):
- with pytest.raises(urlutils.Error):
- urlutils.parse_data_url(url)
-
-
-@hypothesis.given(s=hypothesis.strategies.text())
-def test_parse_data_url_hypothesis(s):
- url = QUrl('data:' + s)
- hypothesis.assume(url.isValid())
-
- try:
- urlutils.parse_data_url(url)
- except urlutils.Error:
- pass
-
-
@pytest.mark.parametrize('qurl, output', [
(QUrl(), None),
(QUrl('http://qutebrowser.org/test.html'), 'test.html'),