From 1a5b95a745b0317923894d5a7383c5f2fec2fd4b Mon Sep 17 00:00:00 2001
From: Florian Bruhin <me@the-compiler.org>
Date: Mon, 4 Jan 2021 19:16:03 +0100
Subject: urlutils: Add basic parsing of data: URLs

---
 qutebrowser/utils/urlutils.py     | 46 ++++++++++++++++++
 tests/unit/utils/test_urlutils.py | 99 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)

diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py
index 41d20e734..3f1fb021c 100644
--- a/qutebrowser/utils/urlutils.py
+++ b/qutebrowser/utils/urlutils.py
@@ -393,6 +393,52 @@ def get_path_if_valid(pathstr: str,
     return path
 
 
+def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]:
+    """Parse a data URL.
+
+    Returns a tuple with:
+    1) The media type (text/plain if the URL does not specify one)
+    2) Media type parameters (currently without any further parsing)
+    3) The data (percent-decoded, plus base64-decoded for ";base64" URLs)
+
+    Raises Error for a non-data: URL, a missing comma, a non-ASCII header
+    or undecodable base64 data (after ensure_valid() accepted the URL).
+
+    Based on https://en.wikipedia.org/wiki/Data_URI_scheme
+    Possible further inspiration:
+    https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384
+    """
+    ensure_valid(url)
+    if url.scheme().lower() != 'data':
+        raise Error(f"URL {url.toDisplayString()} has no data: scheme")
+    if ',' not in url.path():
+        raise Error("Missing comma")
+
+    encoded = url.toEncoded().data()[len('data:'):]  # strip off scheme
+    # NOTE: percent-decoding happens before the split, so an escaped comma
+    # (%2C) is treated like a literal one when looking for the separator.
+    header_bytes, data = urllib.parse.unquote_to_bytes(encoded).split(b',', 1)
+    try:
+        header = header_bytes.decode('ascii')
+    except UnicodeDecodeError as e:
+        raise Error(f"Invalid header in {url.toDisplayString()}: {e}") from e
+
+    b64_suffix = ';base64'
+    if header.endswith(b64_suffix):
+        header = header[:-len(b64_suffix)]
+        try:
+            data = base64.b64decode(data)
+        except ValueError as e:  # binascii.Error is a ValueError subclass
+            msg = f"Invalid base64 in {url.toDisplayString()}: {e}"
+            raise Error(msg) from e
+
+    # Everything after the first ";" is kept as one unparsed parameter string
+    media_type, _sep, params = header.partition(';')
+    if not media_type:
+        media_type = 'text/plain'
+    return media_type, params, data
+
+
 def filename_from_url(url: QUrl) -> Optional[str]:
     """Get a suitable filename from a URL.
 
diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py
index 0167f6cee..c2ff0042f 100644
--- a/tests/unit/utils/test_urlutils.py
+++ b/tests/unit/utils/test_urlutils.py
@@ -545,6 +545,105 @@ def test_raise_cmdexc_if_invalid(url, valid, has_err_string):
             urlutils.raise_cmdexc_if_invalid(qurl)
 
 
+# Test cases inspired by scrapy's w3lib:
+# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739
+@pytest.mark.parametrize('url, media_type, params, data', [
+    # Basic test
+    (
+        "data:,A%20brief%20note",
+        "text/plain",
+        "",  # we don't default to "charset=US-ASCII" because params are ignored anyway
+        b"A brief note",
+    ),
+    # Unicode URL
+    (
+        "data:,é",
+        "text/plain",
+        "",
+        "é".encode("utf-8"),
+    ),
+    # Default media type (an empty media type falls back to text/plain)
+    (
+        "data:;charset=iso-8859-7,%be%d3%be",
+        "text/plain",
+        "charset=iso-8859-7",
+        b"\xbe\xd3\xbe",
+    ),
+    # Text with charset
+    (
+        "data:text/plain;charset=iso-8859-7,%be%d3%be",
+        "text/plain",
+        "charset=iso-8859-7",
+        b"\xbe\xd3\xbe",
+    ),
+    # base64
+    (
+        "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D",
+        "text/plain",
+        "",
+        b"Hello, world.",
+    ),
+    # base64 with spaces (percent-encoded whitespace inside the payload)
+    (
+        "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D",
+        "text/plain",
+        "",
+        b"Hello, world.",
+    ),
+    (  # same payload, but with literal (unescaped) whitespace in the URL
+        "data:text/plain;base64,SGVsb G8sIH\n  dvcm   xk Lg%3D\n%3D",
+        "text/plain",
+        "",
+        b"Hello, world.",
+    ),
+    # case-insensitive scheme
+    (
+        "DATA:,A%20brief%20note",
+        "text/plain",
+        "",
+        b"A brief note",
+    ),
+    # wrong base64 param - should be invalid but we don't parse the params...
+    (
+        "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D",
+        "text/plain",
+        "baes64",
+        b"SGVsbG8sIHdvcmxkLg==",
+    ),
+    # custom media type
+    (
+        "data:application/pdf,",
+        "application/pdf",
+        "",
+        b"",
+    )
+])
+def test_parse_data_url_valid(url, media_type, params, data):
+    assert isinstance(data, bytes)  # sanity check for the test data above
+    assert urlutils.parse_data_url(QUrl(url)) == (media_type, params, data)
+
+
+@pytest.mark.parametrize('url', [
+    QUrl(),  # not a valid URL at all
+    QUrl('https://example.org/'),  # valid, but not a data: URL
+    QUrl('data:A%20brief%20note'),  # no comma between header and data
+])
+def test_parse_data_url_invalid(url):
+    """Unparseable input must be reported as urlutils.Error."""
+    pytest.raises(urlutils.Error, urlutils.parse_data_url, url)
+
+
+@hypothesis.given(s=hypothesis.strategies.text())
+def test_parse_data_url_hypothesis(s):
+    """Fuzz parse_data_url: the only exception allowed is urlutils.Error."""
+    url = QUrl(f'data:{s}')
+    hypothesis.assume(url.isValid())
+    try:
+        urlutils.parse_data_url(url)
+    except urlutils.Error:
+        pass  # rejecting the input is fine; anything else fails the test
+
+
 @pytest.mark.parametrize('qurl, output', [
     (QUrl(), None),
     (QUrl('http://qutebrowser.org/test.html'), 'test.html'),
-- 
cgit v1.2.3-54-g00ecf