author     Florian Bruhin <me@the-compiler.org>  2021-01-04 19:16:03 +0100
committer  Florian Bruhin <me@the-compiler.org>  2021-01-04 19:49:03 +0100
commit     1a5b95a745b0317923894d5a7383c5f2fec2fd4b (patch)
tree       df7981703506f2d5c9b8dc3082815aecc9ca799e
parent     e6ae8797e71a678bef97a13b9057e29442e0ef48 (diff)
urlutils: Add basic parsing of data: URLs
-rw-r--r--  qutebrowser/utils/urlutils.py      | 46
-rw-r--r--  tests/unit/utils/test_urlutils.py  | 99
2 files changed, 145 insertions(+), 0 deletions(-)
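
As an illustration only (this commit adds no caller for the new helper), parse_data_url() could be used roughly like this, assuming the usual PyQt5 QUrl import and qutebrowser on the import path:

    from PyQt5.QtCore import QUrl
    from qutebrowser.utils import urlutils

    # "SGVsbG8sIHdvcmxkLg==" is base64 for b"Hello, world." (see the tests below).
    url = QUrl("data:text/plain;base64,SGVsbG8sIHdvcmxkLg==")
    media_type, params, data = urlutils.parse_data_url(url)
    # media_type == "text/plain", params == "", data == b"Hello, world."
    # Malformed input (missing comma, non-data: scheme, ...) raises urlutils.Error.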
diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py
index 41d20e734..3f1fb021c 100644
--- a/qutebrowser/utils/urlutils.py
+++ b/qutebrowser/utils/urlutils.py
@@ -393,6 +393,52 @@ def get_path_if_valid(pathstr: str,
return path
+def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]:
+ """Parse a data URL.
+
+ Returns a tuple with:
+ 1) The media type
+ 2) Media type parameters (currently without any further parsing)
+ 3) The (possibly decoded) data
+
+ Based on https://en.wikipedia.org/wiki/Data_URI_scheme
+
+ Possible further inspiration:
+ https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384
+ """
+ ensure_valid(url)
+ if url.scheme().lower() != 'data':
+ raise Error(f"URL {url.toDisplayString()} has no data: scheme")
+ if ',' not in url.path():
+ raise Error("Missing comma")
+
+ encoded = url.toEncoded().data()
+ encoded = encoded[len('data:'):] # strip off scheme
+ encoded = urllib.parse.unquote_to_bytes(encoded)
+ encoded_header, data = encoded.split(b',', 1)
+
+ try:
+ header = encoded_header.decode('ascii')
+ except UnicodeDecodeError as e:
+ raise Error(f"Invalid header in {url.toDisplayString()}: {e}")
+
+ b64_suffix = ';base64'
+ if header.endswith(b64_suffix):
+ header = header[:-len(b64_suffix)]
+ data = base64.b64decode(data)
+
+ if ';' in header:
+ media_type, params = header.split(';', 1)
+ else:
+ media_type = header
+ params = ''
+
+ if not media_type:
+ media_type = 'text/plain'
+
+ return media_type, params, data
+
+
def filename_from_url(url: QUrl) -> Optional[str]:
"""Get a suitable filename from a URL.
diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py
index 0167f6cee..c2ff0042f 100644
--- a/tests/unit/utils/test_urlutils.py
+++ b/tests/unit/utils/test_urlutils.py
@@ -545,6 +545,105 @@ def test_raise_cmdexc_if_invalid(url, valid, has_err_string):
urlutils.raise_cmdexc_if_invalid(qurl)
+# Test cases inspired by scrapy's w3lib:
+# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739
+@pytest.mark.parametrize('url, media_type, params, data', [
+    # Basic test
+    (
+        "data:,A%20brief%20note",
+        "text/plain",
+ "", # we don't default to "charset=US-ASCII" because params are ignored anyways
+ b"A brief note",
+ ),
+ # Unicode URL
+ (
+ "data:,é",
+ "text/plain",
+ "",
+ "é".encode("utf-8"),
+ ),
+ # Default media type
+ (
+ "data:;charset=iso-8859-7,%be%d3%be",
+ "text/plain",
+ "charset=iso-8859-7",
+ b"\xbe\xd3\xbe",
+ ),
+ # Text with charset
+ (
+ "data:text/plain;charset=iso-8859-7,%be%d3%be",
+ "text/plain",
+ "charset=iso-8859-7",
+ b"\xbe\xd3\xbe",
+ ),
+ # base64
+ (
+ "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D",
+ "text/plain",
+ "",
+ b"Hello, world.",
+ ),
+ # base64 with spaces
+ (
+ "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D",
+ "text/plain",
+ "",
+ b"Hello, world.",
+ ),
+ (
+ "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D",
+ "text/plain",
+ "",
+ b"Hello, world.",
+ ),
+ # case-insensitive scheme
+ (
+ "DATA:,A%20brief%20note",
+ "text/plain",
+ "",
+ b"A brief note",
+ ),
+ # wrong base64 param - should be invalid but we don't parse the params...
+ (
+ "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D",
+ "text/plain",
+ "baes64",
+ b"SGVsbG8sIHdvcmxkLg==",
+ ),
+ # custom media type
+ (
+ "data:application/pdf,",
+ "application/pdf",
+ "",
+ b"",
+ )
+])
+def test_parse_data_url_valid(url, media_type, params, data):
+ assert isinstance(data, bytes)
+ assert urlutils.parse_data_url(QUrl(url)) == (media_type, params, data)
+
+
+@pytest.mark.parametrize('url', [
+ QUrl(), # invalid URL
+ QUrl('https://example.org/'), # no data: scheme
+ QUrl('data:A%20brief%20note'), # missing comma
+])
+def test_parse_data_url_invalid(url):
+ with pytest.raises(urlutils.Error):
+ urlutils.parse_data_url(url)
+
+
+@hypothesis.given(s=hypothesis.strategies.text())
+def test_parse_data_url_hypothesis(s):
+ url = QUrl('data:' + s)
+ hypothesis.assume(url.isValid())
+
+ try:
+ urlutils.parse_data_url(url)
+ except urlutils.Error:
+ pass
+
+
@pytest.mark.parametrize('qurl, output', [
(QUrl(), None),
(QUrl('http://qutebrowser.org/test.html'), 'test.html'),