def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]:
    """Parse a data URL.

    Returns a tuple with:
        1) The media type
        2) Media type parameters (currently without any further parsing)
        3) The (possibly decoded) data

    Based on https://en.wikipedia.org/wiki/Data_URI_scheme

    Possible further inspiration:
    https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384

    Args:
        url: The data: URL to parse.

    Raises:
        Error: If the URL is invalid, does not use the data: scheme, has no
               comma separating the header from the data, has a non-ASCII
               header, or carries a malformed base64 payload.
    """
    ensure_valid(url)
    if url.scheme().lower() != 'data':
        raise Error(f"URL {url.toDisplayString()} has no data: scheme")
    if ',' not in url.path():
        raise Error("Missing comma")

    encoded = url.toEncoded().data()
    encoded = encoded[len('data:'):]  # strip off scheme
    # Percent-decoding happens before splitting, so the header/data boundary
    # is the first comma of the *decoded* URL.
    encoded = urllib.parse.unquote_to_bytes(encoded)
    encoded_header, data = encoded.split(b',', 1)

    try:
        header = encoded_header.decode('ascii')
    except UnicodeDecodeError as e:
        raise Error(f"Invalid header in {url.toDisplayString()}: {e}") from e

    b64_suffix = ';base64'
    if header.endswith(b64_suffix):
        header = header[:-len(b64_suffix)]
        try:
            data = base64.b64decode(data)
        except ValueError as e:
            # base64.b64decode signals bad padding / malformed input with
            # binascii.Error, which is a ValueError subclass. Convert it so
            # callers only ever need to handle Error.
            raise Error(
                f"Invalid base64 data in {url.toDisplayString()}: {e}") from e

    # partition() yields an empty params string when there is no ';'.
    media_type, _sep, params = header.partition(';')

    if not media_type:
        # An omitted media type defaults to text/plain.
        media_type = 'text/plain'

    return media_type, params, data
# Test cases inspired by scrapy's w3lib:
# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739
@pytest.mark.parametrize('url, media_type, params, data', [
    # Basic test
    # (we don't default to "charset=US-ASCII" because params are ignored
    # anyways)
    ("data:,A%20brief%20note", "text/plain", "", b"A brief note"),
    # Unicode URL
    ("data:,é", "text/plain", "", "é".encode("utf-8")),
    # Default media type
    (
        "data:;charset=iso-8859-7,%be%d3%be",
        "text/plain",
        "charset=iso-8859-7",
        b"\xbe\xd3\xbe",
    ),
    # Text with charset
    (
        "data:text/plain;charset=iso-8859-7,%be%d3%be",
        "text/plain",
        "charset=iso-8859-7",
        b"\xbe\xd3\xbe",
    ),
    # base64
    (
        "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D",
        "text/plain",
        "",
        b"Hello, world.",
    ),
    # base64 with spaces
    (
        "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D",
        "text/plain",
        "",
        b"Hello, world.",
    ),
    (
        "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D",
        "text/plain",
        "",
        b"Hello, world.",
    ),
    # case-insensitive scheme
    ("DATA:,A%20brief%20note", "text/plain", "", b"A brief note"),
    # wrong base64 param - should be invalid but we don't parse the params...
    (
        "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D",
        "text/plain",
        "baes64",
        b"SGVsbG8sIHdvcmxkLg==",
    ),
    # custom media type
    ("data:application/pdf,", "application/pdf", "", b""),
])
def test_parse_data_url_valid(url, media_type, params, data):
    """Check that well-formed data: URLs parse into the expected triple."""
    # Guard against a test-data typo: the payload must always be bytes.
    assert isinstance(data, bytes)
    expected = (media_type, params, data)
    parsed = urlutils.parse_data_url(QUrl(url))
    assert parsed == expected


@pytest.mark.parametrize('url', [
    QUrl(),  # invalid URL
    QUrl('https://example.org/'),  # no data: scheme
    QUrl('data:A%20brief%20note'),  # missing comma
])
def test_parse_data_url_invalid(url):
    """Check that unparseable inputs are rejected with urlutils.Error."""
    with pytest.raises(urlutils.Error):
        urlutils.parse_data_url(url)


@hypothesis.given(s=hypothesis.strategies.text())
def test_parse_data_url_hypothesis(s):
    """Fuzz the parser: only urlutils.Error may escape for any valid QUrl."""
    candidate = QUrl('data:' + s)
    hypothesis.assume(candidate.isValid())

    try:
        urlutils.parse_data_url(candidate)
    except urlutils.Error:
        # A clean rejection is fine; any other exception fails the test.
        pass