diff options
author | Florian Bruhin <me@the-compiler.org> | 2021-01-04 21:28:56 +0100 |
---|---|---|
committer | Florian Bruhin <me@the-compiler.org> | 2021-01-05 11:19:15 +0100 |
commit | 40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170 (patch) | |
tree | d87fefb806c805ab4e50096d388a94edfca992ac | |
parent | f0486432d13cc9563a4866c6cf96af98d0862035 (diff) | |
download | qutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.tar.gz qutebrowser-40464ebe3df06d88e2f4dc1ffcd7fb8df9e77170.zip |
Remove custom data: URL parsing
Argh. I should've known Python can do this, but I didn't notice (or
forgot?) and wrote code - some 140 lines of it - which was only in this
repository for a couple of hours.
Oh well, good riddance, code which was replaced by a simple
"mimetypes.guess_type(url.toString())". May you be resurrected if we
ever need a proper data: URL parser at a later point.
I guess some lessons have to be learned the hard way...
-rw-r--r-- | qutebrowser/utils/urlutils.py | 52 | ||||
-rw-r--r-- | tests/unit/utils/test_urlutils.py | 99 |
2 files changed, 3 insertions, 148 deletions
diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py index fa7867d4d..b5afe958c 100644 --- a/qutebrowser/utils/urlutils.py +++ b/qutebrowser/utils/urlutils.py @@ -394,52 +394,6 @@ def get_path_if_valid(pathstr: str, return path -def parse_data_url(url: QUrl) -> Tuple[str, str, bytes]: - """Parse a data URL. - - Returns a tuple with: - 1) The media type - 2) Media type parameters (currently without any further parsing) - 3) The (possibly decoded) data - - Based on https://en.wikipedia.org/wiki/Data_URI_scheme - - Possible further inspiration: - https://github.com/scrapy/w3lib/blob/v1.22.0/w3lib/url.py#L324-L384 - """ - ensure_valid(url) - if url.scheme().lower() != 'data': - raise Error(f"URL {url.toDisplayString()} has no data: scheme") - if ',' not in url.path(): - raise Error("Missing comma") - - encoded = url.toEncoded().data() - encoded = encoded[len('data:'):] # strip off scheme - encoded = urllib.parse.unquote_to_bytes(encoded) - encoded_header, data = encoded.split(b',', 1) - - try: - header = encoded_header.decode('ascii') - except UnicodeDecodeError as e: - raise Error(f"Invalid header in {url.toDisplayString()}: {e}") - - b64_suffix = ';base64' - if header.endswith(b64_suffix): - header = header[:-len(b64_suffix)] - data = base64.b64decode(data) - - if ';' in header: - media_type, params = header.split(';', 1) - else: - media_type = header - params = '' - - if not media_type: - media_type = 'text/plain' - - return media_type, params, data - - def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]: """Get a suitable filename from a URL. 
@@ -454,11 +408,11 @@ def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]: return fallback if url.scheme().lower() == 'data': - media_type, _params, _data = parse_data_url(url) - if not media_type: + mimetype, _encoding = mimetypes.guess_type(url.toString()) + if not mimetype: return fallback - ext = mimetypes.guess_extension(media_type, strict=False) or '' + ext = mimetypes.guess_extension(mimetype, strict=False) or '' return 'download' + ext pathname = posixpath.basename(url.path()) diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py index 3372c67a1..fda4e45d9 100644 --- a/tests/unit/utils/test_urlutils.py +++ b/tests/unit/utils/test_urlutils.py @@ -545,105 +545,6 @@ def test_raise_cmdexc_if_invalid(url, valid, has_err_string): urlutils.raise_cmdexc_if_invalid(qurl) -# Test cases inspired by scrapy's w3lib: -# https://github.com/scrapy/w3lib/blob/v1.22.0/tests/test_url.py#L654-L739 -@pytest.mark.parametrize('url, media_type, params, data', [ - # Basic test - ( - "data:,A%20brief%20note", - "text/plain", - "", # we don't default to "charset=US-ASCII" because params are ignored anyways - b"A brief note", - ), - # Unicode URL - ( - "data:,é", - "text/plain", - "", - "é".encode("utf-8"), - ), - # Default media type - ( - "data:;charset=iso-8859-7,%be%d3%be", - "text/plain", - "charset=iso-8859-7", - b"\xbe\xd3\xbe", - ), - # Text with charset - ( - "data:text/plain;charset=iso-8859-7,%be%d3%be", - "text/plain", - "charset=iso-8859-7", - b"\xbe\xd3\xbe", - ), - # base64 - ( - "data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D", - "text/plain", - "", - b"Hello, world.", - ), - # base64 with spaces - ( - "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20dvcm%20%20%20xk%20Lg%3D%0A%3D", - "text/plain", - "", - b"Hello, world.", - ), - ( - "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D", - "text/plain", - "", - b"Hello, world.", - ), - # case-insensitive scheme - ( - "DATA:,A%20brief%20note", - "text/plain", - "", - 
b"A brief note", - ), - # wrong base64 param - should be invalid but we don't parse the params... - ( - "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D", - "text/plain", - "baes64", - b"SGVsbG8sIHdvcmxkLg==", - ), - # custom media type - ( - "data:application/pdf,", - "application/pdf", - "", - b"", - ) -]) -def test_parse_data_url_valid(url, media_type, params, data): - assert isinstance(data, bytes) - assert urlutils.parse_data_url(QUrl(url)) == (media_type, params, data) - - -@pytest.mark.parametrize('url', [ - QUrl(), # invalid URL - QUrl('https://example.org/'), # no data: scheme - QUrl('data:A%20brief%20note'), # missing comma -]) -def test_parse_data_url_invalid(url): - with pytest.raises(urlutils.Error): - urlutils.parse_data_url(url) - - -@hypothesis.given(s=hypothesis.strategies.text()) -def test_parse_data_url_hypothesis(s): - url = QUrl('data:' + s) - hypothesis.assume(url.isValid()) - - try: - urlutils.parse_data_url(url) - except urlutils.Error: - pass - - @pytest.mark.parametrize('qurl, output', [ (QUrl(), None), (QUrl('http://qutebrowser.org/test.html'), 'test.html'), |