summaryrefslogtreecommitdiff
path: root/qutebrowser/browser/webkit/httpheaders.py
diff options
context:
space:
mode:
Diffstat (limited to 'qutebrowser/browser/webkit/httpheaders.py')
-rw-r--r--qutebrowser/browser/webkit/httpheaders.py187
1 files changed, 187 insertions, 0 deletions
diff --git a/qutebrowser/browser/webkit/httpheaders.py b/qutebrowser/browser/webkit/httpheaders.py
new file mode 100644
index 000000000..95b7b7104
--- /dev/null
+++ b/qutebrowser/browser/webkit/httpheaders.py
@@ -0,0 +1,187 @@
+# SPDX-FileCopyrightText: Florian Bruhin (The Compiler) <mail@qutebrowser.org>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Parsing functions for various HTTP headers."""
+
+import email.headerregistry
+import email.errors
+import dataclasses
+import os.path
+from typing import Type
+
+from qutebrowser.qt.network import QNetworkRequest
+
+from qutebrowser.utils import log, utils
+
+
class ContentDispositionError(Exception):

    """Root exception for Content-Disposition (RFC 6266) parsing problems."""
+
+
@dataclasses.dataclass
class DefectWrapper:

    """Comparison helper which matches email defects by class and line.

    An instance compares equal to any object that is an instance of
    error_class and whose "line" attribute equals the stored line.
    """

    error_class: Type[email.errors.MessageDefect]
    line: str

    def __eq__(self, other):
        # Anything which isn't the wrapped defect class can never match.
        if not isinstance(other, self.error_class):
            return False
        return other.line == self.line  # type: ignore[attr-defined]
+
+
class ContentDisposition:

    """Records various indications and hints about content disposition.

    These can be used to know if a file should be downloaded or
    displayed directly, and to hint what filename it should have
    in the download case.

    Attributes:
        disposition: The disposition type as parsed from the header.
        params: The parsed header parameters (e.g. 'filename').
    """

    # Ignoring this defect fixes the attfnboth2 test case. It does *not* fix attfnboth
    # one which has a slightly different wording ("duplicate(s) ignored" instead of
    # "duplicate ignored"), because even if we did ignore that one, it still wouldn't
    # work properly...
    _IGNORED_DEFECT = DefectWrapper(
        email.errors.InvalidHeaderDefect,
        'duplicate parameter name; duplicate ignored'
    )

    def __init__(self, disposition, params):
        """Used internally after parsing the header."""
        self.disposition = disposition
        self.params = params
        assert 'filename*' not in self.params  # Handled by headerregistry

    @classmethod
    def parse(cls, value):
        """Build a ContentDisposition from a raw header value.

        Args:
            value: The raw Content-Disposition header value as bytes.

        Return:
            A new ContentDisposition instance.

        Raise:
            ContentDispositionError: If the header can't be parsed or the parser
                                     reports defects other than the ignored one.
        """
        # We allow non-ascii here (it will only be parsed inside of qdtext, and
        # rejected by the grammar if it appears in other places), although parsing
        # it can be ambiguous. Parsing it ensures that a non-ambiguous filename*
        # value won't get dismissed because of an unrelated ambiguity in the
        # filename parameter. But it does mean we occasionally give
        # less-than-certain values for some legacy senders.
        decoded = value.decode('iso-8859-1')

        reg = email.headerregistry.HeaderRegistry()
        # The original parser exception is chained explicitly ("from e") so that
        # the underlying cause stays visible in tracebacks.
        try:
            parsed = reg('Content-Disposition', decoded)
        except IndexError as e:  # pragma: no cover
            # WORKAROUND for https://github.com/python/cpython/issues/81672
            # Fixed in Python 3.7.5 and 3.8.0.
            # Still getting failures on 3.10 on CI though
            raise ContentDispositionError(
                "Missing closing quote character") from e
        except ValueError as e:
            # WORKAROUND for https://github.com/python/cpython/issues/87112
            raise ContentDispositionError("Non-ASCII digit") from e
        except AttributeError as e:  # pragma: no cover
            # WORKAROUND for https://github.com/python/cpython/issues/93010
            raise ContentDispositionError(
                "Section number has an invalid leading 0") from e

        if parsed.defects:
            defects = list(parsed.defects)
            # Only a lone "duplicate parameter" defect is tolerated.
            if defects != [cls._IGNORED_DEFECT]:
                raise ContentDispositionError(defects)

        # https://github.com/python/mypy/issues/12314
        assert isinstance(
            parsed,  # type: ignore[unreachable]
            email.headerregistry.ContentDispositionHeader,
        ), parsed
        return cls(  # type: ignore[unreachable]
            disposition=parsed.content_disposition,
            params=parsed.params,
        )

    def filename(self):
        """The filename from the Content-Disposition header or None.

        On safety:

        This value records the intent of the sender.

        You shouldn't use this sender-controlled value as a filesystem path, it
        can be insecure. Serving files with this filename can be dangerous as
        well, due to a certain browser using the part after the dot for
        mime-sniffing. Saving it to a database is fine by itself though.
        """
        return self.params.get('filename')

    def is_inline(self):
        """Return if the file should be handled inline.

        If not, and unless your application supports other dispositions
        than the standard inline and attachment, it should be handled
        as an attachment.
        """
        return self.disposition in {None, 'inline'}

    def __repr__(self):
        return utils.get_repr(self, constructor=True,
                              disposition=self.disposition, params=self.params)
+
+
def parse_content_disposition(reply):
    """Parse a content_disposition header.

    The filename is taken from the first source yielding a non-empty basename:
    the Content-Disposition header, then the URL path, then a fixed fallback.

    Args:
        reply: The QNetworkReply to get a filename for.

    Return:
        A (is_inline, filename) tuple. The filename is never empty.
    """
    is_inline = True
    filename = None
    content_disposition_header = b'Content-Disposition'
    # First check if the Content-Disposition header has a filename
    # attribute.
    if reply.hasRawHeader(content_disposition_header):
        # We use the unsafe variant of the filename as we sanitize it via
        # os.path.basename later.
        try:
            value = bytes(reply.rawHeader(content_disposition_header))
            log.network.debug(f"Parsing Content-Disposition: {value!r}")
            content_disposition = ContentDisposition.parse(value)
            filename = content_disposition.filename()
        except ContentDispositionError as e:
            log.network.error(f"Error while parsing filename: {e}")
        else:
            is_inline = content_disposition.is_inline()
    # Sanitize *before* checking for emptiness: a header filename ending in a
    # path separator (e.g. "foo/") has an empty basename, and we want to fall
    # back to the URL/default in that case rather than return ''.
    if filename:
        filename = os.path.basename(filename)
    # Then try to get filename from url
    if not filename:
        filename = os.path.basename(reply.url().path().rstrip('/'))
    # If that fails as well, use a fallback
    if not filename:
        filename = 'qutebrowser-download'
    return is_inline, filename
+
+
def parse_content_type(reply):
    """Extract the mimetype from a reply's Content-Type header.

    This is intentionally a cheap parse: only the mimetype is separated from
    whatever follows the first ';'. Parameters are returned unparsed.

    Args:
        reply: The QNetworkReply to handle.

    Return:
        A [mimetype, rest] list, or [None, None] if unset.
        Rest can be None.
    """
    header = reply.header(QNetworkRequest.KnownHeaders.ContentTypeHeader)
    if header is None:
        return [None, None]
    # Split on the first ';' only; "rest" stays None if there is no ';' at all.
    mimetype, separator, remainder = header.partition(';')
    return [mimetype.strip(), remainder if separator else None]