1 files changed, 187 insertions, 0 deletions
diff --git a/qutebrowser/browser/webkit/httpheaders.py b/qutebrowser/browser/webkit/httpheaders.py
new file mode 100644
index 000000000..95b7b7104
--- /dev/null
+++ b/qutebrowser/browser/webkit/httpheaders.py
@@ -0,0 +1,187 @@
+# SPDX-FileCopyrightText: Florian Bruhin (The Compiler) <mail@qutebrowser.org>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Parsing functions for various HTTP headers."""
+
+import email.headerregistry
+import email.errors
+import dataclasses
+import os.path
+from typing import Type
+
+from qutebrowser.qt.network import QNetworkRequest
+
+from qutebrowser.utils import log, utils
+
+
+class ContentDispositionError(Exception):
+
+    """Base class for RFC6266 errors."""
+
+
+@dataclasses.dataclass
+class DefectWrapper:
+
+    """Wrapper around a email.error for comparison."""
+
+    error_class: Type[email.errors.MessageDefect]
+    line: str
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, self.error_class)
+            and other.line == self.line  # type: ignore[attr-defined]
+        )
+
+
+class ContentDisposition:
+
+    """Records various indications and hints about content disposition.
+
+    These can be used to know if a file should be downloaded or
+    displayed directly, and to hint what filename it should have
+    in the download case.
+    """
+
+    # Ignoring this defect fixes the attfnboth2 test case. It does *not* fix attfnboth
+    # one which has a slightly different wording ("duplicate(s) ignored" instead of
+    # "duplicate ignored"), because even if we did ignore that one, it still wouldn't
+    # work properly...
+    _IGNORED_DEFECT = DefectWrapper(
+        email.errors.InvalidHeaderDefect,
+        'duplicate parameter name; duplicate ignored'
+    )
+
+    def __init__(self, disposition, params):
+        """Used internally after parsing the header."""
+        self.disposition = disposition
+        self.params = params
+        assert 'filename*' not in self.params  # Handled by headerregistry
+
+    @classmethod
+    def parse(cls, value):
+        """Build a _ContentDisposition from header values."""
+        # We allow non-ascii here (it will only be parsed inside of qdtext, and
+        # rejected by the grammar if it appears in other places), although parsing
+        # it can be ambiguous.  Parsing it ensures that a non-ambiguous filename*
+        # value won't get dismissed because of an unrelated ambiguity in the
+        # filename parameter. But it does mean we occasionally give
+        # less-than-certain values for some legacy senders.
+        decoded = value.decode('iso-8859-1')
+
+        reg = email.headerregistry.HeaderRegistry()
+        try:
+            parsed = reg('Content-Disposition', decoded)
+        except IndexError:  # pragma: no cover
+            # WORKAROUND for https://github.com/python/cpython/issues/81672
+            # Fixed in Python 3.7.5 and 3.8.0.
+            # Still getting failures on 3.10 on CI though
+            raise ContentDispositionError("Missing closing quote character")
+        except ValueError:
+            # WORKAROUND for https://github.com/python/cpython/issues/87112
+            raise ContentDispositionError("Non-ASCII digit")
+        except AttributeError:  # pragma: no cover
+            # WORKAROUND for https://github.com/python/cpython/issues/93010
+            raise ContentDispositionError("Section number has an invalid leading 0")
+
+        if parsed.defects:
+            defects = list(parsed.defects)
+            if defects != [cls._IGNORED_DEFECT]:
+                raise ContentDispositionError(defects)
+
+        # https://github.com/python/mypy/issues/12314
+        assert isinstance(
+            parsed,  # type: ignore[unreachable]
+            email.headerregistry.ContentDispositionHeader,
+        ), parsed
+        return cls(  # type: ignore[unreachable]
+            disposition=parsed.content_disposition,
+            params=parsed.params,
+        )
+
+    def filename(self):
+        """The filename from the Content-Disposition header or None.
+
+        On safety:
+
+        This property records the intent of the sender.
+
+        You shouldn't use this sender-controlled value as a filesystem path, it
+        can be insecure. Serving files with this filename can be dangerous as
+        well, due to a certain browser using the part after the dot for
+        mime-sniffing.  Saving it to a database is fine by itself though.
+        """
+        return self.params.get('filename')
+
+    def is_inline(self):
+        """Return if the file should be handled inline.
+
+        If not, and unless your application supports other dispositions
+        than the standard inline and attachment, it should be handled
+        as an attachment.
+        """
+        return self.disposition in {None, 'inline'}
+
+    def __repr__(self):
+        return utils.get_repr(self, constructor=True,
+                              disposition=self.disposition, params=self.params)
+
+
+def parse_content_disposition(reply):
+    """Parse a content_disposition header.
+
+    Args:
+        reply: The QNetworkReply to get a filename for.
+
+    Return:
+        A (is_inline, filename) tuple.
+    """
+    is_inline = True
+    filename = None
+    content_disposition_header = b'Content-Disposition'
+    # First check if the Content-Disposition header has a filename
+    # attribute.
+    if reply.hasRawHeader(content_disposition_header):
+        # We use the unsafe variant of the filename as we sanitize it via
+        # os.path.basename later.
+        try:
+            value = bytes(reply.rawHeader(content_disposition_header))
+            log.network.debug(f"Parsing Content-Disposition: {value!r}")
+            content_disposition = ContentDisposition.parse(value)
+            filename = content_disposition.filename()
+        except ContentDispositionError as e:
+            log.network.error(f"Error while parsing filename: {e}")
+        else:
+            is_inline = content_disposition.is_inline()
+    # Then try to get filename from url
+    if not filename:
+        filename = reply.url().path().rstrip('/')
+    # If that fails as well, use a fallback
+    if not filename:
+        filename = 'qutebrowser-download'
+    return is_inline, os.path.basename(filename)
+
+
+def parse_content_type(reply):
+    """Parse a Content-Type header.
+
+    The parsing done here is very cheap, as we really only want to get the
+    Mimetype. Parameters aren't parsed specially.
+
+    Args:
+        reply: The QNetworkReply to handle.
+
+    Return:
+        A [mimetype, rest] list, or [None, None] if unset.
+        Rest can be None.
+    """
+    content_type = reply.header(QNetworkRequest.KnownHeaders.ContentTypeHeader)
+    if content_type is None:
+        return [None, None]
+    if ';' in content_type:
+        ret = content_type.split(';', maxsplit=1)
+    else:
+        ret = [content_type, None]
+    ret[0] = ret[0].strip()
+    return ret