qutebrowser/browser/webkit/httpheaders.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

# SPDX-FileCopyrightText: Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Parsing functions for various HTTP headers."""

import email.headerregistry
import email.errors
import dataclasses
import os.path
from typing import Type

from qutebrowser.qt.network import QNetworkRequest

from qutebrowser.utils import log, utils


class ContentDispositionError(Exception):

    """Raised when a Content-Disposition header (RFC 6266) can't be parsed."""


@dataclasses.dataclass
class DefectWrapper:

    """Stand-in which compares equal to one specific email parsing defect.

    An object is considered equal to the wrapper if it is an instance of
    error_class and its line attribute equals the wrapper's line.
    """

    # The defect class an object has to be an instance of in order to match.
    error_class: Type[email.errors.MessageDefect]
    # The value the defect's "line" attribute has to have in order to match.
    line: str

    def __eq__(self, other):
        # Anything which isn't the expected kind of defect never matches.
        if not isinstance(other, self.error_class):
            return False
        return other.line == self.line  # type: ignore[attr-defined]


class ContentDisposition:

    """Parsed form of a Content-Disposition header.

    The stored information tells whether the content is meant to be
    displayed inline or downloaded, and which filename the sender suggests
    for the download case.

    Attributes:
        disposition: The disposition type taken from the parsed header.
        params: The header's parameters, looked up by name.
    """

    # The only parser defect we tolerate. Ignoring it makes the attfnboth2
    # test case work. The attfnboth case reports a slightly differently worded
    # defect ("duplicate(s) ignored" rather than "duplicate ignored") and is
    # deliberately *not* covered here, since ignoring that one still wouldn't
    # make it work properly.
    _IGNORED_DEFECT = DefectWrapper(
        email.errors.InvalidHeaderDefect,
        'duplicate parameter name; duplicate ignored'
    )

    def __init__(self, disposition, params):
        """Store the parsed values; only meant to be called by parse()."""
        self.disposition = disposition
        self.params = params
        # headerregistry already takes care of filename*, so it should never
        # show up as a separate parameter.
        assert 'filename*' not in self.params

    @classmethod
    def parse(cls, value):
        """Create a ContentDisposition from a raw header value.

        Raises ContentDispositionError if the header can't be parsed, or if
        the parser reports anything else than exactly one ignored defect.

        Args:
            value: The raw header value as bytes.

        Return:
            A new ContentDisposition instance.
        """
        # Non-ascii data is accepted here: It is only parsed inside of qdtext
        # and rejected by the grammar anywhere else. Parsing it can be
        # ambiguous, but it makes sure an unambiguous filename* value isn't
        # thrown away just because the plain filename parameter is ambiguous.
        # The downside is that we sometimes return less-than-certain values
        # for some legacy senders.
        text = value.decode('iso-8859-1')
        registry = email.headerregistry.HeaderRegistry()

        try:
            header = registry('Content-Disposition', text)
        except IndexError:  # pragma: no cover
            # WORKAROUND for https://github.com/python/cpython/issues/81672
            # Fixed in Python 3.7.5 and 3.8.0.
            # Still getting failures on 3.10 on CI though
            raise ContentDispositionError("Missing closing quote character")
        except ValueError:
            # WORKAROUND for https://github.com/python/cpython/issues/87112
            raise ContentDispositionError("Non-ASCII digit")
        except AttributeError:  # pragma: no cover
            # WORKAROUND for https://github.com/python/cpython/issues/93010
            raise ContentDispositionError("Section number has an invalid leading 0")

        # Any defect is fatal, unless the only one reported is the ignored one.
        found = list(header.defects)
        if found and found != [cls._IGNORED_DEFECT]:
            raise ContentDispositionError(found)

        # https://github.com/python/mypy/issues/12314
        assert isinstance(
            header,  # type: ignore[unreachable]
            email.headerregistry.ContentDispositionHeader,
        ), header
        return cls(  # type: ignore[unreachable]
            disposition=header.content_disposition,
            params=header.params,
        )

    def filename(self):
        """Get the filename parameter of the header, or None if there is none.

        On safety:

        The returned value is what the sender asked for, i.e. it is fully
        sender-controlled.

        Don't use it as a filesystem path as-is, that can be insecure.
        Serving files under this name can be dangerous too, because a
        certain browser uses the part after the dot for mime-sniffing.
        Merely storing it (e.g. in a database) is fine.
        """
        return self.params.get('filename')

    def is_inline(self):
        """Check whether the content should be handled inline.

        This is the case if there is no disposition type at all, or if it is
        'inline'. Everything else should be handled as an attachment, unless
        the application supports dispositions other than the standard
        inline and attachment.
        """
        inline_dispositions = {None, 'inline'}
        return self.disposition in inline_dispositions

    def __repr__(self):
        return utils.get_repr(
            self,
            constructor=True,
            disposition=self.disposition,
            params=self.params,
        )


def parse_content_disposition(reply):
    """Parse a content_disposition header.

    The filename is looked up in this order, using the first non-empty
    result:

    1. The basename of the filename given in the Content-Disposition header.
    2. The last component of the path of the reply's URL.
    3. The constant 'qutebrowser-download'.

    If the header is present but can't be parsed, the error is logged and
    the reply is treated as if the header was absent.

    Args:
        reply: The QNetworkReply to get a filename for.

    Return:
        A (is_inline, filename) tuple. The filename is never empty and has
        been passed through os.path.basename.
    """
    is_inline = True
    filename = None
    content_disposition_header = b'Content-Disposition'
    # First check if the Content-Disposition header has a filename
    # attribute.
    if reply.hasRawHeader(content_disposition_header):
        # We use the unsafe variant of the filename as we sanitize it via
        # os.path.basename below.
        try:
            value = bytes(reply.rawHeader(content_disposition_header))
            log.network.debug(f"Parsing Content-Disposition: {value!r}")
            content_disposition = ContentDisposition.parse(value)
            filename = content_disposition.filename()
        except ContentDispositionError as e:
            log.network.error(f"Error while parsing filename: {e}")
        else:
            is_inline = content_disposition.is_inline()
    if filename:
        # Strip any directory part right away: For a value ending in a path
        # separator (e.g. "foo/") the basename is empty, and in that case we
        # want to continue with the fallbacks below instead of returning an
        # empty filename.
        filename = os.path.basename(filename)
    # Then try to get filename from url
    if not filename:
        filename = os.path.basename(reply.url().path().rstrip('/'))
    # If that fails as well, use a fallback
    if not filename:
        filename = 'qutebrowser-download'
    return is_inline, filename


def parse_content_type(reply):
    """Split the Content-Type header of a reply into mimetype and the rest.

    This is intentionally cheap: The header is only cut at the first
    semicolon, since the mimetype is all we really care about. Whatever
    follows (the parameters) is returned unparsed.

    Args:
        reply: The QNetworkReply to handle.

    Return:
        A [mimetype, rest] list, or [None, None] if the header is unset.
        The mimetype has surrounding whitespace stripped. Rest is None if
        the header doesn't contain a semicolon.
    """
    header = reply.header(QNetworkRequest.KnownHeaders.ContentTypeHeader)
    if header is None:
        return [None, None]

    mimetype, separator, rest = header.partition(';')
    # An empty separator means there was no semicolon, i.e. no parameters.
    return [mimetype.strip(), rest if separator else None]