qutebrowser/browser/pdfjs.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2016-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
# Copyright 2015 Daniel Schadt
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.

"""pdf.js integration for qutebrowser."""

import os

from PyQt5.QtCore import QUrl, QUrlQuery

from qutebrowser.utils import resources, javascript, jinja, standarddir, log
from qutebrowser.config import config


_SYSTEM_PATHS = [
    # Debian pdf.js-common
    # Arch Linux pdfjs
    '/usr/share/pdf.js/',
    # Flatpak (Flathub)
    '/app/share/pdf.js/',
    # Arch Linux pdf.js (defunct)
    '/usr/share/javascript/pdf.js/',
    # Debian libjs-pdf
    '/usr/share/javascript/pdf/',
]


class PDFJSNotFound(Exception):

    """Raised when no pdf.js installation is found.

    Attributes:
        path: path of the file that was requested but not found.
    """

    def __init__(self, path):
        self.path = path
        message = "Path '{}' not found".format(path)
        super().__init__(message)


def generate_pdfjs_page(filename, url):
    """Return the html content of a page that displays a file with pdfjs.

    Returns a string.

    Args:
        filename: The filename of the PDF to open.
        url: The URL being opened.
    """
    if not is_available():
        pdfjs_dir = os.path.join(standarddir.data(), 'pdfjs')
        return jinja.render('no_pdfjs.html',
                            url=url.toDisplayString(),
                            title="PDF.js not found",
                            pdfjs_dir=pdfjs_dir)
    html = get_pdfjs_res('web/viewer.html').decode('utf-8')

    script = _generate_pdfjs_script(filename)
    html = html.replace('</body>',
                        '</body><script>{}</script>'.format(script))
    # WORKAROUND for the fact that PDF.js tries to use the Fetch API even with
    # qute:// URLs.
    pdfjs_script = '<script src="../build/pdf.js"></script>'
    html = html.replace(pdfjs_script,
                        '<script>window.Response = undefined;</script>\n' +
                        pdfjs_script)
    return html


def _generate_pdfjs_script(filename):
    """Generate the script that shows the pdf with pdf.js.

    Args:
        filename: The name of the file to open.
    """
    url = QUrl('qute://pdfjs/file')
    url_query = QUrlQuery()
    url_query.addQueryItem('filename', filename)
    url.setQuery(url_query)

    js_url = javascript.to_js(
        url.toString(QUrl.FullyEncoded))  # type: ignore[arg-type]

    return jinja.js_environment.from_string("""
        document.addEventListener("DOMContentLoaded", function() {
          if (typeof window.PDFJS !== 'undefined') {
              // v1.x
              window.PDFJS.verbosity = window.PDFJS.VERBOSITY_LEVELS.info;
          } else {
              // v2.x
              const options = window.PDFViewerApplicationOptions;
              options.set('verbosity', pdfjsLib.VerbosityLevel.INFOS);
          }

          const viewer = window.PDFView || window.PDFViewerApplication;
          viewer.open({{ url }});
        });
    """).render(url=js_url)


def get_pdfjs_res_and_path(path):
    """Get a pdf.js resource in binary format.

    Returns a (content, path) tuple, where content is the file content and path
    is the path where the file was found. If path is None, the bundled version
    was used.

    Args:
        path: The path inside the pdfjs directory.
    """
    path = path.lstrip('/')
    content = None
    file_path = None

    system_paths = _SYSTEM_PATHS + [
        # fallback
        os.path.join(standarddir.data(), 'pdfjs'),
        # hardcoded fallback for --temp-basedir
        os.path.expanduser('~/.local/share/qutebrowser/pdfjs/'),
    ]

    # First try a system wide installation
    # System installations might strip off the 'build/' or 'web/' prefixes.
    # qute expects them, so we need to adjust for it.
    names_to_try = [path, _remove_prefix(path)]
    for system_path in system_paths:
        content, file_path = _read_from_system(system_path, names_to_try)
        if content is not None:
            break

    # Fallback to bundled pdf.js
    if content is None:
        res_path = '3rdparty/pdfjs/{}'.format(path)
        try:
            content = resources.read_file_binary(res_path)
        except FileNotFoundError:
            raise PDFJSNotFound(path) from None
        except OSError as e:
            log.misc.warning("OSError while reading PDF.js file: {}".format(e))
            raise PDFJSNotFound(path) from None

    return content, file_path


def get_pdfjs_res(path):
    """Get a pdf.js resource in binary format.

    Args:
        path: The path inside the pdfjs directory.
    """
    content, _path = get_pdfjs_res_and_path(path)
    return content


def _remove_prefix(path):
    """Remove the web/ or build/ prefix of a pdfjs-file-path.

    Args:
        path: Path as string where the prefix should be stripped off.
    """
    prefixes = {'web/', 'build/'}
    if any(path.startswith(prefix) for prefix in prefixes):
        return path.split('/', maxsplit=1)[1]
    # Return the unchanged path if no prefix is found
    return path


def _read_from_system(system_path, names):
    """Try to read a file with one of the given names in system_path.

    Returns a (content, path) tuple, where the path is the filepath that was
    used.

    Each file in names is considered equal, the first file that is found
    is read and its binary content returned.

    Returns (None, None) if no file could be found

    Args:
        system_path: The folder where the file should be searched.
        names: List of possible file names.
    """
    for name in names:
        try:
            full_path = os.path.join(system_path, name)
            with open(full_path, 'rb') as f:
                return (f.read(), full_path)
        except FileNotFoundError:
            continue
        except OSError as e:
            log.misc.warning("OSError while reading PDF.js file: {}".format(e))
            continue
    return (None, None)


def is_available():
    """Return true if a pdfjs installation is available."""
    try:
        get_pdfjs_res('build/pdf.js')
        get_pdfjs_res('web/viewer.html')
    except PDFJSNotFound:
        return False
    else:
        return True


def should_use_pdfjs(mimetype, url):
    """Check whether PDF.js should be used."""
    # e.g. 'blob:qute%3A///b45250b3-787e-44d1-a8d8-c2c90f81f981'
    is_download_url = (url.scheme() == 'blob' and
                       QUrl(url.path()).scheme() == 'qute')
    is_pdf = mimetype in ['application/pdf', 'application/x-pdf']
    config_enabled = config.instance.get('content.pdfjs', url=url)
    return is_pdf and not is_download_url and config_enabled


def get_main_url(filename: str, original_url: QUrl) -> QUrl:
    """Get the URL to be opened to view a local PDF."""
    url = QUrl('qute://pdfjs/web/viewer.html')
    query = QUrlQuery()
    query.addQueryItem('filename', filename)  # read from our JS
    query.addQueryItem('file', '')  # to avoid pdfjs opening the default PDF
    urlstr = original_url.toString(QUrl.FullyEncoded)  # type: ignore[arg-type]
    query.addQueryItem('source', urlstr)
    url.setQuery(query)
    return url