summaryrefslogtreecommitdiff
path: root/qutebrowser/browser/greasemonkey.py
blob: 2ec6c940014daffd5f801764238bfdbc8cfdc52b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2017-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.

"""Load, parse and make available Greasemonkey scripts."""

import re
import os
import json
import fnmatch
import functools
import glob
import textwrap
import dataclasses
from typing import cast, List, Sequence

from PyQt5.QtCore import pyqtSignal, QObject, QUrl

from qutebrowser.utils import (log, standarddir, jinja, objreg, utils,
                               javascript, urlmatch, version, usertypes, message)
from qutebrowser.api import cmdutils
from qutebrowser.browser import downloads
from qutebrowser.misc import objects


gm_manager = cast('GreasemonkeyManager', None)


def _scripts_dirs():
    """Get the directory of the scripts."""
    return [
        os.path.join(standarddir.data(), 'greasemonkey'),
        os.path.join(standarddir.config(), 'greasemonkey'),
    ]


class GreasemonkeyScript:

    """Container class for userscripts, parses metadata blocks."""

    def __init__(self, properties, code,  # noqa: C901 pragma: no mccabe
                 filename=None):
        self._code = code
        self.includes: Sequence[str] = []
        self.matches: Sequence[str] = []
        self.excludes: Sequence[str] = []
        self.requires: Sequence[str] = []
        self.description = None
        self.namespace = None
        self.run_at = None
        self.script_meta = None
        self.runs_on_sub_frames = True
        self.jsworld = "main"
        self.name = ''

        for name, value in properties:
            if name == 'name':
                self.name = value
            elif name == 'namespace':
                self.namespace = value
            elif name == 'description':
                self.description = value
            elif name == 'include':
                self.includes.append(value)
            elif name == 'match':
                self.matches.append(value)
            elif name in ['exclude', 'exclude_match']:
                self.excludes.append(value)
            elif name == 'run-at':
                self.run_at = value
            elif name == 'noframes':
                self.runs_on_sub_frames = False
            elif name == 'require':
                self.requires.append(value)
            elif name == 'qute-js-world':
                self.jsworld = value

        if not self.name:
            if filename:
                self.name = filename
            else:
                raise ValueError(
                    "@name key required or pass filename to init."
                )

    HEADER_REGEX = r'// ==UserScript==|\n+// ==/UserScript==\n'
    PROPS_REGEX = r'// @(?P<prop>[^\s]+)\s*(?P<val>.*)'

    @classmethod
    def parse(cls, source, filename=None):
        """GreasemonkeyScript factory.

        Takes a userscript source and returns a GreasemonkeyScript.
        Parses the Greasemonkey metadata block, if present, to fill out
        attributes.
        """
        matches = re.split(cls.HEADER_REGEX, source, maxsplit=2)
        try:
            _head, props, _code = matches
        except ValueError:
            props = ""
        script = cls(
            re.findall(cls.PROPS_REGEX, props),
            source,
            filename=filename
        )
        script.script_meta = props
        if not script.includes and not script.matches:
            script.includes = ['*']
        return script

    def needs_document_end_workaround(self):
        """Check whether to force @run-at document-end.

        This needs to be done on QtWebEngine (since Qt 5.12) for known-broken scripts.

        On Qt 5.12, accessing the DOM isn't possible with "@run-at
        document-start". It was documented to be impossible before, but seems
        to work fine.

        However, some scripts do DOM access with "@run-at document-start". Fix
        those by forcing them to use document-end instead.
        """
        if objects.backend == usertypes.Backend.QtWebKit:
            return False

        assert objects.backend == usertypes.Backend.QtWebEngine, objects.backend

        broken_scripts = [
            ('http://userstyles.org', None),
            ('https://github.com/ParticleCore', 'Iridium'),
        ]
        return any(self._matches_id(namespace=namespace, name=name)
                   for namespace, name in broken_scripts)

    def _matches_id(self, *, namespace, name):
        """Check if this script matches the given namespace/name.

        Both namespace and name can be None in order to match any script.
        """
        matches_namespace = namespace is None or self.namespace == namespace
        matches_name = name is None or self.name == name
        return matches_namespace and matches_name

    def code(self):
        """Return the processed JavaScript code of this script.

        Adorns the source code with GM_* methods for Greasemonkey
        compatibility and wraps it in an IIFE to hide it within a
        lexical scope. Note that this means line numbers in your
        browser's debugger/inspector will not match up to the line
        numbers in the source script directly.
        """
        # Don't use Proxy on this webkit version, the support isn't there.
        use_proxy = not (
            objects.backend == usertypes.Backend.QtWebKit and
            version.qWebKitVersion() == '602.1')
        template = jinja.js_environment.get_template('greasemonkey_wrapper.js')
        return template.render(
            scriptName=javascript.string_escape(
                "/".join([self.namespace or '', self.name])),
            scriptInfo=self._meta_json(),
            scriptMeta=javascript.string_escape(self.script_meta or ''),
            scriptSource=self._code,
            use_proxy=use_proxy)

    def _meta_json(self):
        return json.dumps({
            'name': self.name,
            'description': self.description,
            'matches': self.matches,
            'includes': self.includes,
            'excludes': self.excludes,
            'run-at': self.run_at,
        })

    def add_required_script(self, source):
        """Add the source of a required script to this script."""
        # The additional source is indented in case it also contains a
        # metadata block. Because we pass everything at once to
        # QWebEngineScript and that would parse the first metadata block
        # found as the valid one.
        self._code = "\n".join([textwrap.indent(source, "    "), self._code])


@dataclasses.dataclass
class MatchingScripts:

    """All userscripts registered to run on a particular url."""

    url: QUrl
    start: List[GreasemonkeyScript] = dataclasses.field(default_factory=list)
    end: List[GreasemonkeyScript] = dataclasses.field(default_factory=list)
    idle: List[GreasemonkeyScript] = dataclasses.field(default_factory=list)


class GreasemonkeyMatcher:

    """Check whether scripts should be loaded for a given URL."""

    # https://wiki.greasespot.net/Include_and_exclude_rules#Greaseable_schemes
    # Limit the schemes scripts can run on due to unreasonable levels of
    # exploitability
    GREASEABLE_SCHEMES = ['http', 'https', 'ftp', 'file']

    def __init__(self, url):
        self._url = url
        self._url_string = url.toString(QUrl.FullyEncoded)
        self.is_greaseable = url.scheme() in self.GREASEABLE_SCHEMES

    def _match_pattern(self, pattern):
        # For include and exclude rules if they start and end with '/' they
        # should be treated as a (ecma syntax) regular expression.
        if pattern.startswith('/') and pattern.endswith('/'):
            matches = re.search(pattern[1:-1], self._url_string, flags=re.I)
            return matches is not None

        # Otherwise they are glob expressions.
        return fnmatch.fnmatch(self._url_string, pattern)

    def matches(self, script):
        """Check whether the URL matches filtering rules of the script."""
        assert self.is_greaseable
        matching_includes = any(self._match_pattern(pat)
                                for pat in script.includes)
        matching_match = any(urlmatch.UrlPattern(pat).matches(self._url)
                             for pat in script.matches)
        matching_excludes = any(self._match_pattern(pat)
                                for pat in script.excludes)
        return (matching_includes or matching_match) and not matching_excludes


class GreasemonkeyManager(QObject):

    """Manager of userscripts and a Greasemonkey compatible environment.

    Signals:
        scripts_reloaded: Emitted when scripts are reloaded from disk.
            Any cached or already-injected scripts should be
            considered obsolete.
    """

    scripts_reloaded = pyqtSignal()

    def __init__(self, parent=None):
        super().__init__(parent)
        self._run_start: List[GreasemonkeyScript] = []
        self._run_end: List[GreasemonkeyScript] = []
        self._run_idle: List[GreasemonkeyScript] = []
        self._in_progress_dls: List[downloads.AbstractDownloadItem] = []

        self.load_scripts()

    def load_scripts(self, *, force: bool = False) -> List[GreasemonkeyScript]:
        """Re-read Greasemonkey scripts from disk.

        The scripts are read from a 'greasemonkey' subdirectory in
        qutebrowser's data directory (see `:version`).

        Args:
            force: For any scripts that have required dependencies,
                   re-download them.

        Return:
            A list of loaded scripts.
        """
        self._run_start = []
        self._run_end = []
        self._run_idle = []

        scripts = []
        for scripts_dir in _scripts_dirs():
            scripts_dir = os.path.abspath(scripts_dir)
            log.greasemonkey.debug("Reading scripts from: {}".format(scripts_dir))

            for script_filename in glob.glob(os.path.join(scripts_dir, '*.js')):
                if not os.path.isfile(script_filename):
                    continue
                script_path = os.path.join(scripts_dir, script_filename)
                with open(script_path, encoding='utf-8-sig') as script_file:
                    script = GreasemonkeyScript.parse(script_file.read(),
                                                      script_filename)
                    if not script.name:
                        script.name = script_filename
                    self.add_script(script, force)
                    scripts.append(script)

        self.scripts_reloaded.emit()
        return sorted(scripts, key=lambda script: script.name)

    def add_script(self, script, force=False):
        """Add a GreasemonkeyScript to this manager.

        Args:
            force: Fetch and overwrite any dependencies which are
                   already locally cached.
        """
        if script.requires:
            log.greasemonkey.debug(
                "Deferring script until requirements are "
                "fulfilled: {}".format(script.name))
            self._get_required_scripts(script, force)
        else:
            self._add_script(script)

    def _add_script(self, script):
        if script.run_at == 'document-start':
            self._run_start.append(script)
        elif script.run_at == 'document-end':
            self._run_end.append(script)
        elif script.run_at == 'document-idle':
            self._run_idle.append(script)
        else:
            if script.run_at:
                log.greasemonkey.warning("Script {} has invalid run-at "
                                         "defined, defaulting to "
                                         "document-end"
                                         .format(script.name))
                # Default as per
                # https://wiki.greasespot.net/Metadata_Block#.40run-at
            self._run_end.append(script)
        log.greasemonkey.debug("Loaded script: {}".format(script.name))

    def _required_url_to_file_path(self, url):
        requires_dir = os.path.join(_scripts_dirs()[0], 'requires')
        if not os.path.exists(requires_dir):
            os.mkdir(requires_dir)
        return os.path.join(requires_dir, utils.sanitize_filename(url))

    def _on_required_download_finished(self, script, download):
        self._in_progress_dls.remove(download)
        if not self._add_script_with_requires(script):
            log.greasemonkey.debug(
                "Finished download {} for script {} "
                "but some requirements are still pending"
                .format(download.basename, script.name))

    def _add_script_with_requires(self, script, quiet=False):
        """Add a script with pending downloads to this GreasemonkeyManager.

        Specifically a script that has dependencies specified via an
        `@require` rule.

        Args:
            script: The GreasemonkeyScript to add.
            quiet: True to suppress the scripts_reloaded signal after
                   adding `script`.
        Returns: True if the script was added, False if there are still
                 dependencies being downloaded.
        """
        # See if we are still waiting on any required scripts for this one
        for dl in self._in_progress_dls:
            if dl.requested_url in script.requires:
                return False

        # Need to add the required scripts to the IIFE now
        for url in reversed(script.requires):
            target_path = self._required_url_to_file_path(url)
            log.greasemonkey.debug(
                "Adding required script for {} to IIFE: {}"
                .format(script.name, url))
            with open(target_path, encoding='utf8') as f:
                script.add_required_script(f.read())

        self._add_script(script)
        if not quiet:
            self.scripts_reloaded.emit()
        return True

    def _get_required_scripts(self, script, force=False):
        required_dls = [(url, self._required_url_to_file_path(url))
                        for url in script.requires]
        if not force:
            required_dls = [(url, path) for (url, path) in required_dls
                            if not os.path.exists(path)]
        if not required_dls:
            # All the required files exist already
            self._add_script_with_requires(script, quiet=True)
            return

        download_manager = objreg.get('qtnetwork-download-manager')

        for url, target_path in required_dls:
            target = downloads.FileDownloadTarget(target_path,
                                                  force_overwrite=True)
            download = download_manager.get(QUrl(url), target=target,
                                            auto_remove=True)
            download.requested_url = url
            self._in_progress_dls.append(download)
            if download.successful:
                self._on_required_download_finished(script, download)
            else:
                download.finished.connect(
                    functools.partial(self._on_required_download_finished,
                                      script, download))

    def scripts_for(self, url):
        """Fetch scripts that are registered to run for url.

        returns a tuple of lists of scripts meant to run at (document-start,
        document-end, document-idle)
        """
        matcher = GreasemonkeyMatcher(url)
        if not matcher.is_greaseable:
            return MatchingScripts(url, [], [], [])
        return MatchingScripts(
            url=url,
            start=[script for script in self._run_start
                   if matcher.matches(script)],
            end=[script for script in self._run_end
                 if matcher.matches(script)],
            idle=[script for script in self._run_idle
                  if matcher.matches(script)]
        )

    def all_scripts(self):
        """Return all scripts found in the configured script directory."""
        return self._run_start + self._run_end + self._run_idle


@cmdutils.register()
def greasemonkey_reload(force: bool = False, quiet: bool = False) -> None:
    """Re-read Greasemonkey scripts from disk.

    The scripts are read from a 'greasemonkey' subdirectory in
    qutebrowser's data or config directories (see `:version`).

    Args:
        force: For any scripts that have required dependencies,
                re-download them.
        quiet: Suppress message after loading scripts.
    """
    scripts = gm_manager.load_scripts(force=force)
    names = '\n'.join(script.name for script in scripts)
    if not quiet:
        message.info(f"Loaded scripts:\n\n{names}")


def init():
    """Initialize Greasemonkey support."""
    global gm_manager
    gm_manager = GreasemonkeyManager()

    for scripts_dir in _scripts_dirs():
        try:
            os.mkdir(scripts_dir)
        except FileExistsError:
            pass