summaryrefslogtreecommitdiff
path: root/scripts/importer.py
blob: 1914e6976c1c32c9e4ae887c147a1a43b1c35451 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#!/usr/bin/env python3

# SPDX-FileCopyrightText: Claude (longneck) <longneck@scratchbook.ch>
# SPDX-FileCopyrightText: Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later


"""Tool to import data from other browsers.

Currently importing bookmarks from Netscape HTML Bookmark files, Chrome
profiles, and Mozilla profiles is supported.
"""


import argparse
import textwrap
import sqlite3
import os
import urllib.parse
import json
import string


def main():
    """Translate the command-line flags into importer settings and run it.

    Search output (-s) always exports search engines. Otherwise -B/-K select
    what is exported, falling back to both bookmarks and keywords when
    neither flag is given. Quickmark output is used unless -b is passed.
    """
    args = get_args()

    if args.search_output:
        # -s overrides -B/-K as well as the bookmark/quickmark output choice.
        selected_types = ['search']
        chosen_format = 'oldsearch' if args.oldconfig else 'search'
    else:
        chosen_format = 'bookmark' if args.bookmark_output else 'quickmark'
        requested = (
            (args.import_bookmarks, 'bookmark'),
            (args.import_keywords, 'keyword'),
        )
        selected_types = [name for enabled, name in requested if enabled]
        if not selected_types:
            selected_types = ['bookmark', 'keyword']

    importers = {
        'html': import_html_bookmarks,
        'mozilla': import_moz_places,
        'chrome': import_chrome,
    }
    importer = importers[args.input_format]
    importer(args.bookmarks, selected_types, chosen_format)


def get_args():
    """Parse the command line and return the resulting namespace.

    Returns:
        The argparse.Namespace with the attributes input_format,
        bookmark_output, quickmark_output, search_output, oldconfig,
        import_bookmarks, import_keywords and bookmarks.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''
            To import bookmarks, you'll need the path to your profile or an
            exported HTML file from your browser's bookmark manager. Redirect
            the output from this script to the appropriate file in your
            qutebrowser config directory (listed in the output of :version),
            usually done with the '>' operator; for example,
                ./importer.py -i mozilla your_profile_path > ~/.config/qutebrowser/quickmarks

            Common browsers with native input format support:
                chrome: Chrome, Chromium, Edge
                mozilla: Firefox, SeaMonkey, Pale Moon
        '''))
    parser.add_argument(
        '-i',
        '--input-format',
        help="Which input format? Defaults to html",
        choices=['html', 'mozilla', 'chrome'],
        default='html')
    # The boolean switches below all default to False via 'store_true'.
    parser.add_argument(
        '-b',
        '--bookmark-output',
        help="Output in bookmark format.",
        action='store_true')
    parser.add_argument(
        '-q',
        '--quickmark-output',
        help="Output in quickmark format (default).",
        action='store_true')
    parser.add_argument(
        '-s',
        '--search-output',
        help="Output config.py search engine format (negates -B and -K)",
        action='store_true')
    parser.add_argument(
        '--oldconfig',
        help="Output search engine format for old qutebrowser.conf format",
        action='store_true')
    parser.add_argument(
        '-B',
        '--import-bookmarks',
        help="Import plain bookmarks (can be combined with -K)",
        action='store_true')
    parser.add_argument(
        '-K',
        '--import-keywords',
        help="Import keywords (can be combined with -B)",
        action='store_true')
    parser.add_argument(
        'bookmarks',
        help="Bookmarks file (html format) or "
        "profile folder (Mozilla and Chrome formats)")
    return parser.parse_args()


def search_escape(url):
    """Double every literal brace in a URL so str.format leaves it alone.

    Do not feed this a URL that already is a qutebrowser search template:
    its ``{}`` placeholder would get escaped as well.
    """
    doubled_braces = str.maketrans({'{': '{{', '}': '}}'})
    return url.translate(doubled_braces)


def opensearch_convert(url):
    """Turn a simple OpenSearch URL template into a qutebrowser search URL.

    Optional parameters (marked with a trailing ``?``) are dropped, the known
    required ones are filled in with fixed values, and the search terms
    placeholder ends up as ``{}``.

    Exceptions:
        KeyError:
            The URL requires a parameter that is not in the known set. That
            usually means building the URL needs browser/addon specific
            functionality which is not available here.
    """
    known_values = {
        'searchTerms': '%s',  # swapped for '{}' once braces are escaped
        'language': '*',
        'inputEncoding': 'UTF-8',
        'outputEncoding': 'UTF-8'
    }

    # Strip optional parameters, whether or not we could have filled them in.
    for _literal, field, spec, _conversion in string.Formatter().parse(url):
        if not field:
            continue
        if field.endswith('?'):
            optional = '{' + field + '}'
        elif spec and spec.endswith('?'):
            optional = '{' + field + ':' + spec + '}'
        else:
            continue
        url = url.replace(optional, '')

    filled = url.format(**known_values)
    return search_escape(filled).replace('%s', '{}')


def import_html_bookmarks(bookmarks_file, bookmark_types, output_format):
    """Import bookmarks from a NETSCAPE-Bookmark-file v1.

    Generated by Chromium, Firefox, IE and possibly more browsers. Not all
    export all possible bookmark types:
        - Firefox mostly works with everything
        - Chrome doesn't support keywords at all; searches are a separate
          database

    Args:
        bookmarks_file: Path of the exported HTML file (read as UTF-8).
        bookmark_types: Iterable of 'bookmark', 'keyword' and/or 'search'.
        output_format: 'bookmark' or 'quickmark' for the bookmark/keyword
                       types, 'search' or 'oldsearch' for the search type.

    Every URL is emitted at most once (the first occurrence wins). The
    resulting lines are printed to stdout once the whole file was processed.
    """
    import bs4
    with open(bookmarks_file, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')
    # Anchors without an href carry no URL to import, so every query skips
    # them instead of failing later on tag['href'].
    bookmark_query = {
        'search': lambda tag: (
            (tag.name == 'a') and
            ('shortcuturl' in tag.attrs) and
            ('href' in tag.attrs) and
            ('%s' in tag['href'])),
        'keyword': lambda tag: (
            (tag.name == 'a') and
            ('shortcuturl' in tag.attrs) and
            ('href' in tag.attrs) and
            ('%s' not in tag['href'])),
        'bookmark': lambda tag: (
            (tag.name == 'a') and
            ('shortcuturl' not in tag.attrs) and
            ('href' in tag.attrs) and
            (tag.string)),
    }
    output_template = {
        'search': {
            'search':
            "c.url.searchengines['{tag[shortcuturl]}'] = "
            "'{tag[href]}' #{tag.string}"
        },
        'oldsearch': {
            'search': '{tag[shortcuturl]} = {tag[href]} #{tag.string}',
        },
        'bookmark': {
            'bookmark': '{tag[href]} {tag.string}',
            'keyword': '{tag[href]} {tag.string}'
        },
        'quickmark': {
            'bookmark': '{tag.string} {tag[href]}',
            'keyword': '{tag[shortcuturl]} {tag[href]}'
        }
    }
    lines = []
    # Track URLs separately from the formatted lines: comparing an href
    # against the output lines would never find a duplicate.
    seen_urls = set()
    for typ in bookmark_types:
        for tag in soup.find_all(bookmark_query[typ]):
            if typ == 'search':
                # Escape literal braces first, then turn the browser's %s
                # placeholder into qutebrowser's {}.
                tag['href'] = search_escape(tag['href']).replace('%s', '{}')
            href = tag['href']
            if href in seen_urls:
                continue
            seen_urls.add(href)
            lines.append(output_template[output_format][typ].format(tag=tag))
    for line in lines:
        print(line)


def import_moz_places(profile, bookmark_types, output_format):
    """Import bookmarks from a Mozilla profile's places.sqlite database.

    Args:
        profile: Path of the profile directory containing places.sqlite.
        bookmark_types: Iterable of 'bookmark', 'keyword' and/or 'search'.
        output_format: 'bookmark' or 'quickmark' for the bookmark/keyword
                       types, 'search' or 'oldsearch' for the search type.

    One line per matching database row is printed to stdout.
    """
    place_query = {
        'bookmark': (
            "SELECT DISTINCT moz_bookmarks.title,moz_places.url "
            "FROM moz_bookmarks,moz_places "
            "WHERE moz_places.id=moz_bookmarks.fk "
            "AND moz_places.id NOT IN (SELECT place_id FROM moz_keywords) "
            "AND moz_places.url NOT LIKE 'place:%';"
        ),  # Bookmarks with no keywords assigned
        'keyword': (
            "SELECT moz_keywords.keyword,moz_places.url "
            "FROM moz_keywords,moz_places,moz_bookmarks "
            "WHERE moz_places.id=moz_bookmarks.fk "
            "AND moz_places.id=moz_keywords.place_id "
            "AND moz_places.url NOT LIKE '%!%s%' ESCAPE '!';"
        ),  # Bookmarks with keywords assigned but no %s substitution
        'search': (
            "SELECT moz_keywords.keyword, "
            "    moz_bookmarks.title, "
            "    search_conv(moz_places.url) AS url "
            "FROM moz_keywords,moz_places,moz_bookmarks "
            "WHERE moz_places.id=moz_bookmarks.fk "
            "AND moz_places.id=moz_keywords.place_id "
            "AND moz_places.url LIKE '%!%s%' ESCAPE '!';"
        )  # bookmarks with keyword and %s substitution
    }
    out_template = {
        'bookmark': {
            'bookmark': '{url} {title}',
            'keyword': '{url} {keyword}'
        },
        'quickmark': {
            'bookmark': '{title} {url}',
            'keyword': '{keyword} {url}'
        },
        'oldsearch': {
            'search': '{keyword} {url} #{title}'
        },
        'search': {
            'search': "c.url.searchengines['{keyword}'] = '{url}' #{title}"
        }
    }

    def search_conv(url):
        """SQL helper: turn a '%s' keyword URL into a '{}' search template."""
        return search_escape(url).replace('%s', '{}')

    places = sqlite3.connect(os.path.join(profile, "places.sqlite"))
    try:
        # Make search_conv() callable from within the 'search' query.
        places.create_function('search_conv', 1, search_conv)
        # Rows need to be addressable by column name for the templates.
        places.row_factory = sqlite3.Row
        c = places.cursor()
        for typ in bookmark_types:
            c.execute(place_query[typ])
            for row in c:
                print(out_template[output_format][typ].format(**row))
    finally:
        # Release the database handle even if a query or template fails.
        places.close()


def import_chrome(profile, bookmark_types, output_format):
    """Import bookmarks and search keywords from Chrome-type profiles.

    On Chrome, keywords and search engines are the same thing and handled in
    their own database table; bookmarks cannot have associated keywords. This
    is why the dictionary lookups here are much simpler.

    Args:
        profile: Path of the profile directory; 'Web Data' (sqlite) is read
                 for searches, 'Bookmarks' (JSON) for everything else.
        bookmark_types: If it contains 'search', search engines are exported;
                        otherwise the bookmarks are.
        output_format: 'search' or 'oldsearch' for searches, 'bookmark' or
                       'quickmark' for bookmarks.

    The resulting lines are printed to stdout.
    """
    out_template = {
        'bookmark': '{url} {name}',
        'quickmark': '{name} {url}',
        'search': "c.url.searchengines['{keyword}'] = '{url}'",
        'oldsearch': '{keyword} {url}'
    }

    if 'search' in bookmark_types:
        webdata = sqlite3.connect(os.path.join(profile, 'Web Data'))
        try:
            c = webdata.cursor()
            c.execute('SELECT keyword,url FROM keywords;')
            for keyword, url in c:
                try:
                    url = opensearch_convert(url)
                    print(out_template[output_format].format(
                        keyword=keyword, url=url))
                except KeyError:
                    # Leave a comment in the output rather than aborting the
                    # whole import over one unsupported engine.
                    print('# Unsupported parameter in url for {}; skipping....'.
                          format(keyword))
        finally:
            # Release the database handle even if the query fails.
            webdata.close()

    else:
        with open(os.path.join(profile, 'Bookmarks'), encoding='utf-8') as f:
            bookmarks = json.load(f)

        def bm_tree_walk(bm, template):
            """Recursive function to walk through bookmarks."""
            # The roots mapping may hold non-folder values; ignore those.
            if not isinstance(bm, dict):
                return
            assert 'type' in bm, bm
            if bm['type'] == 'url':
                # Skip the browser's internal chrome:// pages.
                if urllib.parse.urlparse(bm['url']).scheme != 'chrome':
                    print(template.format(**bm))
            elif bm['type'] == 'folder':
                for child in bm['children']:
                    bm_tree_walk(child, template)

        for root in bookmarks['roots'].values():
            bm_tree_walk(root, out_template[output_format])


# Only run the importer when this file is executed as a script.
if __name__ == '__main__':
    main()