#!/usr/bin/env python3
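"""Download the books Springer released for free during the 2020 COVID-19
outbreak, saving each title as a PDF and, where available, an EPUB."""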

import argparse
import os
import sys
import urllib.parse
from lib.util import Parser, request, download_file

if __name__ == '__main__':
    desc = 'springer-dl: download the set of books Springer released for free '\
           'during the 2020 COVID-19 outbreak'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--path', dest='path', type=str,
                        help='path to download books', required=True)
    args = parser.parse_args()

    dl_path = args.path
    os.makedirs(dl_path, exist_ok=True)

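    # Identify as a mobile browser; the index page below lists the freely
    # released titles and links each one through link.springer.com/openurl.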
    HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like '\
               'Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '\
               'CriOS/81.0.4044.124 Mobile/15E148 Safari/604.1'}
    BOOK_PAGE = "https://hnarayanan.github.io/springer-books/"

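    # Fetch the index page and parse out every link it contains.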
    r = request(BOOK_PAGE, headers=HEADERS)
    html = r.read().decode('utf-8')

    p = Parser()
    p.feed(html)

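    # Keep only the openurl entries; each carries the book's ISBN as a
    # query parameter.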
    prefix = 'http://link.springer.com/openurl'
    books = [(x, x.split('isbn=')[1]) for x in p.links if x.startswith(prefix)]

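    # Resolve each openurl to the book's landing page and scrape it for the
    # direct download links.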
    for url, isbn in books:
        r = request(url, headers=HEADERS)
        end_url = r.url

        html = r.read().decode('utf-8')
        p = Parser()
        p.feed(html)

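        # Collect the PDF/EPUB download links and make them absolute against
        # the resolved landing-page URL.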
        links = [x for x in p.links if 'content/pdf' in x or 'download/epub' in x]
        links = [urllib.parse.urljoin(end_url, x) + '?javascript-disabled=true' for x in links]

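        # The page title is split at ' |' to recover the bare book title,
        # which names a per-book directory.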
        book_title = p.title[0].split(' |')[0]
        book_path = os.path.join(dl_path, book_title)
        os.makedirs(book_path, exist_ok=True)

        # Keep the PDF link, plus the EPUB link when one is present.
        if len(links) > 1 and 'epub' in links[1]:
            links = links[:2]
        else:
            links = links[:1]

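        # Save as '<title> - <ISBN>.<ext>', skipping files that already exist.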
        for link in links:
            filename = '%s - %s' % (book_title, isbn) + os.path.splitext(
                urllib.parse.urlparse(link).path)[-1]
            filepath = os.path.join(book_path, filename)
            if os.path.exists(filepath):
                continue
            try:
                print('[+] %s' % filename)
                download_file(link, HEADERS, filepath)
            except KeyboardInterrupt:
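                # On Ctrl-C, remove the partial download before exiting.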
                if os.path.exists(filepath):
                    os.remove(filepath)
                sys.exit()
            except Exception:
                print('[x] error downloading %s, skipping...' % filename)