diff options
author | Jordan <me@jordan.im> | 2020-06-15 17:50:03 -0700 |
---|---|---|
committer | Jordan <me@jordan.im> | 2020-06-15 17:50:03 -0700 |
commit | 7239112a3e9d65dcb370e88b1d754cb77a7d9995 (patch) | |
tree | 3fda150da618ebd9e5cca9d56f3428e6170c5d6f | |
download | springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.tar.gz springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.zip |
initial commit
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | README | 18 | ||||
-rw-r--r-- | lib/util.py | 42 | ||||
-rwxr-xr-x | springer_dl.py | 68 |
4 files changed, 131 insertions, 0 deletions
# ---------------------------------------------------------------------------
# .gitignore (new file)
# ---------------------------------------------------------------------------
# *.swp
# *.swo
# __pycache__

# ---------------------------------------------------------------------------
# README (new file)
# ---------------------------------------------------------------------------
# springer-dl: download the set of books Springer released for free during the
# 2020 COVID-19 outbreak
#
# - bypasses Google captcha
# - supports PDF and EPUB book formats
# - sorts by title, includes ISBN in filename
# - no external dependencies
#
# usage: springer_dl.py [-h] --path PATH
#
# optional arguments:
#   -h, --help   show this help message and exit
#   --path PATH  path to download books
#
# $ ls -R books/
# 'books/Understanding Statistics Using R':
# 'Understanding Statistics Using R - 978-1-4614-6227-9.epub'
# 'Understanding Statistics Using R - 978-1-4614-6227-9.pdf'

# ---------------------------------------------------------------------------
# lib/util.py (new file, mode 100644)
# ---------------------------------------------------------------------------
import os
import re
import urllib.request
from html.parser import HTMLParser


class Parser(HTMLParser):
    """Collect anchor targets and <title> text while parsing HTML.

    Attributes:
        links: ``href`` values of the ``<a>`` tags seen so far, in document
            order. Anchors without an ``href`` are not recorded.
        title: text chunks encountered while a ``<title>`` tag is open.
        current_tag: name of the most recently opened tag that has not been
            followed by an end tag yet, or ``None``.
    """

    def __init__(self, links=None):
        """Create a parser; *links*, when given, is the list that is filled."""
        super().__init__()
        # A caller-supplied list is appended to in place so several parsers
        # can accumulate into one shared list.
        self.links = [] if links is None else links
        self.title = []
        self.current_tag = None

    def handle_starttag(self, tag, attrs):
        """Remember the opened tag and record the href of anchors."""
        self.current_tag = tag
        if tag == 'a':
            href = dict(attrs).get('href')
            # Skip <a name=...> style anchors: storing None here makes every
            # later string operation on the collected links blow up.
            if href is not None:
                self.links.append(href)

    def handle_endtag(self, tag):
        """Forget the open tag so trailing text is not attributed to it."""
        self.current_tag = None

    def handle_data(self, data):
        """Store text that appears directly inside a <title> tag."""
        if self.current_tag == 'title':
            self.title.append(data)


def request(url, headers):
    """Open *url* with the given request *headers* and return the response.

    The caller owns the returned response object and should close it (it can
    be used as a context manager).
    """
    conn = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(conn)


def download_file(url, headers, dest):
    """Stream the body of *url* into the file *dest*.

    The data is first written to ``dest + '.part'`` and only renamed to
    *dest* once the transfer finished, so an interrupted or failed download
    never leaves a truncated file under the final name.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise; the
        temporary ``.part`` file is removed before the exception propagates.
    """
    BLOCK = 16 * 1024
    partial = os.fspath(dest) + '.part'
    try:
        with request(url, headers) as resp, open(partial, 'wb') as f:
            while True:
                chunk = resp.read(BLOCK)
                if not chunk:
                    break
                f.write(chunk)
        os.replace(partial, dest)
    except BaseException:
        # BaseException on purpose: Ctrl-C must clean up as well.
        try:
            os.remove(partial)
        except OSError:
            # Nothing was written yet (or it is already gone).
            pass
        raise


# ---------------------------------------------------------------------------
# springer_dl.py (new file, mode 100755)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
import sys
import urllib.parse

HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like '
           'Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
           'CriOS/81.0.4044.124 Mobile/15E148 Safari/604.1'}
BOOK_PAGE = "https://hnarayanan.github.io/springer-books/"
OPENURL_PREFIX = 'http://link.springer.com/openurl'


def find_books(links, prefix=OPENURL_PREFIX):
    """Return ``(url, isbn)`` pairs for the links that start with *prefix*.

    The ISBN is the text following ``isbn=`` in the URL. Empty entries and
    links without an ``isbn=`` part are skipped instead of raising.
    """
    books = []
    for link in links:
        if not link or not link.startswith(prefix):
            continue
        parts = link.split('isbn=')
        if len(parts) > 1:
            books.append((link, parts[1]))
    return books


def select_downloads(hrefs, base_url):
    """Pick the download URLs for one book page.

    Keeps the hrefs containing ``content/pdf`` or ``download/epub``, resolves
    them against *base_url* and appends ``?javascript-disabled=true``. The
    first two are returned when the second one mentions ``epub``; otherwise
    only the first is returned. A page with no matching link yields ``[]``.
    """
    matches = [
        urllib.parse.urljoin(base_url, href) + '?javascript-disabled=true'
        for href in hrefs
        if href and ('content/pdf' in href or 'download/epub' in href)
    ]
    # The length check keeps pages with a single link from raising IndexError.
    if len(matches) > 1 and 'epub' in matches[1]:
        return matches[:2]
    return matches[:1]


def clean_title(title_chunks):
    """Derive a directory-safe book title from the parsed <title> chunks.

    Uses the first chunk up to the first ``' |'``, strips surrounding
    whitespace and replaces path separators with ``-`` so the title can be
    used as a single path component. Returns ``None`` when no usable title
    is available.
    """
    if not title_chunks:
        return None
    title = title_chunks[0].split(' |')[0].strip()
    for sep in filter(None, {'/', os.sep, os.altsep}):
        title = title.replace(sep, '-')
    return title or None


def target_filename(book_title, isbn, link):
    """Build ``'<title> - <isbn><ext>'`` with the extension of *link*'s path."""
    extension = os.path.splitext(urllib.parse.urlparse(link).path)[-1]
    return '%s - %s' % (book_title, isbn) + extension


def main():
    """Parse the command line and download every listed book into --path."""
    # Imported here rather than at module level so the pure helpers above can
    # be imported without the ``lib`` package being importable.
    from lib.util import Parser, request, download_file

    desc = 'springer-dl: download the set of books Springer released for free '\
           'during the 2020 COVID-19 outbreak'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--path', dest='path', type=str,
                        help='path to download books', required=True)
    args = parser.parse_args()

    dl_path = args.path
    os.makedirs(dl_path, exist_ok=True)

    def fetch(url):
        """Return ``(final_url, parsed_page)`` for *url*, closing the response."""
        with request(url, HEADERS) as resp:
            page = Parser()
            page.feed(resp.read().decode('utf-8'))
            return resp.url, page

    # A failure to load the index is fatal: there is nothing to download.
    _, index = fetch(BOOK_PAGE)

    for url, isbn in find_books(index.links):
        try:
            end_url, page = fetch(url)
        except Exception as err:
            # One unreachable or undecodable book page must not end the run.
            print('[x] error fetching %s, skipping...' % url)
            print('    %s' % err, file=sys.stderr)
            continue

        book_title = clean_title(page.title)
        links = select_downloads(page.links, end_url)
        if book_title is None or not links:
            print('[x] no title or download link found at %s, skipping...'
                  % end_url)
            continue

        book_path = os.path.join(dl_path, book_title)
        os.makedirs(book_path, exist_ok=True)

        for link in links:
            filename = target_filename(book_title, isbn, link)
            filepath = os.path.join(book_path, filename)
            if os.path.exists(filepath):
                continue
            try:
                print('[+] %s' % filename)
                download_file(link, HEADERS, filepath)
            except KeyboardInterrupt:
                if os.path.exists(filepath):
                    os.remove(filepath)
                sys.exit()
            except Exception as err:
                print('[x] error downloading %s, skipping...' % filename)
                print('    %s' % err, file=sys.stderr)


if __name__ == '__main__':
    main()