initial commit

author: Jordan <me@jordan.im> 2020-06-15 17:50:03 -0700
committer: Jordan <me@jordan.im> 2020-06-15 17:50:03 -0700
commit: 7239112a3e9d65dcb370e88b1d754cb77a7d9995 (patch)
tree: 3fda150da618ebd9e5cca9d56f3428e6170c5d6f
download: springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.tar.gz
springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.zip
4 files changed, 131 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6d669bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.swp
+*.swo
+__pycache__
diff --git a/README b/README
new file mode 100644
index 0000000..5beed0e
--- /dev/null
+++ b/README
@@ -0,0 +1,18 @@
+springer-dl: download the set of books Springer released for free during the
+2020 COVID-19 outbreak
+
+- bypasses Google captcha
+- supports PDF and EPUB book formats
+- sorts by title, includes ISBN in filename
+- no external dependencies
+
+usage: springer_dl.py [-h] --path PATH
+
+optional arguments:
+  -h, --help   show this help message and exit
+  --path PATH  path to download books
+
+$ ls -R books/
+'books/Understanding Statistics Using R':
+'Understanding Statistics Using R - 978-1-4614-6227-9.epub'
+'Understanding Statistics Using R - 978-1-4614-6227-9.pdf'
diff --git a/lib/util.py b/lib/util.py
new file mode 100644
index 0000000..1bb9811
--- /dev/null
+++ b/lib/util.py
@@ -0,0 +1,42 @@
+import re
+import urllib.request
+from html.parser import HTMLParser
+
+class Parser(HTMLParser):
+    """Collect anchor hrefs and <title> text from an HTML document.
+
+    After ``feed()`` has been called:
+
+    links -- the href value of every <a> tag that carried one, in the
+             order the tags were encountered
+    title -- the text chunks seen while a <title> element was open
+    """
+
+    def __init__(self, links=None):
+        """Create a parser.
+
+        links -- optional existing list to append hrefs to; it is used
+                 as-is (not copied). A fresh list is created when omitted.
+        """
+        super().__init__()
+        self.links = [] if links is None else links
+        self.title = []
+        self.current_tag = None
+
+    def handle_starttag(self, tag, attrs):
+        """Remember the tag just opened and record the href of anchors."""
+        self.current_tag = tag
+        if tag == 'a':
+            href = dict(attrs).get('href')
+            # An anchor with no href (or a valueless one) yields None;
+            # storing it would break callers that treat links as strings.
+            if href is not None:
+                self.links.append(href)
+
+    def handle_endtag(self, tag):
+        """Forget the current tag so that text following a closing tag
+        (e.g. whitespace after </title>) is not attributed to it."""
+        self.current_tag = None
+
+    def handle_data(self, data):
+        """Collect text that appears while <title> is the current tag."""
+        if self.current_tag == 'title':
+            self.title.append(data)
+
+def request(url, headers):
+    """Open ``url`` with the given request headers and return the response.
+
+    url     -- address to fetch
+    headers -- dict of header names to values sent with the request
+
+    The response object is returned still open; reading and closing it is
+    left to the caller.
+    """
+    return urllib.request.urlopen(urllib.request.Request(url, headers=headers))
+
+def download_file(url, headers, dest):
+    """Download ``url`` and store the response body at ``dest``.
+
+    url     -- address to fetch
+    headers -- dict of header names to values sent with the request
+    dest    -- path of the file to create (an existing file is replaced)
+
+    The body is streamed in 16 KiB blocks into ``dest + '.part'`` and only
+    renamed to ``dest`` once the whole body has been written, so ``dest``
+    never holds a truncated download. On any failure, including
+    KeyboardInterrupt, the partial file is removed and the exception is
+    re-raised to the caller.
+    """
+    import os  # local import: not among this module's top-level imports
+
+    BLOCK = 16 * 1024
+    conn = urllib.request.Request(
+        url,
+        headers=headers
+    )
+    partial = os.fspath(dest) + '.part'
+    try:
+        # The context managers close both the response and the output file
+        # even when the transfer fails part-way through.
+        with urllib.request.urlopen(conn) as resp, open(partial, 'wb') as f:
+            while True:
+                chunk = resp.read(BLOCK)
+                if not chunk:
+                    break
+                f.write(chunk)
+        os.replace(partial, dest)
+    except BaseException:
+        # BaseException so that Ctrl-C does not leave a stale partial behind.
+        if os.path.exists(partial):
+            os.remove(partial)
+        raise
diff --git a/springer_dl.py b/springer_dl.py
new file mode 100755
index 0000000..b214250
--- /dev/null
+++ b/springer_dl.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import urllib.parse
+from lib.util import Parser, request, download_file
+
+def _safe_name(name):
+    """Return ``name`` with path separators replaced so it is usable as a
+    single file or directory name."""
+    return name.replace('/', '-').replace(os.sep, '-')
+
+
+def _discard(path):
+    """Delete ``path`` if it exists (used to drop incomplete downloads)."""
+    if os.path.exists(path):
+        os.remove(path)
+
+
+def main():
+    """Download every book linked from the index page into ``--path``.
+
+    For each book a sub-directory named after the book title is created
+    and the PDF (plus the EPUB, when offered) is saved there as
+    ``<title> - <isbn>.<ext>``. Files that already exist are not fetched
+    again; books or files that fail are reported and skipped.
+    """
+    desc = 'springer-dl: download the set of books Springer released for free '\
+           'during the 2020 COVID-19 outbreak'
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('--path', dest='path', type=str,
+                        help='path to download books', required=True)
+    args = parser.parse_args()
+
+    dl_path = args.path
+    os.makedirs(dl_path, exist_ok=True)
+
+    HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like '\
+               'Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '\
+               'CriOS/81.0.4044.124 Mobile/15E148 Safari/604.1'}
+    BOOK_PAGE = "https://hnarayanan.github.io/springer-books/"
+
+    # The index page is required; a failure here is left to propagate.
+    with request(BOOK_PAGE, headers=HEADERS) as r:
+        html = r.read().decode('utf-8')
+
+    p = Parser()
+    p.feed(html)
+
+    prefix = 'http://link.springer.com/openurl'
+    # Skip anchors without an href (None) and links lacking an isbn=
+    # parameter; either would otherwise raise while building the list.
+    books = [(x, x.split('isbn=')[1]) for x in p.links
+             if x and x.startswith(prefix) and 'isbn=' in x]
+
+    for url, isbn in books:
+        try:
+            with request(url, HEADERS) as r:
+                # Final URL after redirects; relative links resolve against it.
+                end_url = r.url
+                html = r.read().decode('utf-8')
+        except OSError:
+            # urllib's URLError/HTTPError derive from OSError. One
+            # unreachable book should not abort the remaining downloads.
+            print('[x] error fetching %s, skipping...' % url)
+            continue
+
+        p = Parser()
+        p.feed(html)
+
+        links = [x for x in p.links
+                 if x and ('content/pdf' in x or 'download/epub' in x)]
+        links = [urllib.parse.urljoin(end_url, x) + '?javascript-disabled=true'
+                 for x in links]
+
+        if not links or not p.title:
+            print('[x] no downloads found at %s, skipping...' % url)
+            continue
+
+        book_title = _safe_name(p.title[0].split(' |')[0])
+        book_path = os.path.join(dl_path, book_title)
+        os.makedirs(book_path, exist_ok=True)
+
+        # Keep the first matching link, plus the second one when it is an
+        # EPUB (presumably later matches repeat the same downloads).
+        if len(links) > 1 and 'epub' in links[1]:
+            links = links[:2]
+        else:
+            links = links[:1]
+
+        for link in links:
+            filename = '%s - %s' % (book_title, isbn) + os.path.splitext(
+                urllib.parse.urlparse(link).path)[-1]
+            filepath = os.path.join(book_path, filename)
+            if os.path.exists(filepath):
+                continue
+            try:
+                print('[+] %s' % filename)
+                download_file(link, HEADERS, filepath)
+            except KeyboardInterrupt:
+                _discard(filepath)
+                sys.exit()
+            except Exception:
+                # Drop any partial file so a later run retries it instead of
+                # treating it as already downloaded.
+                _discard(filepath)
+                print('[x] error downloading %s, skipping...' % filename)
+
+
+if __name__ == '__main__':
+    main()
author	Jordan <me@jordan.im>	2020-06-15 17:50:03 -0700
committer	Jordan <me@jordan.im>	2020-06-15 17:50:03 -0700
commit	7239112a3e9d65dcb370e88b1d754cb77a7d9995 (patch)
tree	3fda150da618ebd9e5cca9d56f3428e6170c5d6f
download	springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.tar.gz springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.zip