aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2020-06-15 17:50:03 -0700
committerJordan <me@jordan.im>2020-06-15 17:50:03 -0700
commit7239112a3e9d65dcb370e88b1d754cb77a7d9995 (patch)
tree3fda150da618ebd9e5cca9d56f3428e6170c5d6f
downloadspringer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.tar.gz
springer-dl-7239112a3e9d65dcb370e88b1d754cb77a7d9995.zip
initial commit
-rw-r--r--.gitignore3
-rw-r--r--README18
-rw-r--r--lib/util.py42
-rwxr-xr-xspringer_dl.py68
4 files changed, 131 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6d669bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.swp
+*.swo
+__pycache__
diff --git a/README b/README
new file mode 100644
index 0000000..5beed0e
--- /dev/null
+++ b/README
@@ -0,0 +1,18 @@
+springer-dl: download the set of books Springer released for free during the
+2020 COVID-19 outbreak
+
+- bypasses Google captcha
+- supports PDF and EPUB book formats
+- sorts by title, includes ISBN in filename
+- no external dependencies
+
+usage: springer_dl.py [-h] --path PATH
+
+optional arguments:
+ -h, --help show this help message and exit
+ --path PATH path to download books
+
+$ ls -R books/
+'books/Understanding Statistics Using R':
+'Understanding Statistics Using R - 978-1-4614-6227-9.epub'
+'Understanding Statistics Using R - 978-1-4614-6227-9.pdf'
diff --git a/lib/util.py b/lib/util.py
new file mode 100644
index 0000000..1bb9811
--- /dev/null
+++ b/lib/util.py
@@ -0,0 +1,42 @@
+import re
+import urllib.request
+from html.parser import HTMLParser
+
class Parser(HTMLParser):
    """Minimal HTML parser collecting anchor hrefs and the page title.

    Attributes:
        links: href attribute values of every <a> tag, in document order
        title: text chunks seen while inside the <title> tag
    """

    def __init__(self, links=None):
        HTMLParser.__init__(self)
        # create a fresh list per instance; never share a mutable default
        if links is None:
            self.links = []
        else:
            self.links = links
        self.title = []
        self.current_tag = None

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        if tag == 'a':
            href = dict(attrs).get('href')
            # bug fix: an <a> without href previously appended None,
            # which crashes later string operations (e.g. .startswith)
            if href is not None:
                self.links.append(href)

    def handle_data(self, data):
        # NOTE: current_tag is only updated on start tags, so this also
        # captures data immediately after </title> until the next tag
        if self.current_tag == 'title':
            self.title.append(data)
+
def request(url, headers):
    """Fetch *url* with the given HTTP request headers.

    Returns the open response object; the caller is responsible for
    reading and closing it.
    """
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req)
+
def download_file(url, headers, dest):
    """Stream *url* into the file at *dest* using the given headers.

    Reads in fixed-size chunks so large books are never held fully in
    memory.
    """
    BLOCK = 16 * 1024  # 16 KiB per read
    req = urllib.request.Request(url, headers=headers)
    # bug fix: close the HTTP response when done — the original never
    # closed it, leaking the connection on every download
    with urllib.request.urlopen(req) as resp:
        with open(dest, 'wb') as f:
            while True:
                chunk = resp.read(BLOCK)
                if not chunk:
                    break
                f.write(chunk)
diff --git a/springer_dl.py b/springer_dl.py
new file mode 100755
index 0000000..b214250
--- /dev/null
+++ b/springer_dl.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import urllib.parse
+from lib.util import Parser, request, download_file
+
if __name__ == '__main__':
    desc = 'springer-dl: download the set of books Springer released for free '\
           'during the 2020 COVID-19 outbreak'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--path', dest='path', type=str,
                        help='path to download books', required=True)
    args = parser.parse_args()

    dl_path = args.path
    os.makedirs(dl_path, exist_ok=True)

    # mobile browser user-agent; presumably avoids the captcha/UA block
    # a default urllib agent would hit — TODO confirm
    HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like '\
               'Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '\
               'CriOS/81.0.4044.124 Mobile/15E148 Safari/604.1'}
    BOOK_PAGE = "https://hnarayanan.github.io/springer-books/"

    # fetch the index page that lists every free book as an openurl link
    r = request(BOOK_PAGE, headers=HEADERS)
    html = r.read().decode('utf-8')

    p = Parser()
    p.feed(html)

    # each book link carries its ISBN in the query string; guard against
    # hrefs that are None or lack an isbn parameter (bug fix: the
    # original unguarded split/startswith could raise on such links)
    prefix = 'http://link.springer.com/openurl'
    books = [(x, x.split('isbn=')[1]) for x in p.links
             if x and x.startswith(prefix) and 'isbn=' in x]

    for url, isbn in books:
        # follow the openurl redirect to the book's landing page
        r = request(url, HEADERS)
        end_url = r.url

        html = r.read().decode('utf-8')
        p = Parser()
        p.feed(html)

        # collect PDF/EPUB links, made absolute against the final
        # (post-redirect) URL
        links = [x for x in p.links
                 if 'content/pdf' in x or 'download/epub' in x]
        links = [urllib.parse.urljoin(end_url, x) + '?javascript-disabled=true'
                 for x in links]
        if not links:
            # bug fix: an empty list previously raised IndexError below
            print('[x] no download links for %s, skipping...' % url)
            continue

        # page titles look like "Book Title | SpringerLink"
        book_title = p.title[0].split(' |')[0]
        book_path = os.path.join(dl_path, book_title)
        os.makedirs(book_path, exist_ok=True)

        # keep the first link (PDF) plus the EPUB when one follows;
        # bug fix: guard the links[1] access for single-format books
        if len(links) > 1 and 'epub' in links[1]:
            links = links[:2]
        else:
            links = links[:1]

        for link in links:
            filename = '%s - %s' % (book_title, isbn) + os.path.splitext(
                urllib.parse.urlparse(link).path)[-1]
            filepath = os.path.join(book_path, filename)
            if os.path.exists(filepath):
                # already downloaded on an earlier run
                continue
            try:
                print('[+] %s' % filename)
                download_file(link, HEADERS, filepath)
            except KeyboardInterrupt:
                # drop the partial file before exiting
                if os.path.exists(filepath):
                    os.remove(filepath)
                sys.exit()
            except Exception:
                # bug fix: was a bare except, which also swallowed
                # SystemExit; catch only ordinary errors and keep going
                print('[x] error downloading %s, skipping...' % filename)
                # bug fix: remove the truncated file, otherwise the
                # exists-check above skips it forever on re-runs
                if os.path.exists(filepath):
                    os.remove(filepath)