aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2020-07-11 22:45:16 -0700
committerJordan <me@jordan.im>2020-07-11 22:45:16 -0700
commit3ea7cee3bc97bf34e1c717ef0969914de58f228e (patch)
tree308e1250e68401ab2d95f270d2adb36a67f7957a
parent79cbc45cd2c83db5fc30686ffde93705ed298754 (diff)
downloadroka-3ea7cee3bc97bf34e1c717ef0969914de58f228e.tar.gz
roka-3ea7cee3bc97bf34e1c717ef0969914de58f228e.zip
classify rebuild.py, optimize logic
-rwxr-xr-xrebuild.py369
1 files changed, 192 insertions, 177 deletions
diff --git a/rebuild.py b/rebuild.py
index 3c0ebd9..756a2ae 100755
--- a/rebuild.py
+++ b/rebuild.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
+import datetime
import hashlib
import json
import math
@@ -8,185 +9,199 @@ from datetime import timedelta
from flask import Flask
from lib.tinytag import TinyTag
-def get_books(root_path, cache=dict()):
- '''
- Discover audiobooks under :root_path: and populate books object
-
- :cache: existing JSON cache, used to determine which content is new
- (existing content is not re-hashed)
- '''
- if not os.path.exists(root_path):
- raise ValueError('root path does not exist: %s' % root_path)
-
- # '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
- existing_books = {}
- for k, _ in cache.items():
- path = cache[k]['path']
- if os.path.exists(path):
- existing_books[path] = k
-
- book_dirs = list()
- for root, dirs, _ in os.walk(root_path):
- for d in dirs:
- book_dirs.append(os.path.join(root, d))
-
- books = dict()
- for book_path in book_dirs:
- # if already cached, populate books with existing k/v
- if book_path in existing_books:
- _hash = existing_books[book_path]
- books[_hash] = cache[_hash]
- continue
- book = is_book(book_path)
- if book:
- books[book[0]] = book[1]
-
- return books
-
-def is_book(book_path):
- '''
- Determine if :book_path: contains (supported) audio files
-
- Returns False (not a book) or populated book dict
- '''
- ext = ['mp3'] # m4b seems to be unsupported by Apple
-
- # book attributes to be populated
- book = {
- 'author': None,
- 'duration': 0,
- 'duration_str': None,
- 'files': dict(),
- 'path': book_path,
- 'size_bytes': 0,
- 'size_str': None,
- 'title': None
- }
-
- # hash of each file in directory w/ track extension
- folder_hash = hashlib.md5()
-
- # a book_path is only a book if it contains at least one track
- is_book = False
-
- for f in os.listdir(book_path):
- file_path = os.path.join(book_path, f)
-
- # is a file and has a supported extension
- if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
- continue
-
- # track duration is required
- tag = TinyTag.get(file_path)
- if not tag.duration:
- continue
-
- # previous conditions met, we're a book! :D
- is_book = True
- print('[+] processing: %s' % f)
-
- # update collective hash of folder with MD5 of current file
- BLOCK = 1024
- file_hash = hashlib.md5()
- with open(file_path, 'rb') as f:
- while True:
- data = f.read(BLOCK)
- if not data:
- break
- folder_hash.update(data)
- file_hash.update(data)
-
- # 1 day, 10:59:58
- duration_str = str(timedelta(seconds=tag.duration))
-
- # per-file atributes, some values are populated conditionally
- track = {
- 'album': validate(tag.album, os.path.split(book_path)[1]),
- 'author': validate(tag.artist, 'Unknown'),
- 'duration': tag.duration,
- 'duration_str': duration_str.split('.')[0],
- 'filename': os.path.split(file_path)[1],
- 'path': file_path,
- 'size_bytes': tag.filesize,
- 'title': validate(tag.title, os.path.split(file_path)[1]),
- 'track': tag.track
- }
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
+
+# use Flask's config parser, configparser would be hacky
+APP = Flask(__name__)
+APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+
+class Books:
+ def __init__(self):
+ if os.path.exists(JSON_PATH):
+ self._cache = self._read_cache()
+ else:
+ self._cache = {}
+
+ self.books = self._get_books()
+ self._write_cache()
+
+ def _get_dirs(self, path):
+ '''
+ Return list of directories recursively discovered in :path:
+ '''
+ ret = list()
+ for root, dirs, _ in os.walk(path):
+ for d in dirs:
+ ret.append(os.path.join(root, d))
+
+ return ret
+
+ def _get_path_hash_dict(self):
+ '''
+ Return dict of book paths and their hash from cache, used to check paths
+ against existing cache
+
+ '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
+ '''
+ ret = {}
+ for k, _ in self._cache.items():
+ path = self._cache[k]['path']
+ if os.path.exists(path):
+ ret[path] = k
+
+ return ret
+
+ def _write_cache(self):
+ '''
+ Dump contents of self.books to JSON_PATH, creating CACHE_PATH if needed
+ '''
+ if not os.path.exists(CACHE_PATH):
+ os.mkdir(CACHE_PATH)
+ with open(JSON_PATH, 'w') as cache:
+ json.dump(self.books, cache, indent=4)
+
+ def _read_cache(self):
+ '''
+ Return dict of existing cache
+ '''
+ with open(JSON_PATH, 'r') as cache:
+ data = json.load(cache)
+
+ return data
+
+ def _validate(self, v, b):
+ '''
+ Returns :v: if :v: is non-empty and not all whitespace, otherwise :b:
+ '''
+ if v and not v.isspace():
+ return v
- # we assume author and album attributes are unchanged between tracks
- book['author'] = track['author']
- book['title'] = track['album']
-
- # increment book total size/duration
- book['duration'] += tag.duration
- book['size_bytes'] += tag.filesize
-
- # hexdigest: track dict
- book['files'][file_hash.hexdigest()] = track
-
- # if we're a book, store formatted book size and duration
- if is_book:
- folder_hash = folder_hash.hexdigest()
- total_size = book['size_bytes']
- try:
- _i = int(math.floor(math.log(total_size, 1024)))
- _p = math.pow(1024, _i)
- _s = round(total_size / _p, 2)
- except:
- _i = 1
- _s = 0
-
- # e.g. 1.48 GB
- SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
- book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
-
- # e.g. 2 days, 5:47:47
- duration_str = str(timedelta(seconds=book['duration']))
- book['duration_str'] = duration_str.split('.')[0]
- return (folder_hash, book)
-
- return False
-
-def write_cache(books, json_path):
- '''
- Dump contents of :books: to :json_path:
- '''
- cache_path = os.path.dirname(json_path)
- if not os.path.exists(cache_path):
- os.mkdir(cache_path)
- with open(json_path, 'w') as f:
- json.dump(books, f, indent=4)
-
-def read_cache(json_path):
- '''
- Return dict of existing cache
- '''
- with open(json_path, 'r') as cache:
- books = json.load(cache)
-
- return books
-
-def validate(v, b):
- '''
- Returns :v: if :v: and v.isspace(), otherwise :b:
- '''
- if v and not v.isspace():
- return v
- else:
return b
-if __name__ == '__main__':
- ABS_PATH = os.path.dirname(os.path.abspath(__file__))
- CACHE_PATH = os.path.join(ABS_PATH, 'cache')
- JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
-
- # use Flask's config parser, configparser would be hacky
- APP = Flask(__name__)
- APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+ def _log(self, msg):
+ '''
+ Prints :msg: with formatted ISO-8601 date
+ '''
+ now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+ print('%s %s' % (now, msg))
+
+ def _get_books(self):
+ '''
+ Discover audiobooks under the configured ROOT_PATH and populate books object
+
+ The existing JSON cache is used to determine which content is new
+ (existing content is not re-hashed)
+ '''
+ ex = self._get_path_hash_dict()
+ dirs = self._get_dirs(APP.config['ROOT_PATH'])
+
+ books = dict()
+ for path in dirs:
+ if path in ex:
+ _hash = ex[path]
+ books[_hash] = self._cache[_hash]
+ continue
+ book = self._check_dir(path)
+ if book:
+ books[book[0]] = book[1]
+
+ return books
+
+ def _check_dir(self, path):
+ '''
+ Determine if :path: contains (supported) audio files; return populated
+ book dict or None
+ '''
+ ext = ['mp3'] # m4b seems to be unsupported by Apple
+ is_book = False
+
+ # book attributes to be populated
+ book = {
+ 'author': None,
+ 'duration': 0,
+ 'duration_str': None,
+ 'files': dict(),
+ 'path': path,
+ 'size_bytes': 0,
+ 'size_str': None,
+ 'title': None
+ }
- if os.path.exists(JSON_PATH):
- cache = read_cache(JSON_PATH)
- BOOKS = get_books(APP.config['ROOT_PATH'], cache)
- else:
- BOOKS = get_books(APP.config['ROOT_PATH'])
+ # hash of each file in directory w/ track extension
+ folder_hash = hashlib.md5()
+
+ for f in os.listdir(path):
+ file_path = os.path.join(path, f)
+ if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
+ continue
+
+ tag = TinyTag.get(file_path)
+ if not tag.duration:
+ continue
+
+ is_book = True
+ self._log(f)
+
+ file_hash = hashlib.md5()
+ with open(file_path, 'rb') as f:
+ while True:
+ data = f.read(1024)
+ if not data:
+ break
+ folder_hash.update(data)
+ file_hash.update(data)
+
+ # 1 day, 10:59:58
+ duration_str = str(timedelta(seconds=tag.duration))
+
+ # per-file attributes, some values are populated conditionally
+ track = {
+ 'album': self._validate(tag.album, os.path.split(path)[1]),
+ 'author': self._validate(tag.artist, 'Unknown'),
+ 'duration': tag.duration,
+ 'duration_str': duration_str.split('.')[0],
+ 'filename': os.path.split(file_path)[1],
+ 'path': file_path,
+ 'size_bytes': tag.filesize,
+ 'title': self._validate(tag.title, os.path.split(file_path)[1]),
+ 'track': tag.track
+ }
+
+ # we assume author and album attributes are unchanged between tracks
+ book['author'] = track['author']
+ book['title'] = track['album']
+
+ # increment book total size/duration
+ book['duration'] += tag.duration
+ book['size_bytes'] += tag.filesize
+
+ # hexdigest: track dict
+ book['files'][file_hash.hexdigest()] = track
+
+ if is_book:
+ folder_hash = folder_hash.hexdigest()
+ total_size = book['size_bytes']
+
+ try:
+ _i = int(math.floor(math.log(total_size, 1024)))
+ _p = math.pow(1024, _i)
+ _s = round(total_size / _p, 2)
+ except:
+ _i = 1
+ _s = 0
+
+ # e.g. 1.48 GB
+ SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
+ book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
+
+ # e.g. 2 days, 5:47:47
+ duration_str = str(timedelta(seconds=book['duration']))
+ book['duration_str'] = duration_str.split('.')[0]
+ return (folder_hash, book)
+
+ return None
- write_cache(BOOKS, JSON_PATH)
+if __name__ == '__main__':
+ books = Books()