From 3ea7cee3bc97bf34e1c717ef0969914de58f228e Mon Sep 17 00:00:00 2001
From: Jordan
Date: Sat, 11 Jul 2020 22:45:16 -0700
Subject: classify rebuild.py, optimize logic

---
 rebuild.py | 369 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 192 insertions(+), 177 deletions(-)

diff --git a/rebuild.py b/rebuild.py
index 3c0ebd9..756a2ae 100755
--- a/rebuild.py
+++ b/rebuild.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import datetime
 import hashlib
 import json
 import math
@@ -8,185 +9,199 @@ from datetime import timedelta
 from flask import Flask
 from lib.tinytag import TinyTag
 
-def get_books(root_path, cache=dict()):
-    '''
-    Discover audiobooks under :root_path: and populate books object
-
-    :cache: existing JSON cache, used to determine which content is new
-    (existing content is not re-hashed)
-    '''
-    if not os.path.exists(root_path):
-        raise ValueError('root path does not exist: %s' % root_path)
-
-    # '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
-    existing_books = {}
-    for k, _ in cache.items():
-        path = cache[k]['path']
-        if os.path.exists(path):
-            existing_books[path] = k
-
-    book_dirs = list()
-    for root, dirs, _ in os.walk(root_path):
-        for d in dirs:
-            book_dirs.append(os.path.join(root, d))
-
-    books = dict()
-    for book_path in book_dirs:
-        # if already cached, populate books with existing k/v
-        if book_path in existing_books:
-            _hash = existing_books[book_path]
-            books[_hash] = cache[_hash]
-            continue
-        book = is_book(book_path)
-        if book:
-            books[book[0]] = book[1]
-
-    return books
-
-def is_book(book_path):
-    '''
-    Determine if :book_path: contains (supported) audio files
-
-    Returns False (not a book) or populated book dict
-    '''
-    ext = ['mp3'] # m4b seems to be unsupported by Apple
-
-    # book attributes to be populated
-    book = {
-        'author': None,
-        'duration': 0,
-        'duration_str': None,
-        'files': dict(),
-        'path': book_path,
-        'size_bytes': 0,
-        'size_str': None,
-        'title': None
-    }
-
-    # hash of each file in directory w/ track extension
-    folder_hash = hashlib.md5()
-
-    # a book_path is only a book if it contains at least one track
-    is_book = False
-
-    for f in os.listdir(book_path):
-        file_path = os.path.join(book_path, f)
-
-        # is a file and has a supported extension
-        if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
-            continue
-
-        # track duration is required
-        tag = TinyTag.get(file_path)
-        if not tag.duration:
-            continue
-
-        # previous conditions met, we're a book! :D
-        is_book = True
-        print('[+] processing: %s' % f)
-
-        # update collective hash of folder with MD5 of current file
-        BLOCK = 1024
-        file_hash = hashlib.md5()
-        with open(file_path, 'rb') as f:
-            while True:
-                data = f.read(BLOCK)
-                if not data:
-                    break
-                folder_hash.update(data)
-                file_hash.update(data)
-
-        # 1 day, 10:59:58
-        duration_str = str(timedelta(seconds=tag.duration))
-
-        # per-file atributes, some values are populated conditionally
-        track = {
-            'album': validate(tag.album, os.path.split(book_path)[1]),
-            'author': validate(tag.artist, 'Unknown'),
-            'duration': tag.duration,
-            'duration_str': duration_str.split('.')[0],
-            'filename': os.path.split(file_path)[1],
-            'path': file_path,
-            'size_bytes': tag.filesize,
-            'title': validate(tag.title, os.path.split(file_path)[1]),
-            'track': tag.track
-        }
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
+
+# use Flask's config parser, configparser would be hacky
+APP = Flask(__name__)
+APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+
+class Books:
+    def __init__(self):
+        if os.path.exists(JSON_PATH):
+            self._cache = self._read_cache()
+        else:
+            self._cache = {}
+
+        self.books = self._get_books()
+        self._write_cache()
+
+    def _get_dirs(self, path):
+        '''
+        Return list of directories recursively discovered in :path:
+        '''
+        ret = list()
+        for root, dirs, _ in os.walk(path):
+            for d in dirs:
+                ret.append(os.path.join(root, d))
+
+        return ret
+
+    def _get_path_hash_dict(self):
+        '''
+        Return dict mapping cached book paths to their hashes, used to check
+        discovered paths against the existing cache, e.g.
+
+        '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
+        '''
+        ret = {}
+        for k, _ in self._cache.items():
+            path = self._cache[k]['path']
+            if os.path.exists(path):
+                ret[path] = k
+
+        return ret
+
+    def _write_cache(self):
+        '''
+        Dump contents of self.books to JSON_PATH
+        '''
+        if not os.path.exists(CACHE_PATH):
+            os.mkdir(CACHE_PATH)
+        with open(JSON_PATH, 'w') as cache:
+            json.dump(self.books, cache, indent=4)
+
+    def _read_cache(self):
+        '''
+        Return dict of existing cache
+        '''
+        with open(JSON_PATH, 'r') as cache:
+            data = json.load(cache)
+
+        return data
+
+    def _validate(self, v, b):
+        '''
+        Returns :v: if :v: is truthy and not only whitespace, otherwise :b:
+        '''
+        if v and not v.isspace():
+            return v
+        else: return b
 
-        # we assume author and album attributes are unchanged between tracks
-        book['author'] = track['author']
-        book['title'] = track['album']
-
-        # increment book total size/duration
-        book['duration'] += tag.duration
-        book['size_bytes'] += tag.filesize
-
-        # hexdigest: track dict
-        book['files'][file_hash.hexdigest()] = track
-
-    # if we're a book, store formatted book size and duration
-    if is_book:
-        folder_hash = folder_hash.hexdigest()
-        total_size = book['size_bytes']
-        try:
-            _i = int(math.floor(math.log(total_size, 1024)))
-            _p = math.pow(1024, _i)
-            _s = round(total_size / _p, 2)
-        except:
-            _i = 1
-            _s = 0
-
-        # e.g. 1.48 GB
-        SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
-        book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
-
-        # e.g. 2 days, 5:47:47
-        duration_str = str(timedelta(seconds=book['duration']))
-        book['duration_str'] = duration_str.split('.')[0]
-        return (folder_hash, book)
-
-    return False
-
-def write_cache(books, json_path):
-    '''
-    Dump contents of :books: to :json_path:
-    '''
-    cache_path = os.path.dirname(json_path)
-    if not os.path.exists(cache_path):
-        os.mkdir(cache_path)
-    with open(json_path, 'w') as f:
-        json.dump(books, f, indent=4)
-
-def read_cache(json_path):
-    '''
-    Return dict of existing cache
-    '''
-    with open(json_path, 'r') as cache:
-        books = json.load(cache)
-
-    return books
-
-def validate(v, b):
-    '''
-    Returns :v: if :v: and v.isspace(), otherwise :b:
-    '''
-    if v and not v.isspace():
-        return v
-    else: return b
 
-if __name__ == '__main__':
-    ABS_PATH = os.path.dirname(os.path.abspath(__file__))
-    CACHE_PATH = os.path.join(ABS_PATH, 'cache')
-    JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
-
-    # use Flask's config parser, configparser would be hacky
-    APP = Flask(__name__)
-    APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+    def _log(self, msg):
+        '''
+        Prints :msg: prefixed with an ISO-8601 timestamp
+        '''
+        now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+        print('%s %s' % (now, msg))
+
+    def _get_books(self):
+        '''
+        Discover audiobooks under APP.config['ROOT_PATH'] and return dict of
+        books; paths already present in the cache are not re-hashed
+        '''
+        ex = self._get_path_hash_dict()
+        dirs = self._get_dirs(APP.config['ROOT_PATH'])
+
+        books = dict()
+        for path in dirs:
+            if path in ex:
+                _hash = ex[path]
+                books[_hash] = self._cache[_hash]
+                continue
+            book = self._check_dir(path)
+            if book:
+                books[book[0]] = book[1]
+
+        return books
+
+    def _check_dir(self, path):
+        '''
+        Determine if :path: contains (supported) audio files; return populated
+        book dict or None
+        '''
+        ext = ['mp3'] # m4b seems to be unsupported by Apple
+        is_book = False
+
+        # book attributes to be populated
+        book = {
+            'author': None,
+            'duration': 0,
+            'duration_str': None,
+            'files': dict(),
+            'path': path,
+            'size_bytes': 0,
+            'size_str': None,
+            'title': None
+        }
 
-    if os.path.exists(JSON_PATH):
-        cache = read_cache(JSON_PATH)
-        BOOKS = get_books(APP.config['ROOT_PATH'], cache)
-    else:
-        BOOKS = get_books(APP.config['ROOT_PATH'])
+        # hash of each file in directory w/ track extension
+        folder_hash = hashlib.md5()
+
+        for f in os.listdir(path):
+            file_path = os.path.join(path, f)
+            if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
+                continue
+
+            tag = TinyTag.get(file_path)
+            if not tag.duration:
+                continue
+
+            is_book = True
+            self._log(f)
+
+            # hash file in 1 KiB blocks, feeding the folder-wide hash as well
+            file_hash = hashlib.md5()
+            with open(file_path, 'rb') as fh:
+                while True:
+                    data = fh.read(1024)
+                    if not data:
+                        break
+                    folder_hash.update(data)
+                    file_hash.update(data)
+
+            # 1 day, 10:59:58
+            duration_str = str(timedelta(seconds=tag.duration))
+
+            # per-file attributes, some values are populated conditionally
+            track = {
+                'album': self._validate(tag.album, os.path.split(path)[1]),
+                'author': self._validate(tag.artist, 'Unknown'),
+                'duration': tag.duration,
+                'duration_str': duration_str.split('.')[0],
+                'filename': os.path.split(file_path)[1],
+                'path': file_path,
+                'size_bytes': tag.filesize,
+                'title': self._validate(tag.title, os.path.split(file_path)[1]),
+                'track': tag.track
+            }
+
+            # we assume author and album attributes are unchanged between tracks
+            book['author'] = track['author']
+            book['title'] = track['album']
+
+            # increment book total size/duration
+            book['duration'] += tag.duration
+            book['size_bytes'] += tag.filesize
+
+            # hexdigest: track dict
+            book['files'][file_hash.hexdigest()] = track
+
+        if is_book:
+            folder_hash = folder_hash.hexdigest()
+            total_size = book['size_bytes']
+
+            try:
+                _i = int(math.floor(math.log(total_size, 1024)))
+                _p = math.pow(1024, _i)
+                _s = round(total_size / _p, 2)
+            except ValueError: # math.log(0, 1024) raises ValueError
+                _i = 1
+                _s = 0
+
+            # e.g. 1.48 GB
+            SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
+            book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
+
+            # e.g. 2 days, 5:47:47
+            duration_str = str(timedelta(seconds=book['duration']))
+            book['duration_str'] = duration_str.split('.')[0]
+            return (folder_hash, book)
+
+        return None
 
-        write_cache(BOOKS, JSON_PATH)
+if __name__ == '__main__':
+    books = Books()
-- 
cgit v1.2.3-54-g00ecf
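
A note on the folder-hash scheme in _check_dir: every supported track in a
directory is fed through a single MD5, so the resulting digest, which becomes
the book's cache key, changes whenever any track is added, removed, or
modified, and that book is re-scanned on the next rebuild. A minimal
standalone sketch of the idea (the path at the bottom is hypothetical; unlike
the patch, which hashes in os.listdir() order, this sorts filenames so the
digest is stable across runs):

import hashlib
import os

def folder_digest(path, exts=('mp3',)):
    # one MD5 over the bytes of every supported file in :path:
    folder_hash = hashlib.md5()
    for name in sorted(os.listdir(path)): # sorted for a stable digest
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path) or name.split('.')[-1] not in exts:
            continue
        with open(file_path, 'rb') as fh:
            while True:
                data = fh.read(1024)
                if not data:
                    break
                folder_hash.update(data)
    return folder_hash.hexdigest()

print(folder_digest('/home/user/audiobooks/book')) # hypothetical path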
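
The size formatting in _check_dir picks a unit by taking the integer part of
the log base 1024 of the byte count: 1589137899 bytes gives
floor(log1024(1589137899)) = 3, i.e. GB, and 1589137899 / 1024**3 rounds to
1.48. A sketch of the same logic with the fallback made explicit (it returns
'0 B' here, where the patch's fallback of _i = 1, _s = 0 yields '0 KB'):

import math

SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')

def size_str(total_size):
    try:
        _i = int(math.floor(math.log(total_size, 1024)))
        _s = round(total_size / math.pow(1024, _i), 2)
    except ValueError: # math.log(0, 1024) raises ValueError
        return '0 B'
    return '%s %s' % (_s, SIZES[_i])

print(size_str(1589137899)) # 1.48 GB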
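
For context, the module expects an app.cfg next to rebuild.py, which Flask's
from_pyfile() executes as Python; ROOT_PATH is the only key the patch reads,
and a line like ROOT_PATH = '/home/user/audiobooks' (value illustrative)
would be enough. A rebuild then writes cache/audiobooks.json keyed by folder
hash; an abbreviated entry might look like the following, with all values
hypothetical apart from the structure the code populates:

{
    "d815c7a3cc11f08558b4d91ca93de023": {
        "author": "Unknown",
        "duration": 125998.0,
        "duration_str": "1 day, 10:59:58",
        "files": {
            "<per-file md5>": { "...": "per-track attributes as built above" }
        },
        "path": "/home/user/audiobooks/book",
        "size_bytes": 1589137899,
        "size_str": "1.48 GB",
        "title": "book"
    }
}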