From 3ea7cee3bc97bf34e1c717ef0969914de58f228e Mon Sep 17 00:00:00 2001
From: Jordan
Date: Sat, 11 Jul 2020 22:45:16 -0700
Subject: classify rebuild.py, optimize logic

---
 rebuild.py | 369 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 192 insertions(+), 177 deletions(-)

diff --git a/rebuild.py b/rebuild.py
index 3c0ebd9..756a2ae 100755
--- a/rebuild.py
+++ b/rebuild.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import datetime
 import hashlib
 import json
 import math
@@ -8,185 +9,199 @@ from datetime import timedelta
 from flask import Flask
 from lib.tinytag import TinyTag
 
-def get_books(root_path, cache=dict()):
-    '''
-    Discover audiobooks under :root_path: and populate books object
-
-    :cache: existing JSON cache, used to determine which content is new
-    (existing content is not re-hashed)
-    '''
-    if not os.path.exists(root_path):
-        raise ValueError('root path does not exist: %s' % root_path)
-
-    # '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
-    existing_books = {}
-    for k, _ in cache.items():
-        path = cache[k]['path']
-        if os.path.exists(path):
-            existing_books[path] = k
-
-    book_dirs = list()
-    for root, dirs, _ in os.walk(root_path):
-        for d in dirs:
-            book_dirs.append(os.path.join(root, d))
-
-    books = dict()
-    for book_path in book_dirs:
-        # if already cached, populate books with existing k/v
-        if book_path in existing_books:
-            _hash = existing_books[book_path]
-            books[_hash] = cache[_hash]
-            continue
-        book = is_book(book_path)
-        if book:
-            books[book[0]] = book[1]
-
-    return books
-
-def is_book(book_path):
-    '''
-    Determine if :book_path: contains (supported) audio files
-
-    Returns False (not a book) or populated book dict
-    '''
-    ext = ['mp3'] # m4b seems to be unsupported by Apple
-
-    # book attributes to be populated
-    book = {
-        'author': None,
-        'duration': 0,
-        'duration_str': None,
-        'files': dict(),
-        'path': book_path,
-        'size_bytes': 0,
-        'size_str': None,
-        'title': None
-    }
-
-    # hash of each file in directory w/ track extension
-    folder_hash = hashlib.md5()
-
-    # a book_path is only a book if it contains at least one track
-    is_book = False
-
-    for f in os.listdir(book_path):
-        file_path = os.path.join(book_path, f)
-
-        # is a file and has a supported extension
-        if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
-            continue
-
-        # track duration is required
-        tag = TinyTag.get(file_path)
-        if not tag.duration:
-            continue
-
-        # previous conditions met, we're a book! :D
-        is_book = True
-        print('[+] processing: %s' % f)
-
-        # update collective hash of folder with MD5 of current file
-        BLOCK = 1024
-        file_hash = hashlib.md5()
-        with open(file_path, 'rb') as f:
-            while True:
-                data = f.read(BLOCK)
-                if not data:
-                    break
-                folder_hash.update(data)
-                file_hash.update(data)
-
-        # 1 day, 10:59:58
-        duration_str = str(timedelta(seconds=tag.duration))
-
-        # per-file atributes, some values are populated conditionally
-        track = {
-            'album': validate(tag.album, os.path.split(book_path)[1]),
-            'author': validate(tag.artist, 'Unknown'),
-            'duration': tag.duration,
-            'duration_str': duration_str.split('.')[0],
-            'filename': os.path.split(file_path)[1],
-            'path': file_path,
-            'size_bytes': tag.filesize,
-            'title': validate(tag.title, os.path.split(file_path)[1]),
-            'track': tag.track
-        }
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
+
+# use Flask's config parser, configparser would be hacky
+APP = Flask(__name__)
+APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+
+class Books:
+    def __init__(self):
+        if os.path.exists(JSON_PATH):
+            self._cache = self._read_cache()
+        else:
+            self._cache = {}
+
+        self.books = self._get_books()
+        self._write_cache()
+
+    def _get_dirs(self, path):
+        '''
+        Return list of directories recursively discovered in :path:
+        '''
+        ret = list()
+        for root, dirs, _ in os.walk(path):
+            for d in dirs:
+                ret.append(os.path.join(root, d))
+
+        return ret
+
+    def _get_path_hash_dict(self):
+        '''
+        Return dict mapping cached book paths to their hashes, used to check
+        discovered paths against the existing cache, e.g.
+
+        '/home/user/audiobooks/book': d815c7a3cc11f08558b4d91ca93de023
+        '''
+        ret = {}
+        for k, _ in self._cache.items():
+            path = self._cache[k]['path']
+            if os.path.exists(path):
+                ret[path] = k
+
+        return ret
+
+    def _write_cache(self):
+        '''
+        Dump contents of self.books to JSON_PATH
+        '''
+        if not os.path.exists(CACHE_PATH):
+            os.mkdir(CACHE_PATH)
+        with open(JSON_PATH, 'w') as cache:
+            json.dump(self.books, cache, indent=4)
+
+    def _read_cache(self):
+        '''
+        Return dict of existing cache
+        '''
+        with open(JSON_PATH, 'r') as cache:
+            data = json.load(cache)
+
+        return data
+
+    def _validate(self, v, b):
+        '''
+        Returns :v: if :v: is truthy and not only whitespace, otherwise :b:
+        '''
+        if v and not v.isspace():
+            return v
+        else: return b
 
-        # we assume author and album attributes are unchanged between tracks
-        book['author'] = track['author']
-        book['title'] = track['album']
-
-        # increment book total size/duration
-        book['duration'] += tag.duration
-        book['size_bytes'] += tag.filesize
-
-        # hexdigest: track dict
-        book['files'][file_hash.hexdigest()] = track
-
-    # if we're a book, store formatted book size and duration
-    if is_book:
-        folder_hash = folder_hash.hexdigest()
-        total_size = book['size_bytes']
-        try:
-            _i = int(math.floor(math.log(total_size, 1024)))
-            _p = math.pow(1024, _i)
-            _s = round(total_size / _p, 2)
-        except:
-            _i = 1
-            _s = 0
-
-        # e.g. 1.48 GB
-        SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
-        book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
-
-        # e.g. 2 days, 5:47:47
-        duration_str = str(timedelta(seconds=book['duration']))
-        book['duration_str'] = duration_str.split('.')[0]
-        return (folder_hash, book)
-
-    return False
-
-def write_cache(books, json_path):
-    '''
-    Dump contents of :books: to :json_path:
-    '''
-    cache_path = os.path.dirname(json_path)
-    if not os.path.exists(cache_path):
-        os.mkdir(cache_path)
-    with open(json_path, 'w') as f:
-        json.dump(books, f, indent=4)
-
-def read_cache(json_path):
-    '''
-    Return dict of existing cache
-    '''
-    with open(json_path, 'r') as cache:
-        books = json.load(cache)
-
-    return books
-
-def validate(v, b):
-    '''
-    Returns :v: if :v: and v.isspace(), otherwise :b:
-    '''
-    if v and not v.isspace():
-        return v
-    else: return b
 
-if __name__ == '__main__':
-    ABS_PATH = os.path.dirname(os.path.abspath(__file__))
-    CACHE_PATH = os.path.join(ABS_PATH, 'cache')
-    JSON_PATH = os.path.join(CACHE_PATH, 'audiobooks.json')
-
-    # use Flask's config parser, configparser would be hacky
-    APP = Flask(__name__)
-    APP.config.from_pyfile(os.path.join(ABS_PATH, 'app.cfg'))
+    def _log(self, msg):
+        '''
+        Prints :msg: prefixed with an ISO-8601 timestamp
+        '''
+        now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+        print('%s %s' % (now, msg))
+
+    def _get_books(self):
+        '''
+        Discover audiobooks under APP.config['ROOT_PATH'] and return dict of
+        books; paths already present in the cache are not re-hashed
+        '''
+        ex = self._get_path_hash_dict()
+        dirs = self._get_dirs(APP.config['ROOT_PATH'])
+
+        books = dict()
+        for path in dirs:
+            if path in ex:
+                _hash = ex[path]
+                books[_hash] = self._cache[_hash]
+                continue
+            book = self._check_dir(path)
+            if book:
+                books[book[0]] = book[1]
+
+        return books
+
+    def _check_dir(self, path):
+        '''
+        Determine if :path: contains (supported) audio files; return populated
+        book dict or None
+        '''
+        ext = ['mp3'] # m4b seems to be unsupported by Apple
+        is_book = False
+
+        # book attributes to be populated
+        book = {
+            'author': None,
+            'duration': 0,
+            'duration_str': None,
+            'files': dict(),
+            'path': path,
+            'size_bytes': 0,
+            'size_str': None,
+            'title': None
+        }
 
-    if os.path.exists(JSON_PATH):
-        cache = read_cache(JSON_PATH)
-        BOOKS = get_books(APP.config['ROOT_PATH'], cache)
-    else:
-        BOOKS = get_books(APP.config['ROOT_PATH'])
+        # hash of each file in directory w/ track extension
+        folder_hash = hashlib.md5()
+
+        for f in os.listdir(path):
+            file_path = os.path.join(path, f)
+            if not os.path.isfile(file_path) or not f.split('.')[-1] in ext:
+                continue
+
+            tag = TinyTag.get(file_path)
+            if not tag.duration:
+                continue
+
+            is_book = True
+            self._log(f)
+
+            # hash file in 1 KiB blocks, feeding the folder-wide hash as well
+            file_hash = hashlib.md5()
+            with open(file_path, 'rb') as fh:
+                while True:
+                    data = fh.read(1024)
+                    if not data:
+                        break
+                    folder_hash.update(data)
+                    file_hash.update(data)
+
+            # 1 day, 10:59:58
+            duration_str = str(timedelta(seconds=tag.duration))
+
+            # per-file attributes, some values are populated conditionally
+            track = {
+                'album': self._validate(tag.album, os.path.split(path)[1]),
+                'author': self._validate(tag.artist, 'Unknown'),
+                'duration': tag.duration,
+                'duration_str': duration_str.split('.')[0],
+                'filename': os.path.split(file_path)[1],
+                'path': file_path,
+                'size_bytes': tag.filesize,
+                'title': self._validate(tag.title, os.path.split(file_path)[1]),
+                'track': tag.track
+            }
+
+            # we assume author and album attributes are unchanged between tracks
+            book['author'] = track['author']
+            book['title'] = track['album']
+
+            # increment book total size/duration
+            book['duration'] += tag.duration
+            book['size_bytes'] += tag.filesize
+
+            # hexdigest: track dict
+            book['files'][file_hash.hexdigest()] = track
+
+        if is_book:
+            folder_hash = folder_hash.hexdigest()
+            total_size = book['size_bytes']
+
+            try:
+                _i = int(math.floor(math.log(total_size, 1024)))
+                _p = math.pow(1024, _i)
+                _s = round(total_size / _p, 2)
+            except ValueError: # math.log(0, 1024) raises ValueError
+                _i = 1
+                _s = 0
+
+            # e.g. 1.48 GB
+            SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
+            book['size_str'] = '%s %s' % (str(_s), SIZES[_i])
+
+            # e.g. 2 days, 5:47:47
+            duration_str = str(timedelta(seconds=book['duration']))
+            book['duration_str'] = duration_str.split('.')[0]
+            return (folder_hash, book)
+
+        return None
 
-        write_cache(BOOKS, JSON_PATH)
+if __name__ == '__main__':
+    books = Books()
-- 
cgit v1.2.3-54-g00ecf
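
A note on the folder-hash scheme in _check_dir: every supported track in a
directory is fed through a single MD5, so the resulting digest, which becomes
the book's cache key, changes whenever any track is added, removed, or
modified, and that book is re-scanned on the next rebuild. A minimal
standalone sketch of the idea (the path at the bottom is hypothetical; unlike
the patch, which hashes in os.listdir() order, this sorts filenames so the
digest is stable across runs):

import hashlib
import os

def folder_digest(path, exts=('mp3',)):
    # one MD5 over the bytes of every supported file in :path:
    folder_hash = hashlib.md5()
    for name in sorted(os.listdir(path)): # sorted for a stable digest
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path) or name.split('.')[-1] not in exts:
            continue
        with open(file_path, 'rb') as fh:
            while True:
                data = fh.read(1024)
                if not data:
                    break
                folder_hash.update(data)
    return folder_hash.hexdigest()

print(folder_digest('/home/user/audiobooks/book')) # hypothetical path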
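
The size formatting in _check_dir picks a unit by taking the integer part of
the log base 1024 of the byte count: 1589137899 bytes gives
floor(log1024(1589137899)) = 3, i.e. GB, and 1589137899 / 1024**3 rounds to
1.48. A sketch of the same logic with the fallback made explicit (it returns
'0 B' here, where the patch's fallback of _i = 1, _s = 0 yields '0 KB'):

import math

SIZES = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')

def size_str(total_size):
    try:
        _i = int(math.floor(math.log(total_size, 1024)))
        _s = round(total_size / math.pow(1024, _i), 2)
    except ValueError: # math.log(0, 1024) raises ValueError
        return '0 B'
    return '%s %s' % (_s, SIZES[_i])

print(size_str(1589137899)) # 1.48 GB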
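
For context, the module expects an app.cfg next to rebuild.py, which Flask's
from_pyfile() executes as Python; ROOT_PATH is the only key the patch reads,
and a line like ROOT_PATH = '/home/user/audiobooks' (value illustrative)
would be enough. A rebuild then writes cache/audiobooks.json keyed by folder
hash; an abbreviated entry might look like the following, with all values
hypothetical apart from the structure the code populates:

{
    "d815c7a3cc11f08558b4d91ca93de023": {
        "author": "Unknown",
        "duration": 125998.0,
        "duration_str": "1 day, 10:59:58",
        "files": {
            "<per-file md5>": { "...": "per-track attributes as built above" }
        },
        "path": "/home/user/audiobooks/book",
        "size_bytes": 1589137899,
        "size_str": "1.48 GB",
        "title": "book"
    }
}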