#!/usr/bin/env python3
"""
usage: ember.py [-h] --path PATH

ember: process kindle highlights sequentially into unique files

options:
  -h, --help   show this help message and exit
  --path PATH  path to highlights file
"""

import argparse
import difflib
import re


class Ember:
    """Split a highlights file into one text file per book.

    Records in the input are delimited by a ``==========`` line.  Within a
    record, the first non-blank line is the title (optionally followed by
    the author in parentheses), the second non-blank line is location
    metadata and is discarded, and every remaining non-blank line is
    appended to ``<name>.txt`` in the current working directory.
    """

    # Line that terminates every record in the highlights file.
    SEPARATOR = "=========="
    # Offset from a highlight line to the line it is compared against.
    # NOTE(review): assumes every record occupies exactly five lines
    # (title, location, blank, body, separator) -- confirm against real data.
    RECORD_STRIDE = 5
    # Fraction of the shorter text that the longest common run must exceed
    # for the current highlight to be treated as superseded by the next one.
    DUPLICATE_RATIO = 3 / 4
    # File stem used when a title sanitizes down to nothing.
    FALLBACK_NAME = "untitled"

    def __init__(self, path):
        """Remember *path*, the location of the highlights file."""
        self.path = path

    def sanitize(self, text):
        """Return *text* restricted to ASCII letters, digits, ``_`` and
        single spaces."""
        text = re.sub(r"[^a-zA-Z0-9_\ ]", "", text)
        return re.sub(" +", " ", text)

    def process(self):
        """Read the highlights file at ``self.path`` and write the per-book
        files.

        Raises:
            OSError: if the highlights file or an output file cannot be
                opened.
        """
        with open(self.path, 'r', encoding="utf-8") as f:
            self._parse(f)

    def _record_name(self, line):
        """Build the output file stem from a record's title line.

        With an author in parentheses the result is
        ``"<sanitized author> - <title>"``; otherwise it is the sanitized
        title, or ``FALLBACK_NAME`` if nothing survives sanitizing.
        """
        # A byte-order mark may precede the title; it must not leak into
        # the file name, or the same book would map to different files.
        line = line.replace("\ufeff", "")

        # rfind() returns -1 when there is no "(", and slicing with -1
        # would silently drop the last character of the title.
        paren = line.rfind("(")
        title = (line[:paren] if paren != -1 else line).strip()

        authors = re.findall(r'\((.*?)\)', line)
        if authors:
            # The title is kept readable, but path separators would make
            # open() fail or write outside the working directory.
            title = re.sub(r"[/\\]", "-", title)
            return f"{self.sanitize(authors[-1])} - {title}"
        return self.sanitize(title) or self.FALLBACK_NAME

    def _is_duplicate(self, line, next_h):
        """Return True if *line* appears to be an older copy of *next_h*.

        The two are considered duplicates when their longest common
        substring is longer than ``DUPLICATE_RATIO`` of the shorter text.
        """
        current, following = line.strip(), next_h.strip()
        # Compare text only: trailing newlines are shared by every line and
        # would make any highlight "match" a blank line.
        if not current or not following:
            return False

        threshold = min(len(current), len(following)) * self.DUPLICATE_RATIO
        # autojunk=False: the default heuristic ignores frequent characters
        # in sequences of 200+ items, which can hide the true longest match.
        matcher = difflib.SequenceMatcher(
            None, current, following, autojunk=False)
        match = matcher.find_longest_match(
            0, len(current), 0, len(following))
        return match.size > threshold

    def _parse(self, highlights):
        """Walk the lines of *highlights* and append each kept highlight to
        its book's file.

        Args:
            highlights: iterable of lines (for example an open text file).
        """
        name, seen_location = None, False

        # the kindle does not remove deleted highlights from the filesystem
        # store; we store the set in memory to check for sequence matches
        # above a dynamic threshold to skip apparent duplicates, preferring
        # most recent highlight
        h_list = list(highlights)
        for idx, line in enumerate(h_list):
            stripped = line.strip()
            if not stripped:
                continue

            if stripped == self.SEPARATOR:
                name, seen_location = None, False
                continue

            # `is None` rather than truthiness, so the state machine cannot
            # be fooled into re-reading the title on the next line.
            if name is None:
                name = self._record_name(line)
                continue

            if not seen_location:
                seen_location = True
                continue

            # do not record current highlight if it is superseded by the
            # line one record further on
            next_idx = idx + self.RECORD_STRIDE
            if next_idx < len(h_list) and self._is_duplicate(
                    line, h_list[next_idx]):
                continue

            # Explicit encoding: highlights may contain non-ASCII text and
            # the locale default is not guaranteed to be able to encode it.
            with open(name + ".txt", "a+", encoding="utf-8") as record:
                record.write(line)
                record.write("\n")


def main():
    """Parse the command line and process the given highlights file."""
    desc = "ember: process kindle highlights sequentially into unique files"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--path", dest="path", type=str, action="store",
                        help="path to highlights file", required=True)
    args = parser.parse_args()

    ember = Ember(args.path)
    ember.process()


if __name__ == "__main__":
    main()