aboutsummaryrefslogtreecommitdiff
path: root/ember.py
diff options
context:
space:
mode:
Diffstat (limited to 'ember.py')
-rwxr-xr-xember.py83
1 files changed, 83 insertions, 0 deletions
diff --git a/ember.py b/ember.py
new file mode 100755
index 0000000..0b879c8
--- /dev/null
+++ b/ember.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+"""
+usage: ember.py [-h] --path PATH
+
+ember: process kindle highlights sequentially into unique files
+
+options:
+ -h, --help show this help message and exit
+ --path PATH path to highlights file
+"""
+
+import argparse
+import difflib
+import re
+
class Ember:
    """Split a Kindle clippings file into one text file per book.

    The parser treats each entry as a heading line ("Title (Author)"), a
    location line, and then the highlighted text, with entries terminated by
    a line of ten equals signs. Every highlight that is kept is appended to
    "<author> - <title>.txt" (or "<title>.txt" when no author is present) in
    the current working directory.
    """

    # Line that terminates every entry in the clippings file.
    _SEPARATOR = "=========="
    # Byte-order mark that may precede a heading line.
    _BOM = "\ufeff"
    # Distance, in lines, from a highlight to the highlight of the next entry
    # (separator, heading, location, blank line, highlight).
    _NEXT_HIGHLIGHT_OFFSET = 5
    # A highlight is dropped when its longest common run with the next
    # highlight exceeds this fraction of the shorter highlight's length.
    _DUPLICATE_RATIO = 3 / 4
    # File stem used when a heading sanitizes down to nothing.
    _FALLBACK_NAME = "untitled"

    def __init__(self, path):
        """Remember the path of the clippings file to process."""
        self.path = path

    def sanitize(self, text):
        """Return *text* reduced to ASCII letters, digits, "_" and spaces.

        Every other character is removed and runs of spaces are collapsed to
        a single space, which makes the result safe to use in a file name.
        """
        text = re.sub(r"[^a-zA-Z0-9_\ ]", "", text)
        return re.sub(" +", " ", text)

    def process(self):
        """Read the clippings file at ``self.path`` and write the records."""
        with open(self.path, 'r', encoding="utf-8") as f:
            self._parse(f)

    def _parse(self, highlights):
        """Walk the clippings lines and append each kept highlight to a file.

        Args:
            highlights: iterable of lines (a text file object or a list of
                strings).
        """
        name, seen_location = None, False

        # the kindle does not remove deleted highlights from the filesystem
        # store; we keep every line in memory so each highlight can be
        # compared with the one that follows it, preferring the most recent
        # highlight when the two look like duplicates
        h_list = [line.lstrip(self._BOM) for line in highlights]
        for idx, line in enumerate(h_list):
            text = line.strip()
            if not text:
                continue

            if text == self._SEPARATOR:
                # end of entry: the next non-blank line is a new heading
                name, seen_location = None, False
                continue

            if name is None:
                name = self._record_name(line)
                continue

            if not seen_location:
                # the line right after the heading is the location/date line
                seen_location = True
                continue

            upcoming = idx + self._NEXT_HIGHLIGHT_OFFSET
            if upcoming < len(h_list) and self._is_duplicate(
                    line, h_list[upcoming]):
                continue

            # explicit encoding so non-ASCII highlights are written the same
            # way on every platform
            with open(name + ".txt", "a", encoding="utf-8") as record:
                # one highlight per paragraph: text, newline, blank line
                record.write(line.rstrip("\n") + "\n\n")

    def _record_name(self, heading):
        """Build the output file stem from a "Title (Author)" heading line."""
        authors = re.findall(r'\((.*?)\)', heading)
        cut = heading.rfind("(")
        title = heading if cut == -1 else heading[:cut]
        # sanitize the title in both branches so path separators and other
        # unsafe characters never reach the file name
        title = self.sanitize(title).strip()
        if authors:
            # the last parenthesised group is taken as the author
            author = self.sanitize(authors[-1]).strip()
            return f"{author} - {title}"
        # never return an empty stem: the parser uses the name to know that
        # the heading has been consumed
        return title or self._FALLBACK_NAME

    def _is_duplicate(self, current, upcoming):
        """Tell whether *current* is superseded by the *upcoming* highlight.

        The two are considered duplicates when their longest common run of
        characters is longer than 75% of the shorter of the two texts.
        Surrounding whitespace is ignored so a shared trailing newline cannot
        count as a match, and an empty or separator line is never a match.
        """
        current, upcoming = current.strip(), upcoming.strip()
        if not current or not upcoming or upcoming == self._SEPARATOR:
            return False

        threshold = min(len(current), len(upcoming)) * self._DUPLICATE_RATIO
        matcher = difflib.SequenceMatcher(None, current, upcoming)
        match = matcher.find_longest_match(0, len(current), 0, len(upcoming))
        return match.size > threshold
+
if __name__ == "__main__":
    # Command-line entry point: a single mandatory --path option names the
    # clippings file, which is then processed in one pass.
    cli = argparse.ArgumentParser(
        description=(
            "ember: process kindle highlights sequentially into unique files"
        ),
    )
    cli.add_argument(
        "--path",
        dest="path",
        type=str,
        action="store",
        required=True,
        help="path to highlights file",
    )
    options = cli.parse_args()

    Ember(options.path).process()