1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
#!/usr/bin/env python3
"""
usage: ember.py [-h] --path PATH
ember: process kindle highlights sequentially into unique files
options:
-h, --help show this help message and exit
--path PATH path to highlights file
"""
import argparse
import difflib
import re
class Ember:
    """Split a kindle highlights file into one text file per book.

    The input is read as a sequence of entries terminated by a line of
    ``==========``. Within an entry the first non-blank line is taken as the
    title line, the next non-blank line as the location line (ignored), and
    every following non-blank line as highlight text. Each highlight is
    appended to ``<name>.txt`` in the current working directory, where
    ``<name>`` is derived from the title line.
    """

    # Line that terminates every entry.
    _SEPARATOR = "=========="
    # Offset, in lines, from a highlight to the text of the entry after it.
    # Assumes each entry spans five lines (title, location, blank, text,
    # separator).
    _ENTRY_STRIDE = 5
    # A highlight is dropped when its longest common run with the following
    # highlight exceeds this fraction of the shorter of the two.
    _DUPLICATE_RATIO = 3 / 4
    # File stem used when nothing of the title survives sanitizing.
    _FALLBACK_NAME = "untitled"

    def __init__(self, path):
        """Remember ``path``, the highlights file that ``process`` will read."""
        self.path = path

    def sanitize(self, text):
        """Return ``text`` reduced to ASCII letters, digits, ``_`` and spaces.

        Every other character is removed and runs of spaces are collapsed to
        a single space. Leading and trailing spaces are not stripped.
        """
        text = re.sub(r"[^a-zA-Z0-9_\ ]", "", text)
        return re.sub(" +", " ", text)

    def process(self):
        """Open ``self.path`` as UTF-8 and write its highlights to files."""
        with open(self.path, "r", encoding="utf-8") as f:
            self._parse(f)

    def _record_name(self, line):
        """Build the output file stem from an entry's title line.

        The text of the last ``(...)`` group is treated as the author and
        everything before the last ``(`` as the title. Both parts are
        sanitized so that characters such as ``/`` or ``:`` never reach the
        filesystem. Without an author the stem is the sanitized title alone,
        or ``_FALLBACK_NAME`` when that is empty.
        """
        cut = line.rfind("(")
        # Only slice when a parenthesis exists; slicing at -1 would silently
        # chop the final character of the title.
        title = line[:cut] if cut != -1 else line
        title = self.sanitize(title).strip()
        authors = re.findall(r"\((.*?)\)", line)
        if authors:
            author = self.sanitize(authors[-1]).strip()
            return f"{author} - {title}"
        return title or self._FALLBACK_NAME

    def _is_superseded(self, current, following):
        """Return True when ``following`` looks like a newer copy of ``current``.

        The two lines are considered duplicates when their longest common
        run is longer than ``_DUPLICATE_RATIO`` of the shorter line.
        """
        probe = following.strip()
        if not probe or probe == self._SEPARATOR:
            # The line at the stride offset is not highlight text (for
            # example the next entry has no text). Comparing against it
            # would match on the trailing newline alone and wrongly drop
            # the current highlight.
            return False
        threshold = min(len(current), len(following)) * self._DUPLICATE_RATIO
        # autojunk=False: the default heuristic ignores frequently occurring
        # characters once the second sequence reaches 200 items, which hides
        # long overlaps between long highlights.
        matcher = difflib.SequenceMatcher(None, current, following,
                                          autojunk=False)
        longest = matcher.find_longest_match(0, len(current),
                                             0, len(following))
        return longest.size > threshold

    def _parse(self, highlights):
        """Write every non-duplicate highlight in ``highlights`` to its file.

        ``highlights`` is any iterable of lines (an open text file or a list
        of strings). Deleted highlights are not removed from the highlights
        file, so the lines are held in memory and each highlight is compared
        with the text ``_ENTRY_STRIDE`` lines further on; when the two are
        near-duplicates only the later one is kept.
        """
        entries = list(highlights)
        # name is None until the title line of the current entry is seen.
        # A sentinel is used rather than truthiness so that the entry state
        # cannot be confused by an empty name.
        name, seen_location = None, False
        for idx, line in enumerate(entries):
            text = line.strip()
            if not text:
                continue
            if text == self._SEPARATOR:
                name, seen_location = None, False
                continue
            if name is None:
                name = self._record_name(line)
                continue
            if not seen_location:
                # The location/metadata line is not recorded.
                seen_location = True
                continue
            following = idx + self._ENTRY_STRIDE
            if (following < len(entries)
                    and self._is_superseded(line, entries[following])):
                continue
            # Written as UTF-8 to match the encoding the input is read with,
            # instead of depending on the platform default.
            with open(name + ".txt", "a", encoding="utf-8") as record:
                # One highlight per paragraph: the text, then a blank line.
                record.write(line.rstrip("\r\n") + "\n\n")
if __name__ == "__main__":
    # Command-line entry point: take the highlights file from --path and
    # hand it to Ember for processing.
    cli = argparse.ArgumentParser(
        description=(
            "ember: process kindle highlights sequentially into unique files"
        ),
    )
    # dest, type and action are left at argparse's defaults, which for
    # "--path" are dest="path", no type conversion (str) and "store".
    cli.add_argument("--path", required=True, help="path to highlights file")
    Ember(cli.parse_args().path).process()
|