aboutsummaryrefslogtreecommitdiff
path: root/ember.py
blob: 0b879c86f5c2c147c158f152118f0676835633ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3

"""
usage: ember.py [-h] --path PATH

ember: process kindle highlights sequentially into unique files

options:
  -h, --help   show this help message and exit
  --path PATH  path to highlights file
"""

import argparse
import difflib
import re

class Ember:
    """Split a Kindle clippings file into one text file per book.

    The parser walks the file line by line as a small state machine.  After
    each ``==========`` separator the first non-blank line is taken as the
    title line, the next non-blank line is skipped as the location line, and
    every further non-blank line up to the next separator is appended to
    ``"<name>.txt"`` in the current working directory.
    """

    # Line that terminates every entry in the clippings file.
    _SEPARATOR = "=========="

    # Distance, in lines, from a highlight's text to the text of the entry
    # that follows it.  NOTE(review): this assumes every entry is exactly
    # title / location / blank / text / separator -- confirm for notes that
    # span several lines.
    _NEXT_ENTRY_OFFSET = 5

    # A highlight is treated as superseded when the longest run of characters
    # it shares with the following one exceeds this fraction of the shorter
    # of the two texts.
    _DUPLICATE_RATIO = 0.75

    def __init__(self, path):
        """Remember *path*, the clippings file that ``process`` will read."""
        self.path = path

    def sanitize(self, text):
        """Return *text* reduced to ASCII letters, digits, ``_`` and spaces.

        Every other character is removed and runs of spaces are collapsed to
        a single space.  Leading and trailing spaces are not stripped.
        """
        text = re.sub(r"[^a-zA-Z0-9_\ ]", "", text)
        return re.sub(" +", " ", text)

    def process(self):
        """Open ``self.path`` as UTF-8 text and parse every entry in it."""
        with open(self.path, 'r', encoding="utf-8") as f:
            self._parse(f)

    def _parse(self, highlights):
        """Append each retained highlight to its book's text file.

        Args:
            highlights: iterable of lines (an open text file or a list of
                strings).  It is read into a list so that the parser can
                look ahead to the following entry.
        """
        # the kindle does not remove deleted highlights from the filesystem
        # store; keep every line in memory so each highlight can be compared
        # with the one that follows it, preferring the most recent highlight
        lines = list(highlights)
        name, seen_location = None, False

        for idx, line in enumerate(lines):
            stripped = line.strip()
            if not stripped:
                continue

            if stripped == self._SEPARATOR:
                name, seen_location = None, False
                continue

            # Compare against None rather than truthiness: a title made only
            # of characters that sanitize() removes yields an empty name, and
            # that must not make the location line look like a title.
            if name is None:
                name = self._book_name(line)
                continue

            if not seen_location:
                seen_location = True
                continue

            ahead = idx + self._NEXT_ENTRY_OFFSET
            if ahead < len(lines) and self._is_superseded(line, lines[ahead]):
                continue

            # Write with the same encoding the input is read with instead of
            # the platform default.
            with open(name + ".txt", "a+", encoding="utf-8") as record:
                record.write(line)
                record.write("\n")

    def _book_name(self, line):
        """Build the output file stem from a ``Title (Author)`` line."""
        # process() decodes as plain utf-8, which leaves a leading byte-order
        # mark in the text; drop it so it does not end up in the file name.
        text = line.strip().lstrip("\ufeff")

        paren = text.rfind("(")
        title = (text[:paren] if paren != -1 else text).strip()

        authors = re.findall(r'\((.*?)\)', text)
        if not authors:
            return self.sanitize(title)

        # The last parenthesised group is taken as the author; earlier groups
        # stay part of the title.
        author = self.sanitize(authors[-1])
        # The title is otherwise kept verbatim, but a path separator inside
        # it would make open() look for a directory that does not exist.
        title = re.sub(r"[\\/]", "-", title)
        return f"{author} - {title}"

    def _is_superseded(self, current, following):
        """Return True when *following* looks like a re-take of *current*."""
        cur, nxt = current.strip(), following.strip()

        # A blank or separator line in the look-ahead slot is not a highlight.
        # Without this guard the shared trailing newline alone would count as
        # a match longer than 75% of a one-character line.
        if not cur or not nxt or nxt == self._SEPARATOR:
            return False

        threshold = min(len(cur), len(nxt)) * self._DUPLICATE_RATIO

        # autojunk=False: the default heuristic ignores frequent characters
        # once the second sequence reaches 200 items, which can shorten the
        # reported match for long highlights.
        matcher = difflib.SequenceMatcher(None, cur, nxt, autojunk=False)
        match = matcher.find_longest_match(0, len(cur), 0, len(nxt))
        return match.size > threshold

if __name__ == "__main__":
    # Command-line entry point: a single required --path option names the
    # highlights file, which is then handed to Ember for processing.
    cli = argparse.ArgumentParser(
        description=(
            "ember: process kindle highlights sequentially "
            "into unique files"
        ),
    )
    cli.add_argument(
        "--path",
        dest="path",
        type=str,
        action="store",
        required=True,
        help="path to highlights file",
    )
    options = cli.parse_args()

    Ember(options.path).process()