From 3fcdcfe3454d21ae90886bd1e7e3430b23ec48f2 Mon Sep 17 00:00:00 2001 From: Jordan Date: Tue, 4 Oct 2022 16:42:54 -0700 Subject: initial commit --- README | 7 ++++++ UNLICENSE | 24 ++++++++++++++++++ ember.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 README create mode 100644 UNLICENSE create mode 100755 ember.py diff --git a/README b/README new file mode 100644 index 0000000..fedcfd6 --- /dev/null +++ b/README @@ -0,0 +1,7 @@ +ember: process kindle highlights sequentially into unique files + +usage: ember.py [-h] --path PATH + +options: + -h, --help show this help message and exit + --path PATH path to highlights file diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..68a49da --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
#!/usr/bin/env python3
#
# NOTE(review): the lines this block replaces also carried patch text that is
# not part of ember.py; it is kept here so nothing is lost.
#
#   UNLICENSE (final paragraphs):
#     IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
#     OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
#     ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#     OTHER DEALINGS IN THE SOFTWARE.
#
#     For more information, please refer to <https://unlicense.org/>
#     (URL was stripped in extraction -- confirm against the UNLICENSE file)
#
#   patch header : diff --git a/ember.py b/ember.py
#                  new file mode 100755, index 0000000..0b879c8
#   patch trailer: -- cgit v1.2.3-54-g00ecf

"""
usage: ember.py [-h] --path PATH

ember: process kindle highlights sequentially into unique files

options:
  -h, --help   show this help message and exit
  --path PATH  path to highlights file
"""

import argparse
import difflib
import re


class Ember:
    """Split a Kindle highlights file into one text file per book.

    Each record in the highlights file is appended to ``<name>.txt`` in the
    current working directory, where ``<name>`` is derived from the record's
    title line. A highlight that looks like an earlier revision of the next
    record's highlight is skipped.
    """

    # Line that terminates every record in the highlights file.
    SEPARATOR = "=========="

    # Distance, in lines, from one record's text to the next record's text.
    # Assumes five-line records (title, location, blank, text, separator);
    # inferred from the fixed offset used for the comparison -- confirm
    # against real highlights files.
    ENTRY_STRIDE = 5

    def __init__(self, path):
        """Remember *path*, the highlights file to read in ``process``."""
        self.path = path

    def sanitize(self, text):
        """Return *text* reduced to ASCII letters, digits, ``_`` and spaces.

        Runs of spaces left behind by removed characters are collapsed to a
        single space. Leading/trailing spaces are not removed.
        """
        text = re.sub(r"[^a-zA-Z0-9_\ ]", "", text)
        return re.sub(" +", " ", text)

    def process(self):
        """Read the highlights file and write the per-book files."""
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise become part of the first title.
        with open(self.path, "r", encoding="utf-8-sig") as f:
            self._parse(f)

    def _record_name(self, line):
        """Build the output file stem from a record's title line.

        The text inside the last ``(...)`` pair is taken as the author and
        everything before the last ``(`` as the title. Both parts are
        sanitized so the result is safe to use as a file name. Falls back to
        ``"untitled"`` when nothing usable remains.
        """
        authors = re.findall(r"\((.*?)\)", line)
        cut = line.rfind("(")
        # rfind() returns -1 when there is no "("; slicing with -1 would
        # silently drop the last character of the title.
        title = line[:cut] if cut != -1 else line
        title = self.sanitize(title.strip())

        if authors:
            name = f"{self.sanitize(authors[-1])} - {title}"
        else:
            name = title
        # An empty name would be mistaken for "no title seen yet".
        return name or "untitled"

    def _is_duplicate(self, current, following):
        """Return True when *current* looks like a revision of *following*.

        Two highlights are considered duplicates when their longest common
        substring is longer than 75% of the shorter highlight.
        """
        current, following = current.strip(), following.strip()
        # A blank line at the compared offset (a record without text) must
        # not match: otherwise the shared trailing newline alone would exceed
        # the threshold and the current highlight would be dropped.
        if not current or not following:
            return False

        threshold = min(len(current), len(following)) * 3 / 4

        # autojunk=False: the default heuristic ignores frequently occurring
        # characters once a sequence reaches 200 items, which can hide the
        # real longest match between long highlights.
        matcher = difflib.SequenceMatcher(None, current, following,
                                          autojunk=False)
        match = matcher.find_longest_match(0, len(current),
                                           0, len(following))
        return match.size > threshold

    def _parse(self, highlights):
        """Walk the lines of *highlights* and append each kept highlight.

        *highlights* is any iterable of lines (for example an open text
        file). Output files are opened in append mode, so running the tool
        again adds to existing files.
        """
        name, seen_location = None, False

        # the kindle does not remove deleted highlights from the filesystem
        # store; we keep all lines in memory so each highlight can be
        # compared with the one in the following record, preferring the most
        # recent highlight when they are apparent duplicates
        h_list = list(highlights)
        for idx, line in enumerate(h_list):
            stripped = line.strip()
            if not stripped:
                continue

            if stripped == self.SEPARATOR:
                name, seen_location = None, False
                continue

            if name is None:
                # first non-blank line of a record: the title
                name = self._record_name(line)
                continue

            if not seen_location:
                # second non-blank line of a record: location metadata
                seen_location = True
                continue

            following = idx + self.ENTRY_STRIDE
            if (following < len(h_list)
                    and self._is_duplicate(line, h_list[following])):
                continue

            with open(name + ".txt", "a+", encoding="utf-8") as record:
                record.write(line)
                record.write("\n")


def main():
    """Parse the command line and process the given highlights file."""
    desc = "ember: process kindle highlights sequentially into unique files"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--path", dest="path", type=str, action="store",
                        help="path to highlights file",
                        required=True)
    args = parser.parse_args()

    ember = Ember(args.path)
    ember.process()


if __name__ == "__main__":
    main()