aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jordan <me@jordan.im> 2022-10-04 16:42:54 -0700
committer: Jordan <me@jordan.im> 2022-10-04 16:42:54 -0700
commit: 3fcdcfe3454d21ae90886bd1e7e3430b23ec48f2 (patch)
tree: 034e79efa9a233cda38b326693950fc4e526b3c5
downloadember-master.tar.gz
ember-master.zip
initial commitHEADmaster
-rw-r--r-- README 7
-rw-r--r-- UNLICENSE 24
-rwxr-xr-x ember.py 83
3 files changed, 114 insertions, 0 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..fedcfd6
--- /dev/null
+++ b/README
@@ -0,0 +1,7 @@
+ember: process kindle highlights sequentially into unique files
+
+usage: ember.py [-h] --path PATH
+
+options:
+  -h, --help   show this help message and exit
+  --path PATH  path to highlights file
diff --git a/UNLICENSE b/UNLICENSE
new file mode 100644
index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/ember.py b/ember.py
new file mode 100755
index 0000000..0b879c8
--- /dev/null
+++ b/ember.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+"""
+usage: ember.py [-h] --path PATH
+
+ember: process kindle highlights sequentially into unique files
+
+options:
+  -h, --help   show this help message and exit
+  --path PATH  path to highlights file
+"""
+
+import argparse
+import difflib
+import re
+
+class Ember:
+    """Split a kindle highlights file into one text file per book."""
+
+    def __init__(self, path):
+        self.path = path  # path of the highlights file read by process()
+
+    def sanitize(self, text):
+        """Return text reduced to word characters and single spaces."""
+        text = re.sub(r"[^\w ]", "", text)  # \w also keeps non-ASCII letters
+        return re.sub(" +", " ", text).strip()
+
+    def process(self):
+        """Read self.path and parse it; utf-8-sig tolerates a leading BOM."""
+        with open(self.path, 'r', encoding="utf-8-sig") as f:
+            self._parse(f)
+
+    def _parse(self, highlights):
+        """Append highlight lines to per-book "<author> - <title>.txt"."""
+        name, seen_location = None, False
+
+        # the kindle does not remove deleted highlights from the filesystem
+        # store; we keep all lines in memory to check for sequence matches
+        # above a dynamic threshold to skip apparent duplicates, preferring
+        # the most recent highlight
+        h_list = list(highlights)
+        for idx, line in enumerate(h_list):
+            if not line.strip():
+                continue
+            if line.strip() == "==========":
+                name, seen_location = None, False
+                continue
+
+            if not name:
+                # title line "Title (Author)": author is the last (...) group
+                authors = re.findall(r'\((.*?)\)', line)
+                cut = line.rfind("(")
+                # sanitize the title too so "/" or ":" cannot break the path
+                title = self.sanitize(line[:cut] if cut != -1 else line)
+                if authors:
+                    name = f"{self.sanitize(authors[-1])} - {title}"
+                else:
+                    name = title or "untitled"  # never leave name falsy
+            elif not seen_location:
+                seen_location = True  # second line of a record is skipped
+            else:
+                if idx+5 < len(h_list):
+                    # next record's text is assumed 5 lines ahead; skip this
+                    # line if their longest common run exceeds 75% of the shorter
+                    next_h = h_list[idx+5]
+                    threshold = len(min(line, next_h, key=len))*3/4
+                    s = difflib.SequenceMatcher(None, line, next_h)
+                    m = s.find_longest_match(0, len(line), 0, len(next_h))
+                    if m.size > threshold:
+                        continue
+                with open(name + ".txt", "a", encoding="utf-8") as record:
+                    record.write(line + "\n")
+
+if __name__ == "__main__":
+    # command-line entry point; --path is the only (mandatory) option
+    summary = "ember: process kindle highlights sequentially into unique files"
+    cli = argparse.ArgumentParser(description=summary)
+    cli.add_argument("--path", dest="path", type=str, action="store",
+                     required=True, help="path to highlights file")
+    cli_args = cli.parse_args()
+
+    # hand the given highlights file to Ember for processing
+    Ember(cli_args.path).process()