summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2022-03-09 07:21:53 +0000
committerJordan <me@jordan.im>2022-03-09 07:21:53 +0000
commit64f16b6a7684a4054f46b009b1cb5a0c3751c6dd (patch)
treeeb17658e881fd19ac4a8c40d704dace1b55cddb9
downloadbin-64f16b6a7684a4054f46b009b1cb5a0c3751c6dd.tar.gz
bin-64f16b6a7684a4054f46b009b1cb5a0c3751c6dd.zip
initial commit
-rwxr-xr-xcgit-last-modified.py31
-rwxr-xr-xdoi-to-filename.py62
-rwxr-xr-xia-upload-crawl.sh11
-rwxr-xr-xia-upload-gs.sh11
4 files changed, 115 insertions, 0 deletions
diff --git a/cgit-last-modified.py b/cgit-last-modified.py
new file mode 100755
index 0000000..2eac23c
--- /dev/null
+++ b/cgit-last-modified.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""Record each repository's last-modified time for cgit.

For every git repository under REPOS, fetch all remotes, take the
committer timestamp of HEAD, and write it (UTC, "YYYY-MM-DD HH:MM:SS")
to <repo>/info/web/last-modified, which cgit reads for its "idle" column.
"""

import git
import os
import sys
from datetime import datetime, timezone

REPOS = "/var/www/git.jordan.im/repositories"

for name in os.listdir(REPOS):
    repo_path = os.path.join(REPOS, name)
    if not os.path.isdir(repo_path):
        continue

    print(f"processing {name}...")

    try:
        repo = git.Repo(repo_path)
        # refresh remote refs so HEAD reflects the latest upstream state
        for remote in repo.remotes:
            remote.fetch()
        md = repo.head.commit.committed_datetime
        md = md.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        # best-effort: report which repo failed (with a newline so messages
        # don't run together) and move on to the next one
        sys.stderr.write(f"{name}: {e}\n")
        continue

    d = os.path.join(repo_path, "info/web/")
    os.makedirs(d, exist_ok=True)

    # context manager guarantees the file is closed even if write() fails
    with open(os.path.join(d, "last-modified"), "w") as f:
        f.write(md)
diff --git a/doi-to-filename.py b/doi-to-filename.py
new file mode 100755
index 0000000..9bacf97
--- /dev/null
+++ b/doi-to-filename.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import sys
import argparse
import urllib.request
# xml.etree.cElementTree was deprecated in 3.3 and removed in Python 3.9;
# the plain module uses the C accelerator automatically when available.
import xml.etree.ElementTree as ET
+
def doi_to_filename(doi):
    """Resolve *doi* via the doi.org content-negotiation API and return a
    filename derived from the article's metadata.

    Raises ValueError when the network request fails or when the XML
    response cannot be parsed (both usually mean an invalid DOI).
    """
    # prefer crossref unixref XML; accept RDF as a lower-priority fallback
    headers = {'Accept': 'application/vnd.crossref.unixref+xml;q=1,' +
               'application/rdf+xml;q=0.5'}

    # doi.org API request
    try:
        resp = request('https://doi.org/%s' % doi, headers)
    except Exception as err:
        raise ValueError('error making API request; invalid DOI? %s' % err)

    # derive filename from XML response
    try:
        return get_filename_from_xml(get_xml_root(resp))
    except Exception as err:
        raise ValueError('error parsing XML response; invalid DOI? %s' % err)
+
def request(url, headers):
    """HTTP GET *url* with the given *headers* dict; return the response
    body decoded as UTF-8.

    Raises urllib.error.URLError (or a subclass) on network failure.
    """
    req = urllib.request.Request(url, headers=headers)
    # close the response even if read()/decode() raises — the original
    # left the connection to be reclaimed only by garbage collection
    with urllib.request.urlopen(req) as r:
        return r.read().decode('utf-8')
+
def get_xml_root(resp):
    """Parse an XML document string and return its root element."""
    # ET.fromstring already yields the document's root element, so no
    # ElementTree round-trip is needed
    return ET.fromstring(resp)
+
def get_filename_from_xml(xml_root):
    """Build a filename string from crossref unixref XML metadata.

    Format: "<title> (<year>) - <surname>, <surname>"; the year and
    author segments are omitted when absent.

    Raises ValueError when no usable <title> element exists — previously a
    missing element crashed with AttributeError (None.text) before the
    intended ValueError could be raised.
    """
    title_el = xml_root.find('.//title')
    if title_el is None or not title_el.text:
        raise ValueError('title could not be parsed, aborting...')
    title = title_el.text

    # year is optional: absent element or empty text yields no segment
    year_el = xml_root.find('.//year')
    year = ' (%s)' % year_el.text if year_el is not None and year_el.text else ''

    # collect author surnames, skipping empty <surname/> elements which
    # would otherwise inject None into join() and raise TypeError
    surnames = [s.text for s in xml_root.iter('surname') if s.text]
    authors = ' - ' + ', '.join(surnames) if surnames else ''

    return title + year + authors
diff --git a/ia-upload-crawl.sh b/ia-upload-crawl.sh
new file mode 100755
index 0000000..7610545
--- /dev/null
+++ b/ia-upload-crawl.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Upload a crawl directory to the Internet Archive with the `ia` CLI.
# Usage: ia-upload-crawl.sh <title>-<YYYY-M-D>[...]
# The directory name encodes both the item title and the crawl date.

# title: directory name with the trailing -YYYY-M-D... suffix removed
# (quote "$1" to survive spaces/globs in the path)
title="$(echo "$1" | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
# date: the YYYY-M-D fragment embedded in the directory name
date="$(echo "$1" | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
id="$title-$date"
# join ALL seed URLs with ", " — the original 'N;s/\n/, /' only merged
# adjacent line pairs, leaving newlines in lists of three or more seeds
seeds="$(sed ':a;N;$!ba;s/\n/, /g' "$1/seed_urls")"

ia upload "$id" "$1"/*.gz --metadata="mediatype:web" \
--metadata="date:$date" --metadata="title:$title" \
--metadata="description:recursive crawl of $seeds using https://git.jordan.im/crawl, \
taken on $date" --metadata="source:$title"
diff --git a/ia-upload-gs.sh b/ia-upload-gs.sh
new file mode 100755
index 0000000..1d5b397
--- /dev/null
+++ b/ia-upload-gs.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Upload a grab-site/wpull crawl directory to the Internet Archive.
# Usage: ia-upload-gs.sh <title>-<YYYY-M-D>[...]
# The directory name encodes both the item title and the crawl date.

# title: directory name with the trailing -YYYY-M-D... suffix removed
# (quote "$1" to survive spaces/globs in the path)
title="$(echo "$1" | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
# date: the YYYY-M-D fragment embedded in the directory name
date="$(echo "$1" | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
id="$title-$date"
start_url="$(cat "$1/start_url")"

ia upload "$id" "$1"/*.gz "$1"/*.cdx --metadata="mediatype:web" \
--metadata="date:$date" --metadata="title:$title" \
--metadata="description:recursive crawl of $start_url using grab-site/wpull, \
taken on $date" --metadata="source:$title"