summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2022-03-09 07:21:53 +0000
committerJordan <me@jordan.im>2022-03-09 07:21:53 +0000
commit64f16b6a7684a4054f46b009b1cb5a0c3751c6dd (patch)
treeeb17658e881fd19ac4a8c40d704dace1b55cddb9
downloadbin-64f16b6a7684a4054f46b009b1cb5a0c3751c6dd.tar.gz
bin-64f16b6a7684a4054f46b009b1cb5a0c3751c6dd.zip
initial commit
-rwxr-xr-xcgit-last-modified.py31
-rwxr-xr-xdoi-to-filename.py62
-rwxr-xr-xia-upload-crawl.sh11
-rwxr-xr-xia-upload-gs.sh11
4 files changed, 115 insertions, 0 deletions
diff --git a/cgit-last-modified.py b/cgit-last-modified.py
new file mode 100755
index 0000000..2eac23c
--- /dev/null
+++ b/cgit-last-modified.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""Record each repository's last-modified time for cgit.

For every git repository under REPOS, fetch all remotes, take the
committer timestamp of HEAD, and write it (UTC, "YYYY-MM-DD HH:MM:SS")
to <repo>/info/web/last-modified, which cgit reads for its "idle" column.
"""

import git
import os
import sys
from datetime import datetime, timezone

REPOS = "/var/www/git.jordan.im/repositories"

for name in os.listdir(REPOS):
    repo_path = os.path.join(REPOS, name)
    if not os.path.isdir(repo_path):
        continue

    print(f"processing {name}...")

    try:
        repo = git.Repo(repo_path)
        # refresh remote refs so HEAD reflects the latest upstream state
        for remote in repo.remotes:
            remote.fetch()
        md = repo.head.commit.committed_datetime
        md = md.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        # best-effort: report which repo failed (with a newline so messages
        # don't run together) and move on to the next one
        sys.stderr.write(f"{name}: {e}\n")
        continue

    d = os.path.join(repo_path, "info/web/")
    os.makedirs(d, exist_ok=True)

    # context manager guarantees the file is closed even if write() fails
    with open(os.path.join(d, "last-modified"), "w") as f:
        f.write(md)
diff --git a/doi-to-filename.py b/doi-to-filename.py
new file mode 100755
index 0000000..9bacf97
--- /dev/null
+++ b/doi-to-filename.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import sys
import argparse
import urllib.request
# xml.etree.cElementTree was deprecated in 3.3 and removed in Python 3.9;
# the plain module uses the C accelerator automatically when available.
import xml.etree.ElementTree as ET
+
def doi_to_filename(doi):
    """Resolve *doi* via the doi.org content-negotiation API and return a
    filename derived from the article's metadata.

    Raises ValueError when the network request fails or when the XML
    response cannot be parsed (both usually mean an invalid DOI).
    """
    # prefer crossref unixref XML; accept RDF as a lower-priority fallback
    headers = {'Accept': 'application/vnd.crossref.unixref+xml;q=1,' +
               'application/rdf+xml;q=0.5'}

    # doi.org API request
    try:
        resp = request('https://doi.org/%s' % doi, headers)
    except Exception as err:
        raise ValueError('error making API request; invalid DOI? %s' % err)

    # derive filename from XML response
    try:
        return get_filename_from_xml(get_xml_root(resp))
    except Exception as err:
        raise ValueError('error parsing XML response; invalid DOI? %s' % err)
+
def request(url, headers):
    """HTTP GET *url* with the given *headers* dict; return the response
    body decoded as UTF-8.

    Raises urllib.error.URLError (or a subclass) on network failure.
    """
    req = urllib.request.Request(url, headers=headers)
    # close the response even if read()/decode() raises — the original
    # left the connection to be reclaimed only by garbage collection
    with urllib.request.urlopen(req) as r:
        return r.read().decode('utf-8')
+
def get_xml_root(resp):
    """Parse an XML document string and return its root element."""
    # ET.fromstring already yields the document's root element, so no
    # ElementTree round-trip is needed
    return ET.fromstring(resp)
+
def get_filename_from_xml(xml_root):
    """Build a filename string from crossref unixref XML metadata.

    Format: "<title> (<year>) - <surname>, <surname>"; the year and
    author segments are omitted when absent.

    Raises ValueError when no usable <title> element exists — previously a
    missing element crashed with AttributeError (None.text) before the
    intended ValueError could be raised.
    """
    title_el = xml_root.find('.//title')
    if title_el is None or not title_el.text:
        raise ValueError('title could not be parsed, aborting...')
    title = title_el.text

    # year is optional: absent element or empty text yields no segment
    year_el = xml_root.find('.//year')
    year = ' (%s)' % year_el.text if year_el is not None and year_el.text else ''

    # collect author surnames, skipping empty <surname/> elements which
    # would otherwise inject None into join() and raise TypeError
    surnames = [s.text for s in xml_root.iter('surname') if s.text]
    authors = ' - ' + ', '.join(surnames) if surnames else ''

    return title + year + authors
diff --git a/ia-upload-crawl.sh b/ia-upload-crawl.sh
new file mode 100755
index 0000000..7610545
--- /dev/null
+++ b/ia-upload-crawl.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Upload a crawl directory to the Internet Archive with the `ia` CLI.
# Usage: ia-upload-crawl.sh <title>-<YYYY-M-D>[...]
# The directory name encodes both the item title and the crawl date.

# title: directory name with the trailing -YYYY-M-D... suffix removed
# (quote "$1" to survive spaces/globs in the path)
title="$(echo "$1" | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
# date: the YYYY-M-D fragment embedded in the directory name
date="$(echo "$1" | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
id="$title-$date"
# join ALL seed URLs with ", " — the original 'N;s/\n/, /' only merged
# adjacent line pairs, leaving newlines in lists of three or more seeds
seeds="$(sed ':a;N;$!ba;s/\n/, /g' "$1/seed_urls")"

ia upload "$id" "$1"/*.gz --metadata="mediatype:web" \
--metadata="date:$date" --metadata="title:$title" \
--metadata="description:recursive crawl of $seeds using https://git.jordan.im/crawl, \
taken on $date" --metadata="source:$title"
diff --git a/ia-upload-gs.sh b/ia-upload-gs.sh
new file mode 100755
index 0000000..1d5b397
--- /dev/null
+++ b/ia-upload-gs.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Upload a grab-site/wpull crawl directory to the Internet Archive.
# Usage: ia-upload-gs.sh <title>-<YYYY-M-D>[...]
# The directory name encodes both the item title and the crawl date.

# title: directory name with the trailing -YYYY-M-D... suffix removed
# (quote "$1" to survive spaces/globs in the path)
title="$(echo "$1" | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
# date: the YYYY-M-D fragment embedded in the directory name
date="$(echo "$1" | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
id="$title-$date"
start_url="$(cat "$1/start_url")"

ia upload "$id" "$1"/*.gz "$1"/*.cdx --metadata="mediatype:web" \
--metadata="date:$date" --metadata="title:$title" \
--metadata="description:recursive crawl of $start_url using grab-site/wpull, \
taken on $date" --metadata="source:$title"