From 64f16b6a7684a4054f46b009b1cb5a0c3751c6dd Mon Sep 17 00:00:00 2001
From: Jordan
Date: Wed, 9 Mar 2022 07:21:53 +0000
Subject: initial commit

---
 cgit-last-modified.py | 31 ++++++++++++++++++++++++++
 doi-to-filename.py    | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 ia-upload-crawl.sh    | 11 +++++++++
 ia-upload-gs.sh       | 11 +++++++++
 4 files changed, 115 insertions(+)
 create mode 100755 cgit-last-modified.py
 create mode 100755 doi-to-filename.py
 create mode 100755 ia-upload-crawl.sh
 create mode 100755 ia-upload-gs.sh

diff --git a/cgit-last-modified.py b/cgit-last-modified.py
new file mode 100755
index 0000000..2eac23c
--- /dev/null
+++ b/cgit-last-modified.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+import git
+import os
+import sys
+from datetime import datetime, timezone
+
+REPOS = "/var/www/git.jordan.im/repositories"
+
+for x in os.listdir(REPOS):
+    if not os.path.isdir(os.path.join(REPOS, x)):
+        continue
+
+    print(f"processing {x}...")
+
+    try:
+        repo = git.Repo(os.path.join(REPOS, x))
+        for remote in repo.remotes:
+            remote.fetch()
+        md = repo.head.commit.committed_datetime
+        md = md.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+    except Exception as e:
+        sys.stderr.write(str(e))
+        continue
+
+    d = os.path.join(REPOS, x, "info/web/")
+    os.makedirs(d, exist_ok=True)
+
+    f = open(os.path.join(d, "last-modified"), "w")
+    f.write(md)
+    f.close()
diff --git a/doi-to-filename.py b/doi-to-filename.py
new file mode 100755
index 0000000..9bacf97
--- /dev/null
+++ b/doi-to-filename.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+import urllib.request
+import xml.etree.cElementTree as ET
+
+def doi_to_filename(doi):
+    url = 'https://doi.org/%s' % doi
+    headers = {'Accept': 'application/vnd.crossref.unixref+xml;q=1,'
+               'application/rdf+xml;q=0.5'}
+    # doi.org API request
+    try:
+        resp = request(url, headers)
+    except Exception as err:
+        raise ValueError('error making API request; invalid DOI? %s' % err)
+
+    # derive filename from XML response
+    try:
+        xml_root = get_xml_root(resp)
+        filename = get_filename_from_xml(xml_root)
+    except Exception as err:
+        raise ValueError('error parsing XML response; invalid DOI? %s' % err)
+
+    return filename
+
+def request(url, headers):
+    conn = urllib.request.Request(
+        url,
+        headers=headers
+    )
+    r = urllib.request.urlopen(conn)
+    return r.read().decode('utf-8')
+
+def get_xml_root(resp):
+    tree = ET.ElementTree(ET.fromstring(resp))
+    return tree.getroot()
+
+def get_filename_from_xml(xml_root):
+    title = None
+    year = None
+    authors = []
+
+    title = xml_root.find('.//title').text
+    if not title:
+        raise ValueError('title could not be parsed, aborting...')
+
+    year = xml_root.find('.//year').text
+    if year:
+        year = ' (%s)' % year
+    else:
+        year = ''
+
+    for a in xml_root.iter('surname'):
+        authors.append(a.text)
+    if authors:
+        authors = ' - ' + ', '.join(authors)
+    else:
+        authors = ''
+
+    filename = title + year + authors
+    return filename
diff --git a/ia-upload-crawl.sh b/ia-upload-crawl.sh
new file mode 100755
index 0000000..7610545
--- /dev/null
+++ b/ia-upload-crawl.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+title="$(echo $1 | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
+date="$(echo $1 | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
+id=$title-$date
+seeds="$(sed 'N;s/\n/, /' $1/seed_urls)"
+
+ia upload $id $1/*.gz --metadata="mediatype:web" \
+--metadata="date:$date" --metadata="title:$title" \
+--metadata="description:recursive crawl of $seeds using https://git.jordan.im/crawl, \
+taken on $date" --metadata="source:$title"
diff --git a/ia-upload-gs.sh b/ia-upload-gs.sh
new file mode 100755
index 0000000..1d5b397
--- /dev/null
+++ b/ia-upload-gs.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+title="$(echo $1 | sed 's/\(-[0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+.*\)//g')"
+date="$(echo $1 | sed -n 's/.*\([0-9][0-9][0-9][0-9]-[0-9]\+-[0-9]\+\).*/\1/p')"
+id=$title-$date
+start_url="$(cat $1/start_url)"
+
+ia upload $id $1/*.gz $1/*.cdx --metadata="mediatype:web" \
+--metadata="date:$date" --metadata="title:$title" \
+--metadata="description:recursive crawl of $start_url using grab-site/wpull, \
+taken on $date" --metadata="source:$title"
-- 
cgit v1.2.3-54-g00ecf