aboutsummaryrefslogtreecommitdiff
path: root/gen-ignores.py
diff options
context:
space:
mode:
authorale <ale@incal.net>2015-06-28 21:57:59 +0100
committerale <ale@incal.net>2015-06-28 21:57:59 +0100
commit63bd51e06b32d48878da68df8931809d42996df1 (patch)
tree40a6e124545015654c56d0600221530301b78fce /gen-ignores.py
parentaa6e67d7b2996b3b3c4e93ad6608c5753f03f03b (diff)
downloadcrawl-63bd51e06b32d48878da68df8931809d42996df1.tar.gz
crawl-63bd51e06b32d48878da68df8931809d42996df1.zip
add ignore list from ArchiveBot
Diffstat (limited to 'gen-ignores.py')
-rwxr-xr-xgen-ignores.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/gen-ignores.py b/gen-ignores.py
new file mode 100755
index 0000000..25b3cac
--- /dev/null
+++ b/gen-ignores.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+#
+# Parse ArchiveBot ignore regexp patterns and generate a Go source
+# file with a global variable including all of them.
+#
+# Invoke with a single argument, the location of a checked-out copy of
+# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
+#
+
+import glob
+import json
+import os
+import sys
+
# Group constructs that RE2 (and therefore Go's regexp package) rejects:
# positive/negative lookahead and positive/negative lookbehind.
_UNSUPPORTED_GROUPS = ('(?=', '(?!', '(?<=', '(?<!')


def _has_backreference(pattern):
    """Return True if the regexp contains a numbered backreference (\\1..\\9).

    The pattern is scanned one escape sequence at a time, so an escaped
    backslash followed by a digit (a literal backslash and a literal
    digit) is not mistaken for a backreference.
    """
    i = 0
    last = len(pattern) - 1
    while i < last:
        if pattern[i] == '\\':
            if pattern[i + 1] in '123456789':
                return True
            # Skip the escaped character so it cannot start a new escape.
            i += 2
        else:
            i += 1
    return False


def _is_re2_compatible(pattern):
    """Return True if the regexp avoids the PCRE-only constructs we know of.

    RE2 does not support backreferences or lookaround assertions; such
    patterns are left out of the generated list. The lookaround test is a
    plain substring match, so it may also drop a pattern that merely
    contains one of these character sequences literally.
    """
    if _has_backreference(pattern):
        return False
    return not any(group in pattern for group in _UNSUPPORTED_GROUPS)


def _load_patterns(path):
    """Return the 'patterns' list stored in one ignore-pattern JSON file."""
    with open(path) as fd:
        return json.load(fd)['patterns']


def main(argv, out=None):
    """Write a Go source file declaring defaultIgnorePatterns.

    argv is the full argument vector; argv[1] is the directory holding the
    ArchiveBot ignore pattern '*.json' files. Output goes to 'out'
    (sys.stdout when omitted). Files that cannot be read or parsed are
    reported on stderr and skipped. Returns the process exit status:
    0 on success, 2 on a usage error.
    """
    if out is None:
        out = sys.stdout
    if len(argv) != 2:
        sys.stderr.write('usage: %s <ignore_patterns_dir>\n' % argv[0])
        return 2

    out.write('package crawl\n\nvar defaultIgnorePatterns = []string{\n')
    # Sort the file names so the generated source is reproducible:
    # glob.glob() returns entries in arbitrary filesystem order.
    for fn in sorted(glob.glob(os.path.join(argv[1], '*.json'))):
        try:
            patterns = [p for p in _load_patterns(fn)
                        if _is_re2_compatible(p)]
        except Exception as e:
            # Best effort: one broken file must not abort the whole run.
            sys.stderr.write('error in %s: %s\n' % (fn, e))
            continue
        # The section header is only emitted once the file parsed cleanly.
        out.write('\n\t// %s\n' % os.path.basename(fn))
        for p in patterns:
            # A JSON string literal is used as the Go string literal.
            # NOTE(review): json.dumps escapes non-BMP characters as
            # surrogate pairs, which Go rejects -- confirm that the
            # pattern files never contain such characters.
            out.write('\t%s,\n' % json.dumps(p))
    out.write('}\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
+