diff options
author | ale <ale@incal.net> | 2015-06-28 21:57:59 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2015-06-28 21:57:59 +0100 |
commit | 63bd51e06b32d48878da68df8931809d42996df1 (patch) | |
tree | 40a6e124545015654c56d0600221530301b78fce /gen-ignores.py | |
parent | aa6e67d7b2996b3b3c4e93ad6608c5753f03f03b (diff) | |
download | crawl-63bd51e06b32d48878da68df8931809d42996df1.tar.gz crawl-63bd51e06b32d48878da68df8931809d42996df1.zip |
add ignore list from ArchiveBot
Diffstat (limited to 'gen-ignores.py')
-rwxr-xr-x | gen-ignores.py | 31 |
1 files changed, 31 insertions, 0 deletions
#!/usr/bin/python
#
# Parse ArchiveBot ignore regexp patterns and generate a Go source
# file with a global variable including all of them.
#
# Invoke with a single argument, the location of a checked-out copy of
# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
#
# The generated Go source is written to standard output; per-file
# errors are reported on standard error and the offending file is
# skipped. The script runs unchanged on Python 2 and Python 3.
#

import glob
import io
import json
import os
import sys

# Opening and closing lines of the generated Go source.
GO_HEADER = 'package crawl\n\nvar defaultIgnorePatterns = []string{'
GO_FOOTER = '}'


def is_re2_compatible(pattern):
    """Return True unless *pattern* contains a construct this script filters.

    RE2 does not support backreferences or other fancy PCRE constructs.
    This excludes <10 patterns from the ignore list.
    """
    # NOTE(review): the first literal below is two backslash characters
    # followed by '1'. A backreference in a JSON-decoded pattern would
    # normally contain a single backslash, and references other than
    # group 1 or lookarounds other than '(?!' are not matched either.
    # Confirm against the pattern files before relying on this filter.
    return not ('\\\\1' in pattern or '(?!' in pattern)


def load_patterns(path):
    """Return the filtered list of patterns stored in the JSON file *path*.

    The file must hold a JSON object with a 'patterns' list. Raises
    EnvironmentError if the file cannot be read, ValueError if it is not
    valid JSON (or not valid UTF-8), and KeyError/TypeError if the
    document does not have the expected shape.
    """
    # JSON is UTF-8 by specification; do not depend on the locale.
    with io.open(path, encoding='utf-8') as fd:
        patterns = json.load(fd)['patterns']
    return [p for p in patterns if is_re2_compatible(p)]


def generate(ignore_dir, err=None):
    """Return the Go source listing every usable pattern under *ignore_dir*.

    Each '*.json' file contributes a comment line with its base name
    followed by one Go string literal per pattern. Files are processed
    in sorted order so the output is reproducible across runs. A file
    that cannot be loaded is reported on *err* (default: sys.stderr) and
    contributes nothing to the output.
    """
    if err is None:
        err = sys.stderr
    lines = [GO_HEADER]
    for fn in sorted(glob.glob(os.path.join(ignore_dir, '*.json'))):
        # Load the whole file before emitting anything for it, so a bad
        # file never leaves an orphan comment or partial entries behind.
        try:
            patterns = load_patterns(fn)
        except (EnvironmentError, ValueError, KeyError, TypeError) as e:
            err.write('error in %s: %s\n' % (fn, e))
            continue
        lines.append('\n\t// %s' % os.path.basename(fn))
        # A JSON string literal is used as the Go string literal.
        lines.extend('\t%s,' % json.dumps(p) for p in patterns)
    lines.append(GO_FOOTER)
    return '\n'.join(lines) + '\n'


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s <path-to-ignore_patterns-dir>\n' % argv[0])
        return 2
    sys.stdout.write(generate(argv[1]))
    return 0


if __name__ == '__main__':
    sys.exit(main())