blob: cd372f841458ee803955e6b33d757ed497e89952 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
#!/usr/bin/python
#
# Parse ArchiveBot ignore regexp patterns and generate a Go source
# file with a global variable including all of them.
#
# Invoke with a single argument, the location of a checked-out copy of
# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
#
import glob
import json
import os
import re
import sys
# Numeric backreferences (\1, \2, ...) are not supported by RE2, the
# regexp engine used by Go.
_BACKREFERENCE_RX = re.compile(r'\\[0-9]')

# Lookahead assertions are not supported by RE2 either.
_LOOKAHEAD_MARKERS = ('(?!', '(?=')

# Three or more consecutive (uppercase) percent-encoded bytes. Patterns
# containing such a run are left out of the generated list.
# NOTE(review): the reason for this exclusion is not documented here.
_PERCENT_RUN_RX = re.compile(r'(?:%[0-9A-F]{2}){3,}')


def convert_pattern(pattern):
    """Translate one ArchiveBot ignore pattern for the generated Go list.

    Returns the pattern with every '{primary_netloc}' placeholder
    replaced by '.*', or None if the pattern has to be skipped.
    """
    # RE2 does not support backreferences or other fancy PCRE
    # constructs. This excludes <10 patterns from the ignore list.
    if _BACKREFERENCE_RX.search(pattern):
        return None
    if any(marker in pattern for marker in _LOOKAHEAD_MARKERS):
        return None
    if _PERCENT_RUN_RX.search(pattern):
        return None
    return pattern.replace('{primary_netloc}', '.*')


def render_file(path):
    """Return the Go source lines generated for one ignore-pattern file.

    The file must contain a JSON object with a 'patterns' list of
    strings. The first returned line is a comment naming the file
    (preceded by a blank line); each following line is one Go string
    literal. Any I/O, JSON or schema error propagates to the caller.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on
    # the locale of the machine running the generator.
    with open(path, encoding='utf-8') as fd:
        patterns = json.load(fd)['patterns']
    lines = ['\n\t// %s' % os.path.basename(path)]
    for raw in patterns:
        converted = convert_pattern(raw)
        if converted is not None:
            # A JSON string literal is also a valid Go string literal.
            lines.append('\t%s,' % json.dumps(converted))
    return lines


def main(argv):
    """Write the Go source file to stdout and return the exit status.

    argv[1] is the directory holding the ArchiveBot '*.json' ignore
    pattern files. Files that cannot be processed are reported on
    stderr and skipped; they do not change the exit status.
    """
    if len(argv) < 2:
        print('usage: %s <path to ArchiveBot db/ignore_patterns>' % argv[0],
              file=sys.stderr)
        return 2
    archivebot_ignore_path = argv[1]

    print('package crawl\n\nvar defaultIgnorePatterns = []string{')
    # glob.glob() returns files in a filesystem-dependent order; sort so
    # that the generated source is reproducible.
    for fn in sorted(glob.glob(os.path.join(archivebot_ignore_path, '*.json'))):
        # Render the whole file before printing anything, so a broken
        # file contributes nothing to the output.
        try:
            lines = render_file(fn)
        except Exception as e:
            # Best effort: report the broken file and keep going.
            print('error in %s: %s' % (fn, e), file=sys.stderr)
            continue
        for line in lines:
            print(line)
    print('}')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|