about summary refs log tree commit diff
path: root/gen-ignores.py
diff options
context:
space:
mode:
Diffstat (limited to 'gen-ignores.py')
-rwxr-xr-x gen-ignores.py 23
1 files changed, 11 insertions, 12 deletions
diff --git a/gen-ignores.py b/gen-ignores.py
index ede0529..cd372f8 100755
--- a/gen-ignores.py
+++ b/gen-ignores.py
@@ -10,26 +10,25 @@
import glob
import json
import os
+import re
import sys
archivebot_ignore_path = sys.argv[1]
-unsupported = ['\\1', '\\2', '(?!', '(?=']
-
-print 'package crawl\n\nvar defaultIgnorePatterns = []string{'
-
+print('package crawl\n\nvar defaultIgnorePatterns = []string{')
for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')):
try:
with open(fn) as fd:
- print '\n\t// %s' % os.path.basename(fn)
+ print('\n\t// %s' % os.path.basename(fn))
for p in json.load(fd)['patterns']:
- if any(x in p for x in unsupported):
+ if re.search(r'\\[0-9]', p) or ('(?!' in p) or ('(?=' in p):
# RE2 does not support backreferences or other
# fancy PCRE constructs. This excludes <10
# patterns from the ignore list.
continue
- print '\t%s,' % json.dumps(p)
- except Exception, e:
- print >>sys.stderr, 'error in %s: %s' % (fn, e)
-
-print '}'
-
+ if re.search(r'(?:%[0-9A-F]{2}){3,}', p):
+ continue
+ p = p.replace('{primary_netloc}', '.*')
+ print('\t%s,' % json.dumps(p))
+ except Exception as e:
+ print('error in %s: %s' % (fn, e), file=sys.stderr)
+print('}')