From d85efeec01cd0a09dd31fd63e16919e74a82a2ed Mon Sep 17 00:00:00 2001
From: Jordan
Date: Tue, 24 Jan 2023 23:14:44 -0700
Subject: ignore patterns: merge updates from upstream, regenerate

---
 gen-ignores.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'gen-ignores.py')

diff --git a/gen-ignores.py b/gen-ignores.py
index ede0529..cd372f8 100755
--- a/gen-ignores.py
+++ b/gen-ignores.py
@@ -10,26 +10,25 @@
 import glob
 import json
 import os
+import re
 import sys
 
 archivebot_ignore_path = sys.argv[1]
-unsupported = ['\\1', '\\2', '(?!', '(?=']
-
-print 'package crawl\n\nvar defaultIgnorePatterns = []string{'
-
+print('package crawl\n\nvar defaultIgnorePatterns = []string{')
 for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')):
     try:
         with open(fn) as fd:
-            print '\n\t// %s' % os.path.basename(fn)
+            print('\n\t// %s' % os.path.basename(fn))
             for p in json.load(fd)['patterns']:
-                if any(x in p for x in unsupported):
+                if re.search(r'\\[0-9]', p) or ('(?!' in p) or ('(?=' in p):
                     # RE2 does not support backreferences or other
                     # fancy PCRE constructs. This excludes <10
                     # patterns from the ignore list.
                     continue
-                print '\t%s,' % json.dumps(p)
-    except Exception, e:
-        print >>sys.stderr, 'error in %s: %s' % (fn, e)
-
-print '}'
-
+                if re.search(r'(?:%[0-9A-F]{2}){3,}', p):
+                    continue
+                p = p.replace('{primary_netloc}', '.*')
+                print('\t%s,' % json.dumps(p))
+    except Exception as e:
+        print('error in %s: %s' % (fn, e), file=sys.stderr)
+print('}')
-- 
cgit v1.2.3-54-g00ecf