aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jordan <me@jordan.im> 2022-02-10 16:53:48 -0700
committer Jordan <me@jordan.im> 2022-02-10 16:53:48 -0700
commit 3897d5bbdcc9aa52d88b6602e3542e690ee74f6c (patch)
tree 4ec03fcd6ee70964e0f032762f98f9d484d6e775
parent 07f4f6e08341ba60a7c49ed55c8e2682147b5156 (diff)
download crawl-3897d5bbdcc9aa52d88b6602e3542e690ee74f6c.tar.gz
crawl-3897d5bbdcc9aa52d88b6602e3542e690ee74f6c.zip
gen-ignores, ignore_patterns: update to exclude unsupported Perl syntax, backreferences
-rwxr-xr-x gen-ignores.py 6
-rw-r--r-- ignore_patterns.go 9
2 files changed, 5 insertions, 10 deletions
diff --git a/gen-ignores.py b/gen-ignores.py
index 25b3cac..ede0529 100755
--- a/gen-ignores.py
+++ b/gen-ignores.py
@@ -13,13 +13,16 @@ import os
import sys
archivebot_ignore_path = sys.argv[1]
+unsupported = ['\\1', '\\2', '(?!', '(?=']
+
print 'package crawl\n\nvar defaultIgnorePatterns = []string{'
+
for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')):
try:
with open(fn) as fd:
print '\n\t// %s' % os.path.basename(fn)
for p in json.load(fd)['patterns']:
- if '\\\\1' in p or '(?!' in p:
+ if any(x in p for x in unsupported):
# RE2 does not support backreferences or other
# fancy PCRE constructs. This excludes <10
# patterns from the ignore list.
@@ -27,5 +30,6 @@ for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')):
print '\t%s,' % json.dumps(p)
except Exception, e:
print >>sys.stderr, 'error in %s: %s' % (fn, e)
+
print '}'
diff --git a/ignore_patterns.go b/ignore_patterns.go
index 5d32109..65a4550 100644
--- a/ignore_patterns.go
+++ b/ignore_patterns.go
@@ -74,7 +74,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)Sp%C3%A9cial:((Index|Pr%C3%A9fixes|Prefixes)|(Pages_li%C3%A9es|PagesLi%C3%A9es|Pages_liees|PagesLiees)|Contributions|(Liste_des_fichiers|ListeDesFichiers|Liste_des_images|ListeDesImages)|(Liste_des_utilisateurs|ListeDesUtilisateurs|Utilisateurs))/.*/\\2/",
"/Discussion_utilisateur:.+/Discussion_utilisateur:",
"/User_blog:.+/User_blog:",
"/Utilisateur:.+/Utilisateur:",
@@ -111,7 +110,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)Special:(PrefixIndex|WhatLinksHere|Contributions|ListFiles|ListUsers)/.*/\\2/",
"/User_talk:.+/User_talk:",
"/User_blog:.+/User_blog:",
"/User:.+/User:",
@@ -185,7 +183,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)%ED%8A%B9%EC%88%98:(%EC%A0%91%EB%91%90%EC%96%B4%EC%B0%BE%EA%B8%B0|(%EA%B0%80%EB%A6%AC%ED%82%A4%EB%8A%94%EB%AC%B8%EC%84%9C|%EB%A7%81%ED%81%AC%ED%95%98%EB%8A%94%EB%AC%B8%EC%84%9C)|(%EA%B8%B0%EC%97%AC|%EA%B8%B0%EC%97%AC%EB%AA%A9%EB%A1%9D)|(%ED%8C%8C%EC%9D%BC%EB%AA%A9%EB%A1%9D|%EA%B7%B8%EB%A6%BC%EB%AA%A9%EB%A1%9D|%ED%8C%8C%EC%9D%BC|%EA%B7%B8%EB%A6%BC)|(%EC%82%AC%EC%9A%A9%EC%9E%90%EB%AA%A9%EB%A1%9D|%EC%82%AC%EC%9A%A9%EC%9E%90))/.*/\\2/",
"/%EC%82%AC%EC%9A%A9%EC%9E%90%ED%86%A0%EB%A1%A0:.+/%EC%82%AC%EC%9A%A9%EC%9E%90%ED%86%A0%EB%A1%A0:",
"/User_blog:.+/User_blog:",
"/%EC%82%AC%EC%9A%A9%EC%9E%90:.+/%EC%82%AC%EC%9A%A9%EC%9E%90:",
@@ -493,7 +490,6 @@ var defaultIgnorePatterns = []string{
"^https?://[^/]+\\.livejournal\\.com/.*/fbmerging\\.[^/]+\\.text$",
"^https?://[^/]+\\.livejournal\\.com/.*/popup\\.[^/]+\\.text$",
"^https?://[^/]+\\.livejournal\\.com/.*/\\*sup_ru/ru/UTF-8/",
- "^https?://(?=[^/]+\\.livejournal\\.com/){primary_netloc}/(.*/)?https?%3A%2F%2F{primary_netloc}%2F",
"^https?://[^/]+\\.livejournal\\.com/.*/(photo/(\\{\\{|%7B%7B)photo\\.siteroot(\\}\\}|%7D%7D)/){2}",
"^https?://[^/]+\\.livejournal\\.com/.*/gtm\\.js$",
@@ -523,7 +519,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)Spezial:(Pr%C3%A4fixindex|(Linkliste|Verweisliste)|Beitr%C3%A4ge|(Dateien|Dateiliste)|(Benutzer|Benutzerliste))/.*/\\2/",
"/Benutzer_Diskussion:.+/Benutzer_Diskussion:",
"/User_blog:.+/User_blog:",
"/Benutzer:.+/Benutzer:",
@@ -625,7 +620,6 @@ var defaultIgnorePatterns = []string{
"SESSION_ID=",
"showcal\\.php",
"site=guestbook.+type=(?:ASC|DESC)",
- "/sites/all/(sites|modules|libraries|scripts|themes)/.+/\\1",
"Souprava=.+Souprava=.+",
"Special:Whatlinkshere",
"start-index=-\\d+",
@@ -684,7 +678,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:(%D0%A3%D0%BA%D0%B0%D0%B7%D0%B0%D1%82%D0%B5%D0%BB%D1%8C_%D0%BF%D0%BE_%D0%BD%D0%B0%D1%87%D0%B0%D0%BB%D1%83_%D0%BD%D0%B0%D0%B7%D0%B2%D0%B0%D0%BD%D0%B8%D1%8F|%D0%A1%D1%81%D1%8B%D0%BB%D0%BA%D0%B8_%D1%81%D1%8E%D0%B4%D0%B0|%D0%92%D0%BA%D0%BB%D0%B0%D0%B4|(%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%84%D0%B0%D0%B9%D0%BB%D0%BE%D0%B2|%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%B8%D0%B7%D0%BE%D0%B1%D1%80%D0%B0%D0%B6%D0%B5%D0%BD%D0%B8%D0%B9)|%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%83%D1%87%D0%B0%D1%81%D1%82%D0%BD%D0%B8%D0%BA%D0%BE%D0%B2)/.*/\\2/",
"/%D0%9E%D0%B1%D1%81%D1%83%D0%B6%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5_%D1%83%D1%87%D0%B0%D1%81%D1%82%D0%BD%D0%B8%D0%BA%D0%B0:.+/%D0%9E%D0%B1%D1%81%D1%83%D0%B6%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5_%D1%83%D1%87%D0%B0%D1%81%D1%82%D0%BD%D0%B8%D0%BA%D0%B0:",
"/User_blog:.+/User_blog:",
"/%D0%A3%D1%87%D0%B0%D1%81%D1%82%D0%BD%D0%B8%D0%BA:.+/%D0%A3%D1%87%D0%B0%D1%81%D1%82%D0%BD%D0%B8%D0%BA:",
@@ -792,7 +785,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)Especial:((P%C3%A1ginasPorPrefijo|P%C3%A1ginas_por_prefijo)|(LoQueEnlazaAqu%C3%AD|Lo_que_enlaza_aqu%C3%AD)|Contribuciones|(ListaIm%C3%A1genes|Lista_de_im%C3%A1genes)|(ListaUsuarios|Lista_de_usuarios))/.*/\\2/",
"/Usuario_discusi%C3%B3n:.+/Usuario_discusi%C3%B3n:",
"/User_blog:.+/User_blog:",
"/Usuario:.+/Usuario:",
@@ -831,7 +823,6 @@ var defaultIgnorePatterns = []string{
"[\\?&]undo(after)?=\\d+",
"^https?://a\\.wikia-beacon\\.com/__track/",
"^https?://beacon\\.wikia-services\\.com/__track/",
- "([\\?&]title=|/)%E7%89%B9%E5%88%A5:((%E5%89%8D%E6%96%B9%E4%B8%80%E8%87%B4%E3%83%9A%E3%83%BC%E3%82%B8%E4%B8%80%E8%A6%A7|%E5%A7%8B%E7%82%B9%E6%8C%87%E5%AE%9A%E3%83%9A%E3%83%BC%E3%82%B8%E4%B8%80%E8%A6%A7)|%E3%83%AA%E3%83%B3%E3%82%AF%E5%85%83|%E6%8A%95%E7%A8%BF%E8%A8%98%E9%8C%B2|(%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E4%B8%80%E8%A6%A7|%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%83%AA%E3%82%B9%E3%83%88)|(%E7%99%BB%E9%8C%B2%E5%88%A9%E7%94%A8%E8%80%85%E4%B8%80%E8%A6%A7|%E7%99%BB%E9%8C%B2%E5%88%A9%E7%94%A8%E8%80%85%E3%81%AE%E4%B8%80%E8%A6%A7))/.*/\\2/",
"/%E5%88%A9%E7%94%A8%E8%80%85%E3%83%BB%E3%83%88%E3%83%BC%E3%82%AF:.+/%E5%88%A9%E7%94%A8%E8%80%85%E3%83%BB%E3%83%88%E3%83%BC%E3%82%AF:",
"/User_blog:.+/User_blog:",
"/%E5%88%A9%E7%94%A8%E8%80%85:.+/%E5%88%A9%E7%94%A8%E8%80%85:",