From 63bd51e06b32d48878da68df8931809d42996df1 Mon Sep 17 00:00:00 2001 From: ale Date: Sun, 28 Jun 2015 21:57:59 +0100 Subject: add ignore list from ArchiveBot --- cmd/crawl/crawl.go | 74 ++++++++- gen-ignores.py | 31 ++++ ignore_patterns.go | 453 +++++++++++++++++++++++++++++++++++++++++++++++++++++ scope.go | 28 ++++ 4 files changed, 585 insertions(+), 1 deletion(-) create mode 100755 gen-ignores.py create mode 100644 ignore_patterns.go diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index 63a5924..d68ac5e 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -13,6 +13,9 @@ import ( "os" "strconv" "strings" + "sync" + "sync/atomic" + "time" "git.autistici.org/ale/crawl" "git.autistici.org/ale/crawl/analysis" @@ -115,6 +118,74 @@ func NewSaveHandler(w *warc.Writer) crawl.Handler { } } +type crawlStats struct { + bytes int64 + start time.Time + + lock sync.Mutex + states map[int]int +} + +func (c *crawlStats) Update(resp *http.Response) { + c.lock.Lock() + defer c.lock.Unlock() + + c.states[resp.StatusCode]++ + resp.Body = &byteCounter{resp.Body} +} + +func (c *crawlStats) UpdateBytes(n int64) { + atomic.AddInt64(&c.bytes, n) +} + +func (c *crawlStats) Dump() { + c.lock.Lock() + defer c.lock.Unlock() + n, secs := atomic.LoadInt64(&c.bytes), time.Since(c.start).Seconds() // bytes is updated via atomic.AddInt64 without c.lock held, so load it atomically, once + fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", n, float64(n)/secs/1000, c.states) +} + +var ( + stats *crawlStats + + client *http.Client +) + +func fetch(urlstr string) (*http.Response, error) { + resp, err := client.Get(urlstr) + if err == nil { + stats.Update(resp) + } + return resp, err +} + +func init() { + client = &http.Client{} + + stats = &crawlStats{ + states: make(map[int]int), + start: time.Now(), + } + + go func() { + for range time.Tick(10 * time.Second) { + stats.Dump() + } + }() +} + +type byteCounter struct { + io.ReadCloser +} + +func (b *byteCounter) Read(buf []byte) (int, error) { + n, err := b.ReadCloser.Read(buf) + if n > 0 { +
stats.UpdateBytes(int64(n)) + } + return n, err +} + func main() { flag.Parse() @@ -128,6 +199,7 @@ func main() { crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewDepthScope(*depth), crawl.NewSeedScope(seeds), + crawl.NewRegexpIgnoreScope(nil), } w := warc.NewWriter(outf) @@ -135,7 +207,7 @@ func main() { saver := NewSaveHandler(w) - crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(saver)) + crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver)) if err != nil { log.Fatal(err) } diff --git a/gen-ignores.py b/gen-ignores.py new file mode 100755 index 0000000..25b3cac --- /dev/null +++ b/gen-ignores.py @@ -0,0 +1,31 @@ +#!/usr/bin/python +# +# Parse ArchiveBot ignore regexp patterns and generate a Go source +# file with a global variable including all of them. +# +# Invoke with a single argument, the location of a checked-out copy of +# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns. +# + +import glob +import json +import os +import sys + +archivebot_ignore_path = sys.argv[1] +print 'package crawl\n\nvar defaultIgnorePatterns = []string{' +for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')): + try: + with open(fn) as fd: + print '\n\t// %s' % os.path.basename(fn) + for p in json.load(fd)['patterns']: + if '\\1' in p or '(?!' in p: + # RE2 supports neither backreferences nor lookaheads. + # json.load() has already unescaped p, so a backreference + # is one backslash + digit. This excludes <10 patterns. + continue + print '\t%s,' % json.dumps(p) + except Exception, e: + print >>sys.stderr, 'error in %s: %s' % (fn, e) +print '}' + diff --git a/ignore_patterns.go b/ignore_patterns.go new file mode 100644 index 0000000..2c6d949 --- /dev/null +++ b/ignore_patterns.go @@ -0,0 +1,453 @@ +package crawl + +var defaultIgnorePatterns = []string{ + + // WordPress.
+ "wp-login\\.php", + "/wp-admin/", + "/xmlrpc\\.php", + + // googleplus.json + "^https?://accounts\\.google\\.com/ServiceLogin", + "^https?://accounts\\.google\\.com/SignUp", + "^https?://lh4\\.googleusercontent\\.com/proxy/[^/]+", + "^https?://plus\\.google\\.com/_/scs/apps-static/", + + // mediawiki.json + "[\\?&]oldid=\\d+", + "[\\?&]curid=\\d+", + "[\\?&]limit=(20|100|250|500)", + "[\\?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)=", + "([\\?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)", + "([\\?&]title=|/)Special:RecentChanges&from=\\d+", + "([\\?&]title=|/)Special:ListFiles&dir=prev&offset=\\d+", + "([\\?&]title=|/)Special:(ListFiles|PrefixIndex).*&", + "([\\?&]title=|/)Special:ListFiles.*&user=", + "([\\?&]title=|/)Special:Log/", + "[\\?&]action=edit&section=(\\d+|new)", + "[\\?&]feed(format)?=atom", + "[\\?&]redlink=1", + "[\\?&]printable=yes", + "[\\?&]mobileaction=", + "[\\?&]undo(after)?=\\d+", + "^http://a\\.wikia-beacon\\.com/__track/", + "/User_talk:.+/User_talk:", + "/User_blog:.+/User_blog:", + "/User:.+/User:", + + // nosortedindex.json + "\\?C=[NMSD];O=[AD]$", + + // coppermine.json + "(?:displayimage|thumbnails)\\.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)", + "ratepic\\.php", + "addfav\\.php\\?.*ref=displayimage\\.php", + "displayimage\\.php\\?.*slideshow=\\d+", + + // youtube.json + "^https?://accounts\\.google\\.com/ServiceLogin", + "\\.?youtube\\.com/user/[^/]+/(playlists|channels|videos)\\?(flow|view|sort|live_view)=", + + // reddit.json + "^https?://www\\.reddit\\.com/gold\\?goldtype=", + "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+", + "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+.*\\?sort=", + "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/\\.compact", + "^https?://www\\.reddit\\.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[\\?&]after=", +
"^https?://www\\.reddit\\.com/r/[^/]+/related/", + "^https?://www\\.reddit\\.com/r/[^/]+/(gilded)?\\.mobile\\?", + "^https?://www\\.reddit\\.com/r/[^/]+/search/?\\?", + "^https?://www\\.reddit\\.com/r/[^/]+/wiki/(revisions|discussions)/user/.+", + "^https?://www\\.reddit\\.com/user/[^/]+/(comments/)?.+[\\?&]sort=", + "^https?://www\\.reddit\\.com/.+/\\.rss$", + "^https?://simple\\.reddit\\.com/", + "^https?://pixel\\.redditmedia\\.com/pixel/", + "\\.reddit\\.com/message/compose/?\\?", + "^https?://m\\.reddit\\.com/", + + // nogravatar.json + "^https?://(\\d|secure)\\.gravatar\\.com/avatar/", + + // meetupeverywhere.json + "^https?://.*\\.meetup\\.com/login/", + + // pinterest.json + "^https?://www\\.pinterest\\.com/[^/]+/\\^/[^/]+/", + "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/\\^/[^/]+/", + "^https?://www\\.pinterest\\.com/[^/]+/[^/]+\\.[^/]+", + "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/[^/]+\\.[^/]+", + "^https?://www\\.pinterest\\.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js", + "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js", + + // noonion.json + + // blogs.json + "[\\?&]replytocom=", + "[\\?&]share=", + "/page/%d/$", + "\\?showComment(=|%5C)", + "/quote-comment-\\d+/$", + "/wp-login\\.php\\?", + "^https?://r\\-login\\.wordpress\\.com/remote\\-login\\.php", + "'\\%20\\+\\%20liker\\.(avatar|profile)_URL\\%20\\+\\%20'", + "\\%22\\%20\\+\\%20$wrapper\\.data\\(", + "^http://.+\\.blogspot\\.(com|in|com\\.au|co\\.uk|jp|co\\.nz|ca|de|it|fr|se|sg|es|pt|com\\.br|ar|mx|kr)/(search(\\?|/label/)|\\d{4,4}/\\d{2,2}/CSI/$)", + "livejournal\\.com/ljcounter/?\\?", + "\\?replyto=[0-9]+", + "[\\?&]mode=reply", + "xiti\\.com/hit\\.xiti\\?", + "/stats\\.g\\.doubleclick\\.net/dc\\.js$", + "/jetpack-comment/\\?", + "\\?like_comment=\\d+", + "^https?://.+/.+/disqus\\.com/forums/$", + "(\\?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)", + 
"%5Cx26route=/archive", + "^http://\\d+\\.media\\.tumblr\\.com/avatar_.+_16\\.png$", + "^http://www\\.livejournal\\.com/(tools/memadd|update|login)\\.bml\\?", + "^http://[^\\.]+\\.livejournal\\.com/.+[\\?&]mode=reply", + "^http://[^\\.]+\\.livejournal\\.com/.+/\\*sup_ru/ru/UTF-8/", + "^http://[^\\.]+\\.livejournal\\.com/.+http://[^\\.]+\\.livejournal\\.com/", + "^http://[^\\.]+\\.livejournal\\.com/.+/stats\\.g\\.doubleclick\\.net/dc\\.js$", + "^https?://www\\.dreamwidth\\.org/tools/(memadd|tellafriend)\\?", + "^https?://[^\\.]+\\.dreamwidth\\.org/.+[\\?&]mode=reply", + + // global.json + //"/(.*)/(\\1/){3,}", + "%25252525", + "/App_Themes/.+/App_Themes/", + "/bxSlider/.+/bxSlider/", + "/bxSlider/bxSlider/", + "/slides/slides/.+/slides/", + "/slides/.+/slides/slides/", + "/slides/slides/slides/", + "/js/js/.+/js/", + "/js/.+/js/js/", + "/js/js/js/", + "/css/css/.+/css/", + "/css/.+/css/css/", + "/css/css/css/", + "/styles/styles/.+/styles/", + "/styles/.+/styles/styles/", + "/styles/styles/styles/", + "/scripts/scripts/.+/scripts/", + "/scripts/.+/scripts/scripts/", + "/scripts/scripts/scripts/", + "/images/images/.+/images/", + "/images/.+/images/images/", + "/images/images/images/", + "/img/img/.+/img/", + "/img/.+/img/img/", + "/img/img/img/", + "/clientscript/clientscript/.+/clientscript/", + "/clientscript/.+/clientscript/clientscript/", + "/clientscript/clientscript/clientscript/", + "/lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]", + "/(%5C)+(%22|%27)", + "/%5C/%5C/", + "/%27\\+[^/]+\\+%27", + "/%22\\+[^/]+\\+%22", + "/%27%20\\+[^/]+\\+%20%27", + "/%22%20\\+[^/]+\\+%20%22", + "/\\\\+(%22|%27)", + "/\\\\+[\"']", + "/\\\\/\\\\/", + "/'\\+[^/]+\\+'", + "^https?://localhost(:\\d+)?/", + "^https?://(127|10)\\.\\d+\\.\\d+\\.\\d+(:\\d+)?/", + "^https?://172\\.(1[6-9]|2\\d|3[01])\\.\\d+\\.\\d+(:\\d+)?/", + "^https?://192\\.168\\.\\d+\\.\\d+(:\\d+)?/", + "^https?://www\\.google\\.com/recaptcha/api", + "^https?://geo\\.yahoo\\.com/b\\?", + 
"^https?://((s-)?static\\.ak\\.fbcdn\\.net|(connect\\.|www\\.)?facebook\\.com)/connect\\.php/js/.*rsrc\\.php", + "^https?://www\\.flickr\\.com/change_language\\.gne", + "^https?://((www|web|web-beta|wayback)\\.)?archive\\.org/", + "^https?://www\\.google\\.((com|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|by|ca|cd|cf|cg|ch|ci|cl|cm|cn|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|im|iq|is|it|je|jo|ki|kg|kz|la|li|lk|lt|lu|lv|md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|ru|rw|sc|se|sh|si|sk|sn|so|sm|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws|rs|cat)|(com\\.(af|ag|ai|ar|au|bd|bh|bn|bo|br|bz|co|cu|cy|do|ec|eg|et|fj|gh|gi|gt|hk|jm|kh|kw|lb|ly|mm|mt|mx|my|na|nf|ng|ni|np|om|pa|pe|pg|ph|pk|pr|py|qa|sa|sb|sg|sl|sv|tj|tr|tw|ua|uy|vc|vn))|(co\\.(ao|bw|ck|cr|id|il|in|jp|ke|kr|ls|ma|mz|nz|th|tz|ug|uk|uz|ve|vi|za|zm|zw)))/finance\\?noIL=1&q=[^&]+&ei=", + "^https?://upload\\.wikimedia\\.org/wikipedia/[^/]+/thumb/", + "^http://b\\.scorecardresearch\\.com/", + "^http://i\\.dev\\.cdn\\.turner\\.com/", + "^http://video-subtitle\\.tedcdn\\.com/", + "^http://download\\.ted\\.com/", + "^http://msft\\.digitalrivercontent\\.net/win/.+\\.iso", + "^https?://tmz\\.vo\\.llnwd\\.net/", + "^https?://(www\\.)?megaupload\\.com/", + "^https?://(www\\.)?filesonic\\.com/", + "^https?://(www\\.)?wupload\\.com/", + "^https?://prod-preview\\.wired\\.com/", + "^http://([^\\./]+\\.)?stream\\.publicradio\\.org/", + "^http://icecast\\.streaming\\.castor\\.nl/", + "^http://wm1\\.streaming\\.castor\\.nl:8000/", + "^http://icecast\\.databoss\\.nl:8000/", + "^http://stream\\.rynothebearded\\.com:8000/", + "^http://mp3\\.live\\.tv-radio\\.com/", + "^http://av\\.rasset\\.ie/av/live/", + "^http://gcnplayer\\.gcnlive\\.com/.+", + "^http://streaming\\.radionomy\\.com/", + "^http://mp3\\.ffh\\.de/", + "^http://(www\\.)?theradio\\.cc\\:8000/", + "^http://(audio\\d?|nfw)\\.video\\.ria\\.ru/", + "^http://eu1\\.fastcast4u\\.com:3048/", + 
"^http://[^\\./]+\\.radioscoop\\.(com|net):\\d+/", + "^http://[^\\./]+\\.streamchan\\.org:\\d+/", + "^http://[^/]*musicproxy\\.s12\\.de/", + "^http://stream\\.rfi\\.fr/", + "^http://striiming\\d?\\.trio\\.ee/", + "^http://streamer\\.radiocampus\\.be(:\\d+)?/", + "^http://relay\\.broadcastify\\.com/", + "^http://audio\\d?\\.radioreference\\.com/", + "^http://[^/]+\\.akadostream\\.ru(:\\d+)?/", + "^http://radio\\.silver\\.ru(:\\d+)?/", + "^http://icecast\\.szwoelf\\.com:8000/", + "^http://altair\\.micronick\\.com:8080/\\?action=stream", + "^http://94\\.25\\.53\\.13[1-4]/.+\\.mp3$", + "^http://server\\.lradio\\.ru:\\d+/", + "^http://188\\.93\\.17\\.201:8080/", + "^http://81\\.19\\.85\\.19[56]/.+\\.mp3$", + "^http://81\\.19\\.85\\.203/.+\\.mp3$", + "^http://play(\\d+)?\\.radio13\\.ru:8000/", + "^http://stream(\\d+)?\\.media\\.rambler\\.ru/", + "^http://pub(\\d+)?\\.di\\.fm/", + "^http://vostok\\.fmtuner\\.ru/", + "^http://109\\.120\\.141\\.181:8000/", + "^http://195\\.88\\.63\\.114:8000/", + "^http://radiosilver\\.corbina\\.net:8000/", + "^http://89\\.251\\.147\\.100/", + "^http://bcs\\d?\\.fontanka\\.fm:8000/", + "^http://stream2\\.cnmns\\.net/", + "^http://[^/]+\\.streamtheworld\\.com/", + "^http://[^/]+\\.gaduradio\\.pl/", + "^http://anka\\.org:8080/", + "^http://radio\\.visionotaku\\.com:8000/", + "^http://stream\\.r-a-d\\.io/", + "^http://r-a-d\\.io/.+\\.mp3$", + "^http://95\\.81\\.155\\.17/", + "^https?://icecast\\.rtl2?\\.fr/", + "^http://mp3tslg\\.tdf-cdn\\.com/", + "^http://[^/]+/anony/mjpg\\.cgi$", + "^https?://air\\.radiorecord\\.ru(:\\d+)?/", + "^https?://[^/]+\\.rastream\\.com(:\\d+)?/", + "^https?://audiots\\.scdn\\.arkena\\.com/", + "^https?://(www|draft)\\.blogger\\.com/(navbar\\.g|post-edit\\.g|delete-comment\\.g|comment-iframe\\.g|share-post\\.g|email-post\\.g|blog-this\\.g|delete-backlink\\.g|rearrange|blog_this\\.pyra)\\?", + "^https?://www\\.tumblr\\.com/(impixu\\?|share(/link/?)?\\?|reblog/)", + "^https?://plus\\.google\\.com/share\\?", + 
"^https?://(apis|plusone)\\.google\\.com/_/\\+1/", + "^https?://(ssl\\.|www\\.)?reddit\\.com/(login\\?dest=|submit\\?|static/button/button)", + "^https?://digg\\.com/submit\\?", + "^https?://(www\\.)?facebook\\.com/(plugins/like(box)?\\.php|sharer/sharer\\.php|sharer?\\.php|dialog/(feed|share))\\?", + "^https?://(www\\.)?twitter\\.com/(share\\?|intent/((re)?tweet|favorite)|home/?\\?status=|\\?status=)", + "^https?://platform\\d?\\.twitter\\.com/widgets/tweet_button.html\\?", + "^https?://www\\.newsvine\\.com/_wine/save\\?", + "^https?://www\\.netvibes\\.com/subscribe\\.php\\?", + "^https?://add\\.my\\.yahoo\\.com/(rss|content)\\?", + "^http://www\\.addtoany\\.com/(add_to/|share_save\\?)", + "^https?://www\\.addthis\\.com/bookmark\\.php\\?", + "^https?://(www\\.)?pinterest\\.com/pin/create/", + "^https?://www\\.linkedin\\.com/(cws/share|shareArticle)\\?", + "^https?://(www\\.)?stumbleupon\\.com/(submit\\?|badge/embed/)", + "^https?://csp\\.cyworld\\.com/bi/bi_recommend_pop\\.php\\?", + "^https://share\\.flipboard\\.com/bookmarklet/popout\\?", + "^https?://flattr.com/submit/auto\\?", + "^https?://(www\\.)?myspace\\.com/Modules/PostTo/", + "^https?://www\\.google\\.com/bookmarks/mark\\?", + "^http://myweb2\\.search\\.yahoo\\.com/myresults/bookmarklet\\?", + "^http://vuible\\.com/pins-settings/", + "^https?://news\\.ycombinator\\.com/submitlink\\?", + "^http://reporter\\.es\\.msn\\.com/\\?fn=contribute", + "^http://www\\.blinklist\\.com/index\\.php\\?Action=Blink/addblink\\.php", + "^http://sphinn\\.com/index\\.php\\?c=post&m=submit&", + "^http://posterous\\.com/share\\?", + "^http://del\\.icio\\.us/post\\?", + "^https?://delicious\\.com/(save|post)\\?", + "^https?://(www\\.)?friendfeed\\.com/share\\?", + "^https?://(www\\.)?xing\\.com/(app/user\\?op=share|social_plugins/share\\?)", + "^http://iwiw\\.hu/pages/share/share\\.jsp\\?", + "^http://memori(\\.qip)?\\.ru/link/\\?", + "^http://wow\\.ya\\.ru/posts_(add|share)_link\\.xml\\?", + 
"^https?://connect\\.mail\\.ru/share\\?", + "^http://zakladki\\.yandex\\.ru/newlink\\.xml\\?", + "^https?://vkontakte\\.ru/share\\.php\\?", + "^https?://www\\.odnoklassniki\\.ru/dk\\?st\\.cmd=addShare", + "^https?://www\\.google\\.com/(reader/link\\?|buzz/post\\?)", + "^https?://service\\.weibo\\.com/share/share\\.php\\?", + "^https?://(www\\.)?technorati\\.com/faves/?\\?add=", + "^https?://bufferapp\\.com/add\\?", + "^https?://b\\.hatena\\.ne\\.jp/add\\?", + "^https?://api\\.addthis\\.com/", + "^https?://bookmark\\.naver\\.com/post\\?", + "^https?://mail\\.google\\.com/mail/", + "^http://pixel\\.blog\\.hu/", + "^https?://pixel\\.quantserve\\.com/", + "^http://b\\.scorecardresearch\\.com/", + "^https?://(www|ssl)\\.google-analytics\\.com/(r/)?(__utm\\.gif|collect\\?)", + "^https?://p\\.opt\\.fimserve\\.com/", + "^https?://(\\d|www|secure)\\.gravatar\\.com/avatar/ad516503a11cd5ca435acc9bb6523536", + "^https?://imageshack\\.com/lost$", + "^https?://[^/]+\\.corp\\.ne1\\.yahoo\\.com/", + "^https?://.+/js-agent\\.newrelic\\.com/nr-\\d{3,3}(\\.min)?\\.js$", + "^https?://.+/stats\\.g\\.doubleclick\\.net/dc\\.js$", + "^https?://.+/js/chartbeat\\.js$", + "^http://www\\.khaleejtimes\\.com/.+/kt_.+/kt_", + "^http://www\\.khaleejtimes\\.com/.+/images/.+/images/", + "^http://www\\.khaleejtimes\\.com/.+/imgactv/.+/imgactv/", + "^http://photobucket\\.com/.+/albums/.+/albums/", + "^https?://([^/]+\\.)?gdcvault\\.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)", + "^https://static\\.licdn\\.com/sc/p/com\\.linkedin\\.nux(:|%3A)nux-static-content(\\+|%2B)[\\d\\.]+/f/", + "^https?://www\\.flickr\\.com/(explore/|photos/[^/]+/(sets/\\d+/(page\\d+/)?)?)\\d+_[a-f0-9]+(_[a-z])?\\.jpg$", + "^https?://static\\.licdn\\.com/sc/p/.+/f//", + "^http://www\\.warnerbros\\.com/\\d+$", + "^https?://tm\\.uol\\.com\\.br/h/.+/h/", + "^https?://media\\.opb\\.org/clips/embed/.+\\.js$", + + // twitter.json + 
"^https?://((?:www|mobile)\\.)?twitter\\.com/.+\\?(?:id|lang|locale|screen_name)=", + "^https?://mobile\\.twitter\\.com/i/anonymize\\?data=", + + // imdb.json + "^http://b\\.scorecardresearch\\.com/", + "^http://ad\\.doubleclick\\.net/", + "^http://www\\.imdb\\.com/rd/", + "^http://www\\.imdb\\.com/.+\\?ref_=", + "^http://www\\.imdb\\.com/.+/board/flat/", + "^http://www\\.imdb\\.com/.+/board/inline/", + "^http://www\\.imdb\\.com/.+/board/thread/", + "^http://www\\.imdb\\.com/help/boards_posting\\.html", + "^http://www\\.imdb\\.com/register/", + "^http://www\\.imdb\\.com/.+/board/.+/\\d+\\?d=", + "^http://www\\.imdb\\.com/.+/videogallery/.+/.+/", + + // facebook.json + "^https?://error\\.facebook\\.com/common/scribe_endpoint\\.php\\?c=", + "^https?://www\\.facebook\\.com/[^/]+/(posts/|app_)[^/]+\\?(ref=page_internal&)?_fb_noscript=", + "^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/.{4,4}/", + "^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/\\?type=", + + // internetcentrum.json + "%3Bamp%3Bamp", + "&action=edit", + "action=(?:komentar|send)", + "action=(?:multiple_products_add_product|notify|add_product|buy_now)", + "&action=submit", + "&action=edit", + "amp;amp;", + "answer=.+?&anksent=true", + "[a-z0-9]=(?:off|on)", + "blog=1&disp=msgform", + "\\?cal=", + "calendar_menu/calendar\\.php", + "calendar_menu/event\\.php", + "calendar\\.php", + "calendar_scheduler\\.php", + "captcha.php", + "cas12&cas12", + "comment\\.php\\?akce=new", + "/comment/reply/\\d+", + "cPath=.+&sort=.+", + "destination=node/%2F\\d+", + "destination=node/\\d+", + "(?:displayimage|thumbnails)\\.php\\?pos=-\\d+", + "file=posting.+mode=quote", + "&highlight=&", + "^http://harizzzma\\.com", + "^http://www.nahraj.net/", + "index.*\\.php\\?option=com_eventcal", + "index.php\\?site=calendar", + "index\\.php\\?site=guestbook&type=(?:ASC|DESC)", + "index.php/Speci%C3%A1ln%C3%AD", + "index.php\\?title=Diskuse:", + "index.php\\?title=MediaWiki_diskuse:", + 
"index.php\\?title=Soubor_diskuse", + "index.php\\?title=Speci%C3%A1ln%C3%AD", + "index\\.php\\?\\w+&rok=(1995|2016)&mesic=\\d+&autor=\\d+$", + "index\\.php\\?.+year=198.", + "index\\.php\\?.+year=203.", + "kalendar-akci", + "kalendar\\.php", + "kalendarrok=\\d{4}", + //"lang=(?!czech|english)", + //"language=(?!cs|en)", + "LightNEasy\\.php\\?do=login", + "limit=.+limit=.+", + "login=", + "login\\.php", + "(?:login|registrace|live\\?)", + "mact=Calendar", + "main_page=(?:product_reviews_write|login|cookie_usage)", + "memberlist\\.php\\?mode=email", + "memberlist\\.php\\?mode=.+order=", + "(?:memberlist|viewprofile|viewtopic)\\.php\\?.*sk=.&sd=.", + "mini.+calendar", + "mm=\\d+.+yy=\\d{4}", + "mode=(?:lostpassword|sendpassword)", + "modules.+name=Forums.+view=(?:next|previous)", + "modules\\.php\\?name=coppermine.*file=displayimage.+&slideshow=\\d+", + "modules\\.php\\?name=coppermine.*meta=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)", + "modules\\.php\\?name=Statistics", + "mo=\\d+.+ye=\\d{4}", + "name=Kalender", + "name=Statistics", + "option=com_jcalpro.+date=\\d{4}-", + "\\?option=com.+&month=.+&year=\\d{4}", + "option=&Itemid=.+&date=\\d{4}-", + //"order=(?!1)", + "orderby=(?:name|note|count|news)", + "photo.php\\?i=-\\d+", + "/photos.+\\?url=", + ".*\\..*\\..*\\.pl", + "p=ordersBasket.+sOption=add", + "portal\\.php\\?month=[\\d]+", + "postdays=0&postorder=asc", + "prev_next=(?:prev|next)", + "/calendar/", + "product_reviews_write\\.php\\?", + "profile\\.php\\?mode=email", + "profile\\.php\\?mode=register", + "\\?q=event.+/(?:day|list|month|table|week)/all/all", + "random_num=\\d+", + "Recentchangeslinked/", + "report\\.php\\?f=.+", + "search_id=mini_cal&d=\\d+", + "SESSION_ID=", + "showcal\\.php", + "site=guestbook.+type=(?:ASC|DESC)", + //"/sites/all/(sites|modules|libraries|scripts|themes)/.+/\\1", + "Souprava=.+Souprava=.+", + "Special:Whatlinkshere", + "start-index=-\\d+", + "/switchuilocale/", + "target[xy]=.+target[xy]=.+", + 
"tellafriend\\.php", + ":Userlogin&", + "user/(?:register|login)", + "viewtopic\\.php\\?.*highlight=", + "viewtopic\\.php\\?p=\\d+", + "viewtopic\\.php\\?.+view=print", + "y=\\d{4}&m=\\d+", + + // forums.json + "/cron\\.php\\?", + "/external\\.php\\?type=rss", + "/login\\.php\\?", + "/newreply\\.php\\?", + "/private\\.php\\?", + "/privmsg\\.php\\?", + "/register\\.php\\?", + "/sendmessage\\.php\\?", + "/subscription\\.php\\?", + "/posting\\.php\\?", + "/viewtopic\\.php\\?.+&view=(next|previous)", + "/viewtopic\\.php\\?.+&hilit=", + "/feed\\.php\\?", + "/index\\.php\\?option=com_mailto", + "&view=login&return=", + "&format=opensearch", + "/misc\\.php\\?do=whoposted", + "/newthread\\.php\\?", + "/post_thanks\\.php\\?", + "/blog_post\\.php\\?do=newblog", + "/forumdisplay\\.php.*[\\?&]do=markread", + "/userpoll/vote\\.php\\?", + "/showthread\\.php.*[\\?&]goto=(next(old|new)est|newpost)", + "/editpost\\.php\\?", + "/\\?view=getlastpost$", + "/index\\.php\\?sharelink=", + "/ucp\\.php\\?mode=delete_cookies", +} diff --git a/scope.go b/scope.go index a2c06b6..ccba5f5 100644 --- a/scope.go +++ b/scope.go @@ -3,6 +3,7 @@ package crawl import ( "fmt" "net/url" + "regexp" "strings" ) @@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope { } return NewURLPrefixScope(pfx) } + +type regexpIgnoreScope struct { + ignores []*regexp.Regexp +} + +func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool { + uriStr := uri.String() + for _, i := range s.ignores { + if i.MatchString(uriStr) { + return false + } + } + return true +} + +func NewRegexpIgnoreScope(ignores []string) Scope { + if ignores == nil { + ignores = defaultIgnorePatterns + } + r := regexpIgnoreScope{ + ignores: make([]*regexp.Regexp, 0, len(ignores)), + } + for _, i := range ignores { + r.ignores = append(r.ignores, regexp.MustCompile(i)) + } + return &r +} -- cgit v1.2.3-54-g00ecf