aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ale <ale@incal.net> 2015-06-28 21:57:59 +0100
committer ale <ale@incal.net> 2015-06-28 21:57:59 +0100
commit 63bd51e06b32d48878da68df8931809d42996df1 (patch)
tree 40a6e124545015654c56d0600221530301b78fce
parent aa6e67d7b2996b3b3c4e93ad6608c5753f03f03b (diff)
download crawl-63bd51e06b32d48878da68df8931809d42996df1.tar.gz
crawl-63bd51e06b32d48878da68df8931809d42996df1.zip
add ignore list from ArchiveBot
-rw-r--r-- cmd/crawl/crawl.go 74
-rwxr-xr-x gen-ignores.py 31
-rw-r--r-- ignore_patterns.go 453
-rw-r--r-- scope.go 28
4 files changed, 585 insertions, 1 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 63a5924..d68ac5e 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -13,6 +13,9 @@ import (
"os"
"strconv"
"strings"
+ "sync"
+ "sync/atomic"
+ "time"
"git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
@@ -115,6 +118,74 @@ func NewSaveHandler(w *warc.Writer) crawl.Handler {
}
}
+// crawlStats accumulates download statistics for the running crawl:
+// a byte counter, the start time, and a per-status-code response count.
+type crawlStats struct {
+ // bytes is the running total of body bytes read. UpdateBytes
+ // modifies it with atomic.AddInt64 without taking lock.
+ bytes int64
+ // start is the reference time used by Dump to compute the
+ // average transfer rate.
+ start time.Time
+
+ // lock is held by Update and Dump while they access states.
+ lock sync.Mutex
+ // states counts the responses seen so far, keyed by HTTP status
+ // code.
+ states map[int]int
+}
+
+// Update accounts for a newly received response: it bumps the counter
+// for the response's status code and replaces resp.Body with a
+// byteCounter wrapper, so that subsequent reads of the body are
+// reported through the package-level stats object.
+func (c *crawlStats) Update(resp *http.Response) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	code := resp.StatusCode
+	c.states[code] = c.states[code] + 1
+
+	counted := &byteCounter{ReadCloser: resp.Body}
+	resp.Body = counted
+}
+
+// UpdateBytes atomically adds n to the total byte counter. It does
+// not take c.lock.
+func (c *crawlStats) UpdateBytes(n int64) {
+ atomic.AddInt64(&c.bytes, n)
+}
+
+// Dump writes a one-line summary of the statistics to standard error:
+// total bytes downloaded, average rate since c.start in KB/s
+// (1 KB = 1000 bytes), and the per-status-code response counts.
+func (c *crawlStats) Dump() {
+	// The byte counter is updated with atomic.AddInt64 by goroutines
+	// that do not hold c.lock, so it has to be read atomically as
+	// well; a plain read here would be a data race. Reading it once
+	// also guarantees that the printed total and rate agree.
+	total := atomic.LoadInt64(&c.bytes)
+
+	// The status map is only ever touched with c.lock held.
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	rate := float64(total) / time.Since(c.start).Seconds() / 1000
+	fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", total, rate, c.states)
+}
+
+// Process-wide state shared by fetch and the statistics reporter.
+// Both variables are assigned in init.
+var (
+ // stats is the statistics collector updated by fetch and by
+ // byteCounter.Read.
+ stats *crawlStats
+
+ // client is the HTTP client used by fetch for every request.
+ client *http.Client
+)
+
+// fetch issues a GET request for urlstr using the shared client.
+// Successful responses are registered with the global stats object
+// before being returned; failures are passed through untouched.
+func fetch(urlstr string) (*http.Response, error) {
+	resp, err := client.Get(urlstr)
+	if err != nil {
+		return resp, err
+	}
+	stats.Update(resp)
+	return resp, nil
+}
+
+// init creates the shared HTTP client and the global statistics
+// collector, then starts a background goroutine that dumps the
+// statistics every 10 seconds. The goroutine is never stopped.
+func init() {
+	// NOTE(review): the client is created without a Timeout, so a
+	// stalled server can block a fetch indefinitely.
+	client = new(http.Client)
+
+	stats = &crawlStats{
+		start:  time.Now(),
+		states: make(map[int]int),
+	}
+
+	go func() {
+		ticker := time.NewTicker(10 * time.Second)
+		for range ticker.C {
+			stats.Dump()
+		}
+	}()
+}
+
+// byteCounter wraps an io.ReadCloser and reports the number of bytes
+// read through it to the package-level stats object.
+type byteCounter struct {
+	io.ReadCloser
+}
+
+// Read delegates to the wrapped reader and, when at least one byte
+// was returned, adds that amount to the global byte count. The
+// result of the underlying Read is returned unchanged.
+func (b *byteCounter) Read(buf []byte) (int, error) {
+	count, err := b.ReadCloser.Read(buf)
+	if count <= 0 {
+		return count, err
+	}
+	stats.UpdateBytes(int64(count))
+	return count, err
+}
+
func main() {
flag.Parse()
@@ -128,6 +199,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
+ crawl.NewRegexpIgnoreScope(nil),
}
w := warc.NewWriter(outf)
@@ -135,7 +207,7 @@ func main() {
saver := NewSaveHandler(w)
- crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(saver))
+ crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
if err != nil {
log.Fatal(err)
}
diff --git a/gen-ignores.py b/gen-ignores.py
new file mode 100755
index 0000000..25b3cac
--- /dev/null
+++ b/gen-ignores.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+#
+# Parse ArchiveBot ignore regexp patterns and generate a Go source
+# file with a global variable including all of them.
+#
+# Invoke with a single argument, the location of a checked-out copy of
+# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
+# The generated Go source is written to standard output; problems with
+# individual input files are reported on standard error.
+#
+
+from __future__ import print_function
+
+import glob
+import json
+import os
+import re
+import sys
+
+# Constructs that Go's regexp package (RE2 syntax) cannot compile:
+#  - numbered backreferences (\1 .. \9); the leading part of the
+#    expression skips pairs of backslashes so that an escaped
+#    backslash followed by a digit is not mistaken for one;
+#  - lookahead and lookbehind assertions: (?= (?! (?<= (?<!
+_UNSUPPORTED_RX = re.compile(r'(?<!\\)(?:\\\\)*\\[1-9]|\(\?<?[=!]')
+
+
+def is_re2_compatible(pattern):
+    """Return True if pattern uses none of the PCRE-only constructs
+    listed above, i.e. it can be kept in the generated list."""
+    return _UNSUPPORTED_RX.search(pattern) is None
+
+
+def main(argv):
+    """Generate the Go source for the pattern directory in argv[1].
+
+    Returns the process exit status: 0 on completion, 2 on bad usage.
+    """
+    if len(argv) != 2:
+        print('Usage: %s <ignore_patterns_dir>' % argv[0], file=sys.stderr)
+        return 2
+    archivebot_ignore_path = argv[1]
+    print('package crawl\n\nvar defaultIgnorePatterns = []string{')
+    # glob order is filesystem-dependent: sort it so that regenerating
+    # the file from the same input yields the same output.
+    for fn in sorted(glob.glob(os.path.join(archivebot_ignore_path, '*.json'))):
+        try:
+            with open(fn) as fd:
+                print('\n\t// %s' % os.path.basename(fn))
+                for p in json.load(fd)['patterns']:
+                    if not is_re2_compatible(p):
+                        # RE2 does not support backreferences or
+                        # other fancy PCRE constructs. This excludes
+                        # only a handful of patterns from the list.
+                        continue
+                    # A JSON string literal is also a valid Go
+                    # interpreted string literal.
+                    print('\t%s,' % json.dumps(p))
+        except Exception as e:
+            # Best effort: report the broken file and carry on.
+            print('error in %s: %s' % (fn, e), file=sys.stderr)
+    print('}')
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
+
diff --git a/ignore_patterns.go b/ignore_patterns.go
new file mode 100644
index 0000000..2c6d949
--- /dev/null
+++ b/ignore_patterns.go
@@ -0,0 +1,453 @@
+package crawl
+
+var defaultIgnorePatterns = []string{
+
+ // WordPress.
+ "wp-login\\.php",
+ "/wp-admin/",
+ "/xmlrpc\\.php",
+
+ // googleplus.json
+ "^https?://accounts\\.google\\.com/ServiceLogin",
+ "^https?://accounts\\.google\\.com/SignUp",
+ "^https?://lh4\\.googleusercontent\\.com/proxy/[^/]+",
+ "^https?://plus\\.google\\.com/_/scs/apps-static/",
+
+ // mediawiki.json
+ "[\\?&]oldid=\\d+",
+ "[\\?&]curid=\\d+",
+ "[\\?&]limit=(20|100|250|500)",
+ "[\\?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)=",
+ "([\\?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)",
+ "([\\?&]title=|/)Special:RecentChanges&from=\\d+",
+ "([\\?&]title=|/)Special:ListFiles&dir=prev&offset=\\d+",
+ "([\\?&]title=|/)Special:(ListFiles|PrefixIndex).*&amp;",
+ "([\\?&]title=|/)Special:ListFiles.*&user=",
+ "([\\?&]title=|/)Special:Log/",
+ "[\\?&]action=edit&section=(\\d+|new)",
+ "[\\?&]feed(format)?=atom",
+ "[\\?&]redlink=1",
+ "[\\?&]printable=yes",
+ "[\\?&]mobileaction=",
+ "[\\?&]undo(after)?=\\d+",
+ "^http://a\\.wikia-beacon\\.com/__track/",
+ "/User_talk:.+/User_talk:",
+ "/User_blog:.+/User_blog:",
+ "/User:.+/User:",
+
+ // nosortedindex.json
+ "\\?C=[NMSD];O=[AD]$",
+
+ // coppermine.json
+ "(?:displayimage|thumbnails)\\.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)",
+ "ratepic\\.php",
+ "addfav\\.php\\?.*ref=displayimage\\.php",
+ "displayimage\\.php\\?.*slideshow=\\d+",
+
+ // youtube.json
+ "^https?://accounts\\.google\\.com/ServiceLogin",
+ "\\.?youtube\\.com/user/[^/]+/(playlists|channels|videos)\\?(flow|view|sort|live_view)=",
+
+ // reddit.json
+ "^https?://www\\.reddit\\.com/gold\\?goldtype=",
+ "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+",
+ "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+.*\\?sort=",
+ "^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/\\.compact",
+ "^https?://www\\.reddit\\.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[\\?&]after=",
+ "^https?://www\\.reddit\\.com/r/[^/]+/related/",
+ "^https?://www\\.reddit\\.com/r/[^/]+/(gilded)?\\.mobile\\?",
+ "^https?://www\\.reddit\\.com/r/[^/]+/search/?\\?",
+ "^https?://www\\.reddit\\.com/r/[^/]+/wiki/(revisions|discussions)/user/.+",
+ "^https?://www\\.reddit\\.com/user/[^/]+/(comments/)?.+[\\?&]sort=",
+ "^https?://www\\.reddit\\.com/.+/\\.rss$",
+ "^https?://simple\\.reddit\\.com/",
+ "^https?://pixel\\.redditmedia\\.com/pixel/",
+ "\\.reddit\\.com/message/compose/?\\?",
+ "^https?://m\\.reddit\\.com/",
+
+ // nogravatar.json
+ "^https?://(\\d|secure)\\.gravatar\\.com/avatar/",
+
+ // meetupeverywhere.json
+ "^https?://.*\\.meetup\\.com/login/",
+
+ // pinterest.json
+ "^https?://www\\.pinterest\\.com/[^/]+/\\^/[^/]+/",
+ "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/\\^/[^/]+/",
+ "^https?://www\\.pinterest\\.com/[^/]+/[^/]+\\.[^/]+",
+ "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/[^/]+\\.[^/]+",
+ "^https?://www\\.pinterest\\.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js",
+ "^https?://www\\.pinterest\\.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js",
+
+ // noonion.json
+
+ // blogs.json
+ "[\\?&]replytocom=",
+ "[\\?&]share=",
+ "/page/%d/$",
+ "\\?showComment(=|%5C)",
+ "/quote-comment-\\d+/$",
+ "/wp-login\\.php\\?",
+ "^https?://r\\-login\\.wordpress\\.com/remote\\-login\\.php",
+ "'\\%20\\+\\%20liker\\.(avatar|profile)_URL\\%20\\+\\%20'",
+ "\\%22\\%20\\+\\%20$wrapper\\.data\\(",
+ "^http://.+\\.blogspot\\.(com|in|com\\.au|co\\.uk|jp|co\\.nz|ca|de|it|fr|se|sg|es|pt|com\\.br|ar|mx|kr)/(search(\\?|/label/)|\\d{4,4}/\\d{2,2}/CSI/$)",
+ "livejournal\\.com/ljcounter/?\\?",
+ "\\?replyto=[0-9]+",
+ "[\\?&]mode=reply",
+ "xiti\\.com/hit\\.xiti\\?",
+ "/stats\\.g\\.doubleclick\\.net/dc\\.js$",
+ "/jetpack-comment/\\?",
+ "\\?like_comment=\\d+",
+ "^https?://.+/.+/disqus\\.com/forums/$",
+ "(\\?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)",
+ "%5Cx26route=/archive",
+ "^http://\\d+\\.media\\.tumblr\\.com/avatar_.+_16\\.png$",
+ "^http://www\\.livejournal\\.com/(tools/memadd|update|login)\\.bml\\?",
+ "^http://[^\\.]+\\.livejournal\\.com/.+[\\?&]mode=reply",
+ "^http://[^\\.]+\\.livejournal\\.com/.+/\\*sup_ru/ru/UTF-8/",
+ "^http://[^\\.]+\\.livejournal\\.com/.+http://[^\\.]+\\.livejournal\\.com/",
+ "^http://[^\\.]+\\.livejournal\\.com/.+/stats\\.g\\.doubleclick\\.net/dc\\.js$",
+ "^https?://www\\.dreamwidth\\.org/tools/(memadd|tellafriend)\\?",
+ "^https?://[^\\.]+\\.dreamwidth\\.org/.+[\\?&]mode=reply",
+
+ // global.json
+ //"/(.*)/(\\1/){3,}",
+ "%25252525",
+ "/App_Themes/.+/App_Themes/",
+ "/bxSlider/.+/bxSlider/",
+ "/bxSlider/bxSlider/",
+ "/slides/slides/.+/slides/",
+ "/slides/.+/slides/slides/",
+ "/slides/slides/slides/",
+ "/js/js/.+/js/",
+ "/js/.+/js/js/",
+ "/js/js/js/",
+ "/css/css/.+/css/",
+ "/css/.+/css/css/",
+ "/css/css/css/",
+ "/styles/styles/.+/styles/",
+ "/styles/.+/styles/styles/",
+ "/styles/styles/styles/",
+ "/scripts/scripts/.+/scripts/",
+ "/scripts/.+/scripts/scripts/",
+ "/scripts/scripts/scripts/",
+ "/images/images/.+/images/",
+ "/images/.+/images/images/",
+ "/images/images/images/",
+ "/img/img/.+/img/",
+ "/img/.+/img/img/",
+ "/img/img/img/",
+ "/clientscript/clientscript/.+/clientscript/",
+ "/clientscript/.+/clientscript/clientscript/",
+ "/clientscript/clientscript/clientscript/",
+ "/lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]",
+ "/(%5C)+(%22|%27)",
+ "/%5C/%5C/",
+ "/%27\\+[^/]+\\+%27",
+ "/%22\\+[^/]+\\+%22",
+ "/%27%20\\+[^/]+\\+%20%27",
+ "/%22%20\\+[^/]+\\+%20%22",
+ "/\\\\+(%22|%27)",
+ "/\\\\+[\"']",
+ "/\\\\/\\\\/",
+ "/'\\+[^/]+\\+'",
+ "^https?://localhost(:\\d+)?/",
+ "^https?://(127|10)\\.\\d+\\.\\d+\\.\\d+(:\\d+)?/",
+ "^https?://172\\.(1[6-9]|2\\d|3[01])\\.\\d+\\.\\d+(:\\d+)?/",
+ "^https?://192\\.168\\.\\d+\\.\\d+(:\\d+)?/",
+ "^https?://www\\.google\\.com/recaptcha/api",
+ "^https?://geo\\.yahoo\\.com/b\\?",
+ "^https?://((s-)?static\\.ak\\.fbcdn\\.net|(connect\\.|www\\.)?facebook\\.com)/connect\\.php/js/.*rsrc\\.php",
+ "^https?://www\\.flickr\\.com/change_language\\.gne",
+ "^https?://((www|web|web-beta|wayback)\\.)?archive\\.org/",
+ "^https?://www\\.google\\.((com|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|by|ca|cd|cf|cg|ch|ci|cl|cm|cn|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|im|iq|is|it|je|jo|ki|kg|kz|la|li|lk|lt|lu|lv|md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|ru|rw|sc|se|sh|si|sk|sn|so|sm|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws|rs|cat)|(com\\.(af|ag|ai|ar|au|bd|bh|bn|bo|br|bz|co|cu|cy|do|ec|eg|et|fj|gh|gi|gt|hk|jm|kh|kw|lb|ly|mm|mt|mx|my|na|nf|ng|ni|np|om|pa|pe|pg|ph|pk|pr|py|qa|sa|sb|sg|sl|sv|tj|tr|tw|ua|uy|vc|vn))|(co\\.(ao|bw|ck|cr|id|il|in|jp|ke|kr|ls|ma|mz|nz|th|tz|ug|uk|uz|ve|vi|za|zm|zw)))/finance\\?noIL=1&q=[^&]+&ei=",
+ "^https?://upload\\.wikimedia\\.org/wikipedia/[^/]+/thumb/",
+ "^http://b\\.scorecardresearch\\.com/",
+ "^http://i\\.dev\\.cdn\\.turner\\.com/",
+ "^http://video-subtitle\\.tedcdn\\.com/",
+ "^http://download\\.ted\\.com/",
+ "^http://msft\\.digitalrivercontent\\.net/win/.+\\.iso",
+ "^https?://tmz\\.vo\\.llnwd\\.net/",
+ "^https?://(www\\.)?megaupload\\.com/",
+ "^https?://(www\\.)?filesonic\\.com/",
+ "^https?://(www\\.)?wupload\\.com/",
+ "^https?://prod-preview\\.wired\\.com/",
+ "^http://([^\\./]+\\.)?stream\\.publicradio\\.org/",
+ "^http://icecast\\.streaming\\.castor\\.nl/",
+ "^http://wm1\\.streaming\\.castor\\.nl:8000/",
+ "^http://icecast\\.databoss\\.nl:8000/",
+ "^http://stream\\.rynothebearded\\.com:8000/",
+ "^http://mp3\\.live\\.tv-radio\\.com/",
+ "^http://av\\.rasset\\.ie/av/live/",
+ "^http://gcnplayer\\.gcnlive\\.com/.+",
+ "^http://streaming\\.radionomy\\.com/",
+ "^http://mp3\\.ffh\\.de/",
+ "^http://(www\\.)?theradio\\.cc\\:8000/",
+ "^http://(audio\\d?|nfw)\\.video\\.ria\\.ru/",
+ "^http://eu1\\.fastcast4u\\.com:3048/",
+ "^http://[^\\./]+\\.radioscoop\\.(com|net):\\d+/",
+ "^http://[^\\./]+\\.streamchan\\.org:\\d+/",
+ "^http://[^/]*musicproxy\\.s12\\.de/",
+ "^http://stream\\.rfi\\.fr/",
+ "^http://striiming\\d?\\.trio\\.ee/",
+ "^http://streamer\\.radiocampus\\.be(:\\d+)?/",
+ "^http://relay\\.broadcastify\\.com/",
+ "^http://audio\\d?\\.radioreference\\.com/",
+ "^http://[^/]+\\.akadostream\\.ru(:\\d+)?/",
+ "^http://radio\\.silver\\.ru(:\\d+)?/",
+ "^http://icecast\\.szwoelf\\.com:8000/",
+ "^http://altair\\.micronick\\.com:8080/\\?action=stream",
+ "^http://94\\.25\\.53\\.13[1-4]/.+\\.mp3$",
+ "^http://server\\.lradio\\.ru:\\d+/",
+ "^http://188\\.93\\.17\\.201:8080/",
+ "^http://81\\.19\\.85\\.19[56]/.+\\.mp3$",
+ "^http://81\\.19\\.85\\.203/.+\\.mp3$",
+ "^http://play(\\d+)?\\.radio13\\.ru:8000/",
+ "^http://stream(\\d+)?\\.media\\.rambler\\.ru/",
+ "^http://pub(\\d+)?\\.di\\.fm/",
+ "^http://vostok\\.fmtuner\\.ru/",
+ "^http://109\\.120\\.141\\.181:8000/",
+ "^http://195\\.88\\.63\\.114:8000/",
+ "^http://radiosilver\\.corbina\\.net:8000/",
+ "^http://89\\.251\\.147\\.100/",
+ "^http://bcs\\d?\\.fontanka\\.fm:8000/",
+ "^http://stream2\\.cnmns\\.net/",
+ "^http://[^/]+\\.streamtheworld\\.com/",
+ "^http://[^/]+\\.gaduradio\\.pl/",
+ "^http://anka\\.org:8080/",
+ "^http://radio\\.visionotaku\\.com:8000/",
+ "^http://stream\\.r-a-d\\.io/",
+ "^http://r-a-d\\.io/.+\\.mp3$",
+ "^http://95\\.81\\.155\\.17/",
+ "^https?://icecast\\.rtl2?\\.fr/",
+ "^http://mp3tslg\\.tdf-cdn\\.com/",
+ "^http://[^/]+/anony/mjpg\\.cgi$",
+ "^https?://air\\.radiorecord\\.ru(:\\d+)?/",
+ "^https?://[^/]+\\.rastream\\.com(:\\d+)?/",
+ "^https?://audiots\\.scdn\\.arkena\\.com/",
+ "^https?://(www|draft)\\.blogger\\.com/(navbar\\.g|post-edit\\.g|delete-comment\\.g|comment-iframe\\.g|share-post\\.g|email-post\\.g|blog-this\\.g|delete-backlink\\.g|rearrange|blog_this\\.pyra)\\?",
+ "^https?://www\\.tumblr\\.com/(impixu\\?|share(/link/?)?\\?|reblog/)",
+ "^https?://plus\\.google\\.com/share\\?",
+ "^https?://(apis|plusone)\\.google\\.com/_/\\+1/",
+ "^https?://(ssl\\.|www\\.)?reddit\\.com/(login\\?dest=|submit\\?|static/button/button)",
+ "^https?://digg\\.com/submit\\?",
+ "^https?://(www\\.)?facebook\\.com/(plugins/like(box)?\\.php|sharer/sharer\\.php|sharer?\\.php|dialog/(feed|share))\\?",
+ "^https?://(www\\.)?twitter\\.com/(share\\?|intent/((re)?tweet|favorite)|home/?\\?status=|\\?status=)",
+ "^https?://platform\\d?\\.twitter\\.com/widgets/tweet_button.html\\?",
+ "^https?://www\\.newsvine\\.com/_wine/save\\?",
+ "^https?://www\\.netvibes\\.com/subscribe\\.php\\?",
+ "^https?://add\\.my\\.yahoo\\.com/(rss|content)\\?",
+ "^http://www\\.addtoany\\.com/(add_to/|share_save\\?)",
+ "^https?://www\\.addthis\\.com/bookmark\\.php\\?",
+ "^https?://(www\\.)?pinterest\\.com/pin/create/",
+ "^https?://www\\.linkedin\\.com/(cws/share|shareArticle)\\?",
+ "^https?://(www\\.)?stumbleupon\\.com/(submit\\?|badge/embed/)",
+ "^https?://csp\\.cyworld\\.com/bi/bi_recommend_pop\\.php\\?",
+ "^https://share\\.flipboard\\.com/bookmarklet/popout\\?",
+ "^https?://flattr.com/submit/auto\\?",
+ "^https?://(www\\.)?myspace\\.com/Modules/PostTo/",
+ "^https?://www\\.google\\.com/bookmarks/mark\\?",
+ "^http://myweb2\\.search\\.yahoo\\.com/myresults/bookmarklet\\?",
+ "^http://vuible\\.com/pins-settings/",
+ "^https?://news\\.ycombinator\\.com/submitlink\\?",
+ "^http://reporter\\.es\\.msn\\.com/\\?fn=contribute",
+ "^http://www\\.blinklist\\.com/index\\.php\\?Action=Blink/addblink\\.php",
+ "^http://sphinn\\.com/index\\.php\\?c=post&m=submit&",
+ "^http://posterous\\.com/share\\?",
+ "^http://del\\.icio\\.us/post\\?",
+ "^https?://delicious\\.com/(save|post)\\?",
+ "^https?://(www\\.)?friendfeed\\.com/share\\?",
+ "^https?://(www\\.)?xing\\.com/(app/user\\?op=share|social_plugins/share\\?)",
+ "^http://iwiw\\.hu/pages/share/share\\.jsp\\?",
+ "^http://memori(\\.qip)?\\.ru/link/\\?",
+ "^http://wow\\.ya\\.ru/posts_(add|share)_link\\.xml\\?",
+ "^https?://connect\\.mail\\.ru/share\\?",
+ "^http://zakladki\\.yandex\\.ru/newlink\\.xml\\?",
+ "^https?://vkontakte\\.ru/share\\.php\\?",
+ "^https?://www\\.odnoklassniki\\.ru/dk\\?st\\.cmd=addShare",
+ "^https?://www\\.google\\.com/(reader/link\\?|buzz/post\\?)",
+ "^https?://service\\.weibo\\.com/share/share\\.php\\?",
+ "^https?://(www\\.)?technorati\\.com/faves/?\\?add=",
+ "^https?://bufferapp\\.com/add\\?",
+ "^https?://b\\.hatena\\.ne\\.jp/add\\?",
+ "^https?://api\\.addthis\\.com/",
+ "^https?://bookmark\\.naver\\.com/post\\?",
+ "^https?://mail\\.google\\.com/mail/",
+ "^http://pixel\\.blog\\.hu/",
+ "^https?://pixel\\.quantserve\\.com/",
+ "^http://b\\.scorecardresearch\\.com/",
+ "^https?://(www|ssl)\\.google-analytics\\.com/(r/)?(__utm\\.gif|collect\\?)",
+ "^https?://p\\.opt\\.fimserve\\.com/",
+ "^https?://(\\d|www|secure)\\.gravatar\\.com/avatar/ad516503a11cd5ca435acc9bb6523536",
+ "^https?://imageshack\\.com/lost$",
+ "^https?://[^/]+\\.corp\\.ne1\\.yahoo\\.com/",
+ "^https?://.+/js-agent\\.newrelic\\.com/nr-\\d{3,3}(\\.min)?\\.js$",
+ "^https?://.+/stats\\.g\\.doubleclick\\.net/dc\\.js$",
+ "^https?://.+/js/chartbeat\\.js$",
+ "^http://www\\.khaleejtimes\\.com/.+/kt_.+/kt_",
+ "^http://www\\.khaleejtimes\\.com/.+/images/.+/images/",
+ "^http://www\\.khaleejtimes\\.com/.+/imgactv/.+/imgactv/",
+ "^http://photobucket\\.com/.+/albums/.+/albums/",
+ "^https?://([^/]+\\.)?gdcvault\\.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)",
+ "^https://static\\.licdn\\.com/sc/p/com\\.linkedin\\.nux(:|%3A)nux-static-content(\\+|%2B)[\\d\\.]+/f/",
+ "^https?://www\\.flickr\\.com/(explore/|photos/[^/]+/(sets/\\d+/(page\\d+/)?)?)\\d+_[a-f0-9]+(_[a-z])?\\.jpg$",
+ "^https?://static\\.licdn\\.com/sc/p/.+/f//",
+ "^http://www\\.warnerbros\\.com/\\d+$",
+ "^https?://tm\\.uol\\.com\\.br/h/.+/h/",
+ "^https?://media\\.opb\\.org/clips/embed/.+\\.js$",
+
+ // twitter.json
+ "^https?://((?:www|mobile)\\.)?twitter\\.com/.+\\?(?:id|lang|locale|screen_name)=",
+ "^https?://mobile\\.twitter\\.com/i/anonymize\\?data=",
+
+ // imdb.json
+ "^http://b\\.scorecardresearch\\.com/",
+ "^http://ad\\.doubleclick\\.net/",
+ "^http://www\\.imdb\\.com/rd/",
+ "^http://www\\.imdb\\.com/.+\\?ref_=",
+ "^http://www\\.imdb\\.com/.+/board/flat/",
+ "^http://www\\.imdb\\.com/.+/board/inline/",
+ "^http://www\\.imdb\\.com/.+/board/thread/",
+ "^http://www\\.imdb\\.com/help/boards_posting\\.html",
+ "^http://www\\.imdb\\.com/register/",
+ "^http://www\\.imdb\\.com/.+/board/.+/\\d+\\?d=",
+ "^http://www\\.imdb\\.com/.+/videogallery/.+/.+/",
+
+ // facebook.json
+ "^https?://error\\.facebook\\.com/common/scribe_endpoint\\.php\\?c=",
+ "^https?://www\\.facebook\\.com/[^/]+/(posts/|app_)[^/]+\\?(ref=page_internal&)?_fb_noscript=",
+ "^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/.{4,4}/",
+ "^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/\\?type=",
+
+ // internetcentrum.json
+ "%3Bamp%3Bamp",
+ "&action=edit",
+ "action=(?:komentar|send)",
+ "action=(?:multiple_products_add_product|notify|add_product|buy_now)",
+ "&action=submit",
+ "&amp;action=edit",
+ "amp;amp;",
+ "answer=.+?&anksent=true",
+ "[a-z0-9]=(?:off|on)",
+ "blog=1&disp=msgform",
+ "\\?cal=",
+ "calendar_menu/calendar\\.php",
+ "calendar_menu/event\\.php",
+ "calendar\\.php",
+ "calendar_scheduler\\.php",
+ "captcha.php",
+ "cas12&cas12",
+ "comment\\.php\\?akce=new",
+ "/comment/reply/\\d+",
+ "cPath=.+&sort=.+",
+ "destination=node/%2F\\d+",
+ "destination=node/\\d+",
+ "(?:displayimage|thumbnails)\\.php\\?pos=-\\d+",
+ "file=posting.+mode=quote",
+ "&highlight=&",
+ "^http://harizzzma\\.com",
+ "^http://www.nahraj.net/",
+ "index.*\\.php\\?option=com_eventcal",
+ "index.php\\?site=calendar",
+ "index\\.php\\?site=guestbook&type=(?:ASC|DESC)",
+ "index.php/Speci%C3%A1ln%C3%AD",
+ "index.php\\?title=Diskuse:",
+ "index.php\\?title=MediaWiki_diskuse:",
+ "index.php\\?title=Soubor_diskuse",
+ "index.php\\?title=Speci%C3%A1ln%C3%AD",
+ "index\\.php\\?\\w+&rok=(1995|2016)&mesic=\\d+&autor=\\d+$",
+ "index\\.php\\?.+year=198.",
+ "index\\.php\\?.+year=203.",
+ "kalendar-akci",
+ "kalendar\\.php",
+ "kalendarrok=\\d{4}",
+ //"lang=(?!czech|english)",
+ //"language=(?!cs|en)",
+ "LightNEasy\\.php\\?do=login",
+ "limit=.+limit=.+",
+ "login=",
+ "login\\.php",
+ "(?:login|registrace|live\\?)",
+ "mact=Calendar",
+ "main_page=(?:product_reviews_write|login|cookie_usage)",
+ "memberlist\\.php\\?mode=email",
+ "memberlist\\.php\\?mode=.+order=",
+ "(?:memberlist|viewprofile|viewtopic)\\.php\\?.*sk=.&sd=.",
+ "mini.+calendar",
+ "mm=\\d+.+yy=\\d{4}",
+ "mode=(?:lostpassword|sendpassword)",
+ "modules.+name=Forums.+view=(?:next|previous)",
+ "modules\\.php\\?name=coppermine.*file=displayimage.+&slideshow=\\d+",
+ "modules\\.php\\?name=coppermine.*meta=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)",
+ "modules\\.php\\?name=Statistics",
+ "mo=\\d+.+ye=\\d{4}",
+ "name=Kalender",
+ "name=Statistics",
+ "option=com_jcalpro.+date=\\d{4}-",
+ "\\?option=com.+&month=.+&year=\\d{4}",
+ "option=&Itemid=.+&date=\\d{4}-",
+ //"order=(?!1)",
+ "orderby=(?:name|note|count|news)",
+ "photo.php\\?i=-\\d+",
+ "/photos.+\\?url=",
+ ".*\\..*\\..*\\.pl",
+ "p=ordersBasket.+sOption=add",
+ "portal\\.php\\?month=[\\d]+",
+ "postdays=0&postorder=asc",
+ "prev_next=(?:prev|next)",
+ "/calendar/",
+ "product_reviews_write\\.php\\?",
+ "profile\\.php\\?mode=email",
+ "profile\\.php\\?mode=register",
+ "\\?q=event.+/(?:day|list|month|table|week)/all/all",
+ "random_num=\\d+",
+ "Recentchangeslinked/",
+ "report\\.php\\?f=.+",
+ "search_id=mini_cal&d=\\d+",
+ "SESSION_ID=",
+ "showcal\\.php",
+ "site=guestbook.+type=(?:ASC|DESC)",
+ //"/sites/all/(sites|modules|libraries|scripts|themes)/.+/\\1",
+ "Souprava=.+Souprava=.+",
+ "Special:Whatlinkshere",
+ "start-index=-\\d+",
+ "/switchuilocale/",
+ "target[xy]=.+target[xy]=.+",
+ "tellafriend\\.php",
+ ":Userlogin&",
+ "user/(?:register|login)",
+ "viewtopic\\.php\\?.*highlight=",
+ "viewtopic\\.php\\?p=\\d+",
+ "viewtopic\\.php\\?.+view=print",
+ "y=\\d{4}&m=\\d+",
+
+ // forums.json
+ "/cron\\.php\\?",
+ "/external\\.php\\?type=rss",
+ "/login\\.php\\?",
+ "/newreply\\.php\\?",
+ "/private\\.php\\?",
+ "/privmsg\\.php\\?",
+ "/register\\.php\\?",
+ "/sendmessage\\.php\\?",
+ "/subscription\\.php\\?",
+ "/posting\\.php\\?",
+ "/viewtopic\\.php\\?.+&view=(next|previous)",
+ "/viewtopic\\.php\\?.+&hilit=",
+ "/feed\\.php\\?",
+ "/index\\.php\\?option=com_mailto",
+ "&view=login&return=",
+ "&format=opensearch",
+ "/misc\\.php\\?do=whoposted",
+ "/newthread\\.php\\?",
+ "/post_thanks\\.php\\?",
+ "/blog_post\\.php\\?do=newblog",
+ "/forumdisplay\\.php.*[\\?&]do=markread",
+ "/userpoll/vote\\.php\\?",
+ "/showthread\\.php.*[\\?&]goto=(next(old|new)est|newpost)",
+ "/editpost\\.php\\?",
+ "/\\?view=getlastpost$",
+ "/index\\.php\\?sharelink=",
+ "/ucp\\.php\\?mode=delete_cookies",
+}
diff --git a/scope.go b/scope.go
index a2c06b6..ccba5f5 100644
--- a/scope.go
+++ b/scope.go
@@ -3,6 +3,7 @@ package crawl
import (
"fmt"
"net/url"
+ "regexp"
"strings"
)
@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
}
return NewURLPrefixScope(pfx)
}
+
+// regexpIgnoreScope rejects URLs that match any of a list of compiled
+// regular expressions.
+type regexpIgnoreScope struct {
+	ignores []*regexp.Regexp
+}
+
+// Check reports whether uri is allowed: it returns false as soon as
+// the string form of uri matches one of the ignore expressions, and
+// true if none of them match. The depth argument is not used.
+func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
+	target := uri.String()
+	for idx := range s.ignores {
+		if s.ignores[idx].MatchString(target) {
+			return false
+		}
+	}
+	return true
+}
+
+// NewRegexpIgnoreScope returns a Scope that excludes every URL
+// matching at least one of the given regular expressions. If ignores
+// is nil, the built-in defaultIgnorePatterns list is used instead.
+// Patterns are compiled with regexp.MustCompile, so an invalid
+// expression causes a panic.
+func NewRegexpIgnoreScope(ignores []string) Scope {
+	patterns := ignores
+	if patterns == nil {
+		patterns = defaultIgnorePatterns
+	}
+
+	compiled := make([]*regexp.Regexp, len(patterns))
+	for n, expr := range patterns {
+		compiled[n] = regexp.MustCompile(expr)
+	}
+	return &regexpIgnoreScope{ignores: compiled}
+}