about | summary | refs | log | tree | commit | diff
path: root/cmd
diff options
context:
space:
mode:
author: ale <ale@incal.net> 2020-02-17 21:39:06 +0000
committer: ale <ale@incal.net> 2020-02-17 21:40:29 +0000
commit: 533f472553d6db42a1ae704285e33f53cf90f81d (patch)
tree: 122c472cc685e567d25794357c90ff92b7165b1c /cmd
parent: fec78595f9986cb908ef1ff61cfb3a5828986456 (diff)
download: crawl-533f472553d6db42a1ae704285e33f53cf90f81d.tar.gz
crawl-533f472553d6db42a1ae704285e33f53cf90f81d.zip
Propagate the link tag through redirects
To do this, the tag has to be plumbed through the queue and the Handler interface, but the change should allow the resources associated with a page to be fetched via the IncludeRelatedScope even when such a resource is behind a redirect.
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/crawl/crawl.go 4
1 file changed, 2 insertions, 2 deletions
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 54bb505..a79e0a6 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -33,7 +33,7 @@ var (
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 100, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
- excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
+ excludeRelated = flag.Bool("exclude-related", false, "do not include related resources (css, images, etc) if their URL is not in scope")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
@@ -127,7 +127,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
return w.Close()
}
-func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error {
+func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, tag, depth int, resp *http.Response, _ error) error {
// Read the response body (so we can save it to the WARC
// output) and replace it with a buffer.
data, derr := ioutil.ReadAll(resp.Body)