aboutsummaryrefslogtreecommitdiff
path: root/analysis
diff options
context:
space:
mode:
author: Jordan <me@jordan.im> 2022-03-24 09:08:13 -0700
committer: Jordan <me@jordan.im> 2022-03-24 09:08:13 -0700
commit: 6355aa4310ff0c32b056580e812ca6f0e2a5ee2f (patch)
tree: 3a3008d4d50e5e19f6805b1e1e03460e202048f9 /analysis
parent: a39310f111cef49ff630cc12fdebabc4df37ec28 (diff)
download: crawl-6355aa4310ff0c32b056580e812ca6f0e2a5ee2f.tar.gz
download: crawl-6355aa4310ff0c32b056580e812ca6f0e2a5ee2f.zip
links, crawl: dramatically reduce memory usage
To prevent excessive memory usage and OOM crashes, store response bodies temporarily on the filesystem (wget-style) and delete them once they have been processed, rather than storing them in memory buffers and passing those around.
Diffstat (limited to 'analysis')
-rw-r--r-- analysis/links.go 11
1 file changed, 6 insertions, 5 deletions
diff --git a/analysis/links.go b/analysis/links.go
index c0663fa..f1b3e99 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -7,6 +7,7 @@ import (
"io"
"io/ioutil"
"net/http"
+ "os"
"regexp"
"strings"
@@ -41,12 +42,12 @@ type rawOutlink struct {
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+func GetLinks(resp *http.Response, body *os.File) ([]crawl.Outlink, error) {
// Parse outbound links relative to the request URI, and
// return unique results.
var result []crawl.Outlink
links := make(map[string]crawl.Outlink)
- for _, l := range extractLinks(resp) {
+ for _, l := range extractLinks(resp, body) {
// Skip data: URLs altogether.
if strings.HasPrefix(l.URL, "data:") {
continue
@@ -64,13 +65,13 @@ func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
return result, nil
}
-func extractLinks(resp *http.Response) []rawOutlink {
+func extractLinks(resp *http.Response, body *os.File) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
- return extractLinksFromHTML(resp.Body, nil)
+ return extractLinksFromHTML(body, nil)
case strings.HasPrefix(ctype, "text/css"):
- return extractLinksFromCSS(resp.Body, nil)
+ return extractLinksFromCSS(body, nil)
default:
return nil
}