diff options
author | ale <ale@incal.net> | 2018-08-31 08:29:14 +0100 |
---|---|---|
committer | ale <ale@incal.net> | 2018-08-31 08:29:14 +0100 |
commit | ee1a3d8e5278a4a4e8435f9129852b95a9c22afb (patch) | |
tree | fd7a42cfff4aed5bd2379feb35f7172287430ba2 /analysis | |
parent | b3d419486a87c9193c2fd6c16168f600876e0f73 (diff) | |
download | crawl-ee1a3d8e5278a4a4e8435f9129852b95a9c22afb.tar.gz crawl-ee1a3d8e5278a4a4e8435f9129852b95a9c22afb.zip |
Improve error checking
Detect write errors (both on the database and to the WARC output) and
abort with an error message.
Also fix a bunch of harmless lint warnings.
Diffstat (limited to 'analysis')
-rw-r--r-- | analysis/links.go | 73 |
1 files changed, 44 insertions, 29 deletions
```diff
diff --git a/analysis/links.go b/analysis/links.go
index 97957ad..2bcfff1 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -39,39 +39,11 @@ type rawOutlink struct {
 // GetLinks returns all the links found in a document. Currently only
 // parses HTML pages and CSS stylesheets.
 func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
-	var outlinks []rawOutlink
-
-	ctype := resp.Header.Get("Content-Type")
-	if strings.HasPrefix(ctype, "text/html") {
-		// Use goquery to extract links from the parsed HTML
-		// contents (query patterns are described in the
-		// linkMatches table).
-		doc, err := goquery.NewDocumentFromResponse(resp)
-		if err != nil {
-			return nil, err
-		}
-
-		for _, lm := range linkMatches {
-			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
-				val, _ := s.Attr(lm.attr)
-				outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
-			})
-		}
-	} else if strings.HasPrefix(ctype, "text/css") {
-		// Use a simple (and actually quite bad) regular
-		// expression to extract "url()" links from CSS.
-		if data, err := ioutil.ReadAll(resp.Body); err == nil {
-			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
-				outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
-			}
-		}
-	}
-
 	// Parse outbound links relative to the request URI, and
 	// return unique results.
 	var result []crawl.Outlink
 	links := make(map[string]crawl.Outlink)
-	for _, l := range outlinks {
+	for _, l := range extractLinks(resp) {
 		// Skip data: URLs altogether.
 		if strings.HasPrefix(l.URL, "data:") {
 			continue
@@ -88,3 +60,46 @@ func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
 	}
 	return result, nil
 }
+
+func extractLinks(resp *http.Response) []rawOutlink {
+	ctype := resp.Header.Get("Content-Type")
+	switch {
+	case strings.HasPrefix(ctype, "text/html"):
+		return extractLinksFromHTML(resp)
+	case strings.HasPrefix(ctype, "text/css"):
+		return extractLinksFromCSS(resp)
+	default:
+		return nil
+	}
+}
+
+func extractLinksFromHTML(resp *http.Response) []rawOutlink {
+	var outlinks []rawOutlink
+	// Use goquery to extract links from the parsed HTML
+	// contents (query patterns are described in the
+	// linkMatches table).
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return nil
+	}
+
+	for _, lm := range linkMatches {
+		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
+			val, _ := s.Attr(lm.attr)
+			outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
+		})
+	}
+	return outlinks
+}
+
+func extractLinksFromCSS(resp *http.Response) []rawOutlink {
+	// Use a simple (and actually quite bad) regular
+	// expression to extract "url()" links from CSS.
+	var outlinks []rawOutlink
+	if data, err := ioutil.ReadAll(resp.Body); err == nil {
+		for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
+			outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
+		}
+	}
+	return outlinks
+}
```