diff options
Diffstat (limited to 'analysis/links.go')
-rw-r--r-- | analysis/links.go | 31 |
1 files changed, 17 insertions, 14 deletions
diff --git a/analysis/links.go b/analysis/links.go index 398a9e0..521b8fe 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -4,6 +4,7 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "regexp" @@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink { ctype := resp.Header.Get("Content-Type") switch { case strings.HasPrefix(ctype, "text/html"): - return extractLinksFromHTML(resp) + return extractLinksFromHTML(resp.Body, nil) case strings.HasPrefix(ctype, "text/css"): - return extractLinksFromCSS(resp) + return extractLinksFromCSS(resp.Body, nil) default: return nil } } -func extractLinksFromHTML(resp *http.Response) []rawOutlink { - var outlinks []rawOutlink - // Use goquery to extract links from the parsed HTML - // contents (query patterns are described in the - // linkMatches table). - doc, err := goquery.NewDocumentFromReader(resp.Body) +func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use goquery to extract links from the parsed HTML contents + // (query patterns are described in the linkMatches table). + doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil } - for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } + + // Find the inline <style> sections and parse them separately as CSS. + doc.Find("style").Each(func(i int, s *goquery.Selection) { + outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks) + }) + return outlinks } -func extractLinksFromCSS(resp *http.Response) []rawOutlink { - // Use a simple (and actually quite bad) regular - // expression to extract "url()" links from CSS. - var outlinks []rawOutlink - if data, err := ioutil.ReadAll(resp.Body); err == nil { +func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use a simple (and actually quite bad) regular expression to + // extract "url()" and "@import" links from CSS. + if data, err := ioutil.ReadAll(r); err == nil { for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) { outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated}) } |