aboutsummaryrefslogtreecommitdiff
path: root/analysis/links.go
diff options
context:
space:
mode:
Diffstat (limited to 'analysis/links.go')
-rw-r--r--analysis/links.go31
1 files changed, 17 insertions, 14 deletions
diff --git a/analysis/links.go b/analysis/links.go
index 398a9e0..521b8fe 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -4,6 +4,7 @@ package analysis
import (
"fmt"
+ "io"
"io/ioutil"
"net/http"
"regexp"
@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
- return extractLinksFromHTML(resp)
+ return extractLinksFromHTML(resp.Body, nil)
case strings.HasPrefix(ctype, "text/css"):
- return extractLinksFromCSS(resp)
+ return extractLinksFromCSS(resp.Body, nil)
default:
return nil
}
}
-func extractLinksFromHTML(resp *http.Response) []rawOutlink {
- var outlinks []rawOutlink
- // Use goquery to extract links from the parsed HTML
- // contents (query patterns are described in the
- // linkMatches table).
- doc, err := goquery.NewDocumentFromReader(resp.Body)
+func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use goquery to extract links from the parsed HTML contents
+ // (query patterns are described in the linkMatches table).
+ doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil
}
-
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
+
+ // Find the inline <style> sections and parse them separately as CSS.
+ doc.Find("style").Each(func(i int, s *goquery.Selection) {
+ outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
+ })
+
return outlinks
}
-func extractLinksFromCSS(resp *http.Response) []rawOutlink {
- // Use a simple (and actually quite bad) regular
- // expression to extract "url()" links from CSS.
- var outlinks []rawOutlink
- if data, err := ioutil.ReadAll(resp.Body); err == nil {
+func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use a simple (and actually quite bad) regular expression to
+ // extract "url()" and "@import" links from CSS.
+ if data, err := ioutil.ReadAll(r); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}