diff options
author | ale <ale@incal.net> | 2014-12-20 10:49:36 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2014-12-20 10:49:36 +0000 |
commit | f0c14e5e36d640d0f6801691c69dffb69459fe10 (patch) | |
tree | b9bd2ae00aa3ca65175e37959875cdd52488b007 /analysis | |
parent | d4c561c23d016cf6a7507840153e835994915cb8 (diff) | |
download | crawl-f0c14e5e36d640d0f6801691c69dffb69459fe10.tar.gz crawl-f0c14e5e36d640d0f6801691c69dffb69459fe10.zip |
move link extraction to a common location
Diffstat (limited to 'analysis')
-rw-r--r-- | analysis/links.go | 67 |
1 files changed, 67 insertions, 0 deletions
diff --git a/analysis/links.go b/analysis/links.go new file mode 100644 index 0000000..36244c0 --- /dev/null +++ b/analysis/links.go @@ -0,0 +1,67 @@ +// Extract links from HTML/CSS content. + +package analysis + +import ( + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +var ( + urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`) + + linkMatches = []struct { + tag string + attr string + }{ + {"a", "href"}, + {"link", "href"}, + {"img", "src"}, + {"script", "src"}, + } +) + +func GetLinks(resp *http.Response) ([]*url.URL, error) { + var outlinks []string + + ctype := resp.Header.Get("Content-Type") + if strings.HasPrefix(ctype, "text/html") { + doc, err := goquery.NewDocumentFromResponse(resp) + if err != nil { + return nil, err + } + + for _, lm := range linkMatches { + doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { + val, _ := s.Attr(lm.attr) + outlinks = append(outlinks, val) + }) + } + } else if strings.HasPrefix(ctype, "text/css") { + if data, err := ioutil.ReadAll(resp.Body); err == nil { + for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) { + outlinks = append(outlinks, val[1]) + } + } + } + + // Uniquify and parse outbound links. + var result []*url.URL + links := make(map[string]*url.URL) + for _, val := range outlinks { + if linkurl, err := resp.Request.URL.Parse(val); err == nil { + links[linkurl.String()] = linkurl + } + } + for _, link := range links { + result = append(result, link) + } + + return result, nil +} |