author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /analysis
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening the scope edges a bit to include related resources of HTML pages (which makes for more complete archives, if desired).
Diffstat (limited to 'analysis')
-rw-r--r--  analysis/links.go | 49
1 file changed, 30 insertions(+), 19 deletions(-)
diff --git a/analysis/links.go b/analysis/links.go
index 9fdf8fb..5d61547 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -6,31 +6,39 @@ import (
"fmt"
"io/ioutil"
"net/http"
- "net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
+
+ "git.autistici.org/ale/crawl"
)
var (
- urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+ urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
- tag string
- attr string
+ tag string
+ attr string
+ linkTag int
}{
- {"a", "href"},
- {"link", "href"},
- {"img", "src"},
- {"script", "src"},
+ {"a", "href", crawl.TagPrimary},
+ {"link", "href", crawl.TagRelated},
+ {"img", "src", crawl.TagRelated},
+ {"script", "src", crawl.TagRelated},
}
)
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+ URL string
+ Tag int
+}
+
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
- var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+ var outlinks []rawOutlink
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
- outlinks = append(outlinks, val)
+ outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
// expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
- outlinks = append(outlinks, val[1])
+ outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
}
}
// Parse outbound links relative to the request URI, and
// return unique results.
- var result []*url.URL
- links := make(map[string]*url.URL)
- for _, val := range outlinks {
- if linkurl, err := resp.Request.URL.Parse(val); err == nil {
- links[linkurl.String()] = linkurl
+ var result []crawl.Outlink
+ links := make(map[string]crawl.Outlink)
+ for _, l := range outlinks {
+ if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+ links[linkurl.String()] = crawl.Outlink{
+ URL: linkurl,
+ Tag: l.Tag,
+ }
}
}
- for _, u := range links {
- result = append(result, u)
+ for _, l := range links {
+ result = append(result, l)
}
return result, nil
}