diff options
author | ale <ale@incal.net> | 2017-12-19 00:12:11 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2017-12-19 00:12:11 +0000 |
commit | 4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch) | |
tree | c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /analysis | |
parent | 77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff) | |
download | crawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.tar.gz crawl-4cd67e7234943baf31b2e122f8ee3c70c21fb489.zip |
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening
edges a bit to include related resources of HTML pages (which makes
for more complete archives if desired).
Diffstat (limited to 'analysis')
-rw-r--r-- | analysis/links.go | 49 |
1 files changed, 30 insertions, 19 deletions
```diff
diff --git a/analysis/links.go b/analysis/links.go
index 9fdf8fb..5d61547 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -6,31 +6,39 @@ import (
 	"fmt"
 	"io/ioutil"
 	"net/http"
-	"net/url"
 	"regexp"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
+
+	"git.autistici.org/ale/crawl"
 )
 
 var (
-	urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+	urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
 
 	linkMatches = []struct {
-		tag  string
-		attr string
+		tag     string
+		attr    string
+		linkTag int
 	}{
-		{"a", "href"},
-		{"link", "href"},
-		{"img", "src"},
-		{"script", "src"},
+		{"a", "href", crawl.TagPrimary},
+		{"link", "href", crawl.TagRelated},
+		{"img", "src", crawl.TagRelated},
+		{"script", "src", crawl.TagRelated},
 	}
 )
 
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+	URL string
+	Tag int
+}
+
 // GetLinks returns all the links found in a document. Currently only
 // parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
-	var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+	var outlinks []rawOutlink
 
 	ctype := resp.Header.Get("Content-Type")
 	if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 		for _, lm := range linkMatches {
 			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
 				val, _ := s.Attr(lm.attr)
-				outlinks = append(outlinks, val)
+				outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
 			})
 		}
 	} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 		// expression to extract "url()" links from CSS.
 		if data, err := ioutil.ReadAll(resp.Body); err == nil {
 			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
-				outlinks = append(outlinks, val[1])
+				outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
 			}
 		}
 	}
 
 	// Parse outbound links relative to the request URI, and
 	// return unique results.
-	var result []*url.URL
-	links := make(map[string]*url.URL)
-	for _, val := range outlinks {
-		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
-			links[linkurl.String()] = linkurl
+	var result []crawl.Outlink
+	links := make(map[string]crawl.Outlink)
+	for _, l := range outlinks {
+		if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+			links[linkurl.String()] = crawl.Outlink{
+				URL: linkurl,
+				Tag: l.Tag,
+			}
 		}
 	}
-	for _, u := range links {
-		result = append(result, u)
+	for _, l := range links {
+		result = append(result, l)
 	}
 	return result, nil
 }
```