author    ale <ale@incal.net>  2017-12-19 00:12:11 +0000
committer ale <ale@incal.net>  2017-12-19 00:12:11 +0000
commit    4cd67e7234943baf31b2e122f8ee3c70c21fb489 (patch)
tree      c3bf3e88729291ecf0e371d0dd43977cdd1d08ea /analysis
parent    77211d4f6952a4d9cc92378f6a1cbacd3b5426ca (diff)
Add tags (primary/related) to links
This change allows more complex scope boundaries, including loosening the scope edges a bit to include related resources of HTML pages (which makes for more complete archives, if desired).
Diffstat (limited to 'analysis')
-rw-r--r--  analysis/links.go | 49
1 file changed, 30 insertions(+), 19 deletions(-)
diff --git a/analysis/links.go b/analysis/links.go
index 9fdf8fb..5d61547 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -6,31 +6,39 @@ import (
"fmt"
"io/ioutil"
"net/http"
- "net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
+
+ "git.autistici.org/ale/crawl"
)
var (
- urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+ urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
- tag string
- attr string
+ tag string
+ attr string
+ linkTag int
}{
- {"a", "href"},
- {"link", "href"},
- {"img", "src"},
- {"script", "src"},
+ {"a", "href", crawl.TagPrimary},
+ {"link", "href", crawl.TagRelated},
+ {"img", "src", crawl.TagRelated},
+ {"script", "src", crawl.TagRelated},
}
)
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+ URL string
+ Tag int
+}
+
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
- var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+ var outlinks []rawOutlink
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
- outlinks = append(outlinks, val)
+ outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
// expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
- outlinks = append(outlinks, val[1])
+ outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
}
}
// Parse outbound links relative to the request URI, and
// return unique results.
- var result []*url.URL
- links := make(map[string]*url.URL)
- for _, val := range outlinks {
- if linkurl, err := resp.Request.URL.Parse(val); err == nil {
- links[linkurl.String()] = linkurl
+ var result []crawl.Outlink
+ links := make(map[string]crawl.Outlink)
+ for _, l := range outlinks {
+ if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+ links[linkurl.String()] = crawl.Outlink{
+ URL: linkurl,
+ Tag: l.Tag,
+ }
}
}
- for _, u := range links {
- result = append(result, u)
+ for _, l := range links {
+ result = append(result, l)
}
return result, nil
}