Parse links in inline style blocks

author: ale <ale@gramma> 2019-10-07 15:07:15 +0200
committer: ale <ale@gramma> 2019-10-07 15:07:15 +0200
commit: 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch)
tree: 65a668ea9507a38fa30b886ab2bf06e1621638a5 /analysis
parent: f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff)
download: crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip
2 files changed, 97 insertions, 14 deletions
diff --git a/analysis/links.go b/analysis/links.go
index 398a9e0..521b8fe 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -4,6 +4,7 @@ package analysis
 
 import (
 	"fmt"
+	"io"
 	"io/ioutil"
 	"net/http"
 	"regexp"
@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
 	ctype := resp.Header.Get("Content-Type")
 	switch {
 	case strings.HasPrefix(ctype, "text/html"):
-		return extractLinksFromHTML(resp)
+		return extractLinksFromHTML(resp.Body, nil)
 	case strings.HasPrefix(ctype, "text/css"):
-		return extractLinksFromCSS(resp)
+		return extractLinksFromCSS(resp.Body, nil)
 	default:
 		return nil
 	}
 }
 
-func extractLinksFromHTML(resp *http.Response) []rawOutlink {
-	var outlinks []rawOutlink
-	// Use goquery to extract links from the parsed HTML
-	// contents (query patterns are described in the
-	// linkMatches table).
-	doc, err := goquery.NewDocumentFromReader(resp.Body)
+func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+	// Use goquery to extract links from the parsed HTML contents
+	// (query patterns are described in the linkMatches table).
+	doc, err := goquery.NewDocumentFromReader(r)
 	if err != nil {
 		return nil
 	}
-
 	for _, lm := range linkMatches {
 		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
 			val, _ := s.Attr(lm.attr)
 			outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
 		})
 	}
+
+	// Find the inline <style> sections and parse them separately as CSS.
+	doc.Find("style").Each(func(i int, s *goquery.Selection) {
+		outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
+	})
+
 	return outlinks
 }
 
-func extractLinksFromCSS(resp *http.Response) []rawOutlink {
-	// Use a simple (and actually quite bad) regular
-	// expression to extract "url()" links from CSS.
-	var outlinks []rawOutlink
-	if data, err := ioutil.ReadAll(resp.Body); err == nil {
+func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+	// Use a simple (and actually quite bad) regular expression to
+	// extract "url()" and "@import" links from CSS.
+	if data, err := ioutil.ReadAll(r); err == nil {
 		for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
 			outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
 		}
diff --git a/analysis/links_test.go b/analysis/links_test.go
new file mode 100644
index 0000000..1bd906b
--- /dev/null
+++ b/analysis/links_test.go
@@ -0,0 +1,80 @@
+package analysis
+
+import (
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func makeResponse(ctype, body string) *http.Response {
+	u, _ := url.Parse("https://example.com/")
+	r := &http.Response{
+		Header: make(http.Header),
+		Body:   ioutil.NopCloser(strings.NewReader(body)),
+		Request: &http.Request{
+			URL: u,
+		},
+	}
+	r.Header.Set("Content-Type", ctype)
+	return r
+}
+
+type testdata struct {
+	ctype         string
+	body          string
+	expectedLinks []string
+}
+
+func (td *testdata) runTestCase() error {
+	links, err := GetLinks(makeResponse(td.ctype, td.body))
+	if err != nil {
+		return fmt.Errorf("GetLinks() error: %v", err)
+	}
+	var linkStr []string
+	for _, l := range links {
+		linkStr = append(linkStr, l.URL.String())
+	}
+	if diff := cmp.Diff(td.expectedLinks, linkStr); diff != "" {
+		return fmt.Errorf("unexpected result:\n%s", diff)
+	}
+	return nil
+}
+
+var tests = []testdata{
+	{
+		"text/html",
+		`
+<html><body>
+<a href="/link1">link</a>
+</body></html>
+`,
+		[]string{
+			"https://example.com/link1",
+		},
+	},
+	{
+		"text/html",
+		`
+<html><head><style type="text/css">
+body { background: url('/link1'); }
+</style></head>
+<body></body></html>
+`,
+		[]string{
+			"https://example.com/link1",
+		},
+	},
+}
+
+func TestLinks(t *testing.T) {
+	for _, tt := range tests {
+		if err := tt.runTestCase(); err != nil {
+			t.Error(err)
+		}
+	}
+}
author	ale <ale@gramma>	2019-10-07 15:07:15 +0200
committer	ale <ale@gramma>	2019-10-07 15:07:15 +0200
commit	6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch)
tree	65a668ea9507a38fa30b886ab2bf06e1621638a5 /analysis
parent	f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff)
download	crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip