about | summary | refs | log | tree | commit | diff
path: root/analysis
diff options
context:
space:
mode:
author ale <ale@gramma> 2019-10-07 15:07:15 +0200
committer ale <ale@gramma> 2019-10-07 15:07:15 +0200
commit 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch)
tree 65a668ea9507a38fa30b886ab2bf06e1621638a5 /analysis
parent f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff)
download crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz
download crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip
Parse links in inline style blocks
Diffstat (limited to 'analysis')
-rw-r--r-- analysis/links.go 31
-rw-r--r-- analysis/links_test.go 80
2 files changed, 97 insertions, 14 deletions
diff --git a/analysis/links.go b/analysis/links.go
index 398a9e0..521b8fe 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -4,6 +4,7 @@ package analysis
import (
"fmt"
+ "io"
"io/ioutil"
"net/http"
"regexp"
@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
- return extractLinksFromHTML(resp)
+ return extractLinksFromHTML(resp.Body, nil)
case strings.HasPrefix(ctype, "text/css"):
- return extractLinksFromCSS(resp)
+ return extractLinksFromCSS(resp.Body, nil)
default:
return nil
}
}
-func extractLinksFromHTML(resp *http.Response) []rawOutlink {
- var outlinks []rawOutlink
- // Use goquery to extract links from the parsed HTML
- // contents (query patterns are described in the
- // linkMatches table).
- doc, err := goquery.NewDocumentFromReader(resp.Body)
+func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use goquery to extract links from the parsed HTML contents
+ // (query patterns are described in the linkMatches table).
+ doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil
}
-
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
+
+ // Find the inline <style> sections and parse them separately as CSS.
+ doc.Find("style").Each(func(i int, s *goquery.Selection) {
+ outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
+ })
+
return outlinks
}
-func extractLinksFromCSS(resp *http.Response) []rawOutlink {
- // Use a simple (and actually quite bad) regular
- // expression to extract "url()" links from CSS.
- var outlinks []rawOutlink
- if data, err := ioutil.ReadAll(resp.Body); err == nil {
+func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use a simple (and actually quite bad) regular expression to
+ // extract "url()" and "@import" links from CSS.
+ if data, err := ioutil.ReadAll(r); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
diff --git a/analysis/links_test.go b/analysis/links_test.go
new file mode 100644
index 0000000..1bd906b
--- /dev/null
+++ b/analysis/links_test.go
@@ -0,0 +1,80 @@
+package analysis
+
+import (
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "strings"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+)
+
+func makeResponse(ctype, body string) *http.Response {
+ u, _ := url.Parse("https://example.com/")
+ r := &http.Response{
+ Header: make(http.Header),
+ Body: ioutil.NopCloser(strings.NewReader(body)),
+ Request: &http.Request{
+ URL: u,
+ },
+ }
+ r.Header.Set("Content-Type", ctype)
+ return r
+}
+
+type testdata struct {
+ ctype string
+ body string
+ expectedLinks []string
+}
+
+func (td *testdata) runTestCase() error {
+ links, err := GetLinks(makeResponse(td.ctype, td.body))
+ if err != nil {
+ return fmt.Errorf("GetLinks() error: %v", err)
+ }
+ var linkStr []string
+ for _, l := range links {
+ linkStr = append(linkStr, l.URL.String())
+ }
+ if diff := cmp.Diff(td.expectedLinks, linkStr); diff != "" {
+ return fmt.Errorf("unexpected result:\n%s", diff)
+ }
+ return nil
+}
+
+var tests = []testdata{
+ {
+ "text/html",
+ `
+<html><body>
+<a href="/link1">link</a>
+</body></html>
+`,
+ []string{
+ "https://example.com/link1",
+ },
+ },
+ {
+ "text/html",
+ `
+<html><head><style type="text/css">
+body { background: url('/link1'); }
+</style></head>
+<body></body></html>
+`,
+ []string{
+ "https://example.com/link1",
+ },
+ },
+}
+
+func TestLinks(t *testing.T) {
+ for _, tt := range tests {
+ if err := tt.runTestCase(); err != nil {
+ t.Error(err)
+ }
+ }
+}