diff options
author | ale <ale@gramma> | 2019-10-07 15:07:15 +0200 |
---|---|---|
committer | ale <ale@gramma> | 2019-10-07 15:07:15 +0200 |
commit | 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch) | |
tree | 65a668ea9507a38fa30b886ab2bf06e1621638a5 /analysis | |
parent | f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff) | |
download | crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip |
Parse links in inline style blocks
Diffstat (limited to 'analysis')
-rw-r--r-- | analysis/links.go | 31 | ||||
-rw-r--r-- | analysis/links_test.go | 80 |
2 files changed, 97 insertions, 14 deletions
diff --git a/analysis/links.go b/analysis/links.go index 398a9e0..521b8fe 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -4,6 +4,7 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "regexp" @@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink { ctype := resp.Header.Get("Content-Type") switch { case strings.HasPrefix(ctype, "text/html"): - return extractLinksFromHTML(resp) + return extractLinksFromHTML(resp.Body, nil) case strings.HasPrefix(ctype, "text/css"): - return extractLinksFromCSS(resp) + return extractLinksFromCSS(resp.Body, nil) default: return nil } } -func extractLinksFromHTML(resp *http.Response) []rawOutlink { - var outlinks []rawOutlink - // Use goquery to extract links from the parsed HTML - // contents (query patterns are described in the - // linkMatches table). - doc, err := goquery.NewDocumentFromReader(resp.Body) +func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use goquery to extract links from the parsed HTML contents + // (query patterns are described in the linkMatches table). + doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil } - for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } + + // Find the inline <style> sections and parse them separately as CSS. + doc.Find("style").Each(func(i int, s *goquery.Selection) { + outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks) + }) + return outlinks } -func extractLinksFromCSS(resp *http.Response) []rawOutlink { - // Use a simple (and actually quite bad) regular - // expression to extract "url()" links from CSS. - var outlinks []rawOutlink - if data, err := ioutil.ReadAll(resp.Body); err == nil { +func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use a simple (and actually quite bad) regular expression to + // extract "url()" and "@import" links from CSS. + if data, err := ioutil.ReadAll(r); err == nil { for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) { outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated}) } diff --git a/analysis/links_test.go b/analysis/links_test.go new file mode 100644 index 0000000..1bd906b --- /dev/null +++ b/analysis/links_test.go @@ -0,0 +1,80 @@ +package analysis + +import ( + "fmt" + "io/ioutil" + "net/http" + "net/url" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func makeResponse(ctype, body string) *http.Response { + u, _ := url.Parse("https://example.com/") + r := &http.Response{ + Header: make(http.Header), + Body: ioutil.NopCloser(strings.NewReader(body)), + Request: &http.Request{ + URL: u, + }, + } + r.Header.Set("Content-Type", ctype) + return r +} + +type testdata struct { + ctype string + body string + expectedLinks []string +} + +func (td *testdata) runTestCase() error { + links, err := GetLinks(makeResponse(td.ctype, td.body)) + if err != nil { + return fmt.Errorf("GetLinks() error: %v", err) + } + var linkStr []string + for _, l := range links { + linkStr = append(linkStr, l.URL.String()) + } + if diff := cmp.Diff(td.expectedLinks, linkStr); diff != "" { + return fmt.Errorf("unexpected result:\n%s", diff) + } + return nil +} + +var tests = []testdata{ + { + "text/html", + ` +<html><body> +<a href="/link1">link</a> +</body></html> +`, + []string{ + "https://example.com/link1", + }, + }, + { + "text/html", + ` +<html><head><style type="text/css"> +body { background: url('/link1'); } +</style></head> +<body></body></html> +`, + []string{ + "https://example.com/link1", + }, + }, +} + +func TestLinks(t *testing.T) { + for _, tt := range tests { + if err := tt.runTestCase(); err != nil { + t.Error(err) + } + } +} |