From 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 Mon Sep 17 00:00:00 2001 From: ale Date: Mon, 7 Oct 2019 15:07:15 +0200 Subject: Parse links in inline style blocks --- analysis/links.go | 31 ++++++++++--------- analysis/links_test.go | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 14 deletions(-) create mode 100644 analysis/links_test.go (limited to 'analysis') diff --git a/analysis/links.go b/analysis/links.go index 398a9e0..521b8fe 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -4,6 +4,7 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "regexp" @@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink { ctype := resp.Header.Get("Content-Type") switch { case strings.HasPrefix(ctype, "text/html"): - return extractLinksFromHTML(resp) + return extractLinksFromHTML(resp.Body, nil) case strings.HasPrefix(ctype, "text/css"): - return extractLinksFromCSS(resp) + return extractLinksFromCSS(resp.Body, nil) default: return nil } } -func extractLinksFromHTML(resp *http.Response) []rawOutlink { - var outlinks []rawOutlink - // Use goquery to extract links from the parsed HTML - // contents (query patterns are described in the - // linkMatches table). - doc, err := goquery.NewDocumentFromReader(resp.Body) +func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use goquery to extract links from the parsed HTML contents + // (query patterns are described in the linkMatches table). + doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil } - for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } + + // Find the inline + +`, + []string{ + "https://example.com/link1", + }, + }, +} + +func TestLinks(t *testing.T) { + for _, tt := range tests { + if err := tt.runTestCase(); err != nil { + t.Error(err) + } + } +} -- cgit v1.2.3-54-g00ecf