From 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 Mon Sep 17 00:00:00 2001 From: ale Date: Mon, 7 Oct 2019 15:07:15 +0200 Subject: Parse links in inline style blocks --- analysis/links.go | 31 ++++++++++--------- analysis/links_test.go | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 6 ++-- go.sum | 39 ++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 17 deletions(-) create mode 100644 analysis/links_test.go create mode 100644 go.sum diff --git a/analysis/links.go b/analysis/links.go index 398a9e0..521b8fe 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -4,6 +4,7 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "regexp" @@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink { ctype := resp.Header.Get("Content-Type") switch { case strings.HasPrefix(ctype, "text/html"): - return extractLinksFromHTML(resp) + return extractLinksFromHTML(resp.Body, nil) case strings.HasPrefix(ctype, "text/css"): - return extractLinksFromCSS(resp) + return extractLinksFromCSS(resp.Body, nil) default: return nil } } -func extractLinksFromHTML(resp *http.Response) []rawOutlink { - var outlinks []rawOutlink - // Use goquery to extract links from the parsed HTML - // contents (query patterns are described in the - // linkMatches table). - doc, err := goquery.NewDocumentFromReader(resp.Body) +func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use goquery to extract links from the parsed HTML contents + // (query patterns are described in the linkMatches table). + doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil } - for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } + + // Find the inline + +`, + []string{ + "https://example.com/link1", + }, + }, +} + +func TestLinks(t *testing.T) { + for _, tt := range tests { + if err := tt.runTestCase(); err != nil { + t.Error(err) + } + } +} diff --git a/go.mod b/go.mod index 792b792..3a6c3df 100644 --- a/go.mod +++ b/go.mod @@ -4,10 +4,10 @@ require ( github.com/PuerkitoBio/goquery v1.5.0 github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 - github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4 - github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf + github.com/andybalholm/cascadia v1.0.0 + github.com/golang/snappy v0.0.1 github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 - golang.org/x/text v0.0.0-20190829152558-3d0f7978add9 + golang.org/x/text v0.3.0 ) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..74b5bff --- /dev/null +++ b/go.sum @@ -0,0 +1,39 @@ +github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk= +github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= +github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 h1:1H3FyRw7YsqIty9WHPOVEGJaFJ1sfGVZ3PPDUw3ob2w= +github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= +github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 h1:7G9lvlxEu1ZPLqJnsRY1MuoBaf2Mg4qbtcxNRXKdzFs= +github.com/pborman/uuid v0.0.0-20171128162732-e53336930665/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34= +github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d h1:OgkXbz/O0zsJoaB+z6n/a3bNGCbCWhBPLfGr6qaBprM= +github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs= +golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.0.0-20190829152558-3d0f7978add9/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -- cgit v1.2.3-54-g00ecf