diff options
author | ale <ale@gramma> | 2019-10-07 15:07:15 +0200 |
---|---|---|
committer | ale <ale@gramma> | 2019-10-07 15:07:15 +0200 |
commit | 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch) | |
tree | 65a668ea9507a38fa30b886ab2bf06e1621638a5 | |
parent | f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff) | |
download | crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip |
Parse links in inline style blocks
-rw-r--r-- | analysis/links.go | 31 |
-rw-r--r-- | analysis/links_test.go | 80 |
-rw-r--r-- | go.mod | 6 |
-rw-r--r-- | go.sum | 39 |
4 files changed, 139 insertions, 17 deletions
diff --git a/analysis/links.go b/analysis/links.go index 398a9e0..521b8fe 100644 --- a/analysis/links.go +++ b/analysis/links.go @@ -4,6 +4,7 @@ package analysis import ( "fmt" + "io" "io/ioutil" "net/http" "regexp" @@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink { ctype := resp.Header.Get("Content-Type") switch { case strings.HasPrefix(ctype, "text/html"): - return extractLinksFromHTML(resp) + return extractLinksFromHTML(resp.Body, nil) case strings.HasPrefix(ctype, "text/css"): - return extractLinksFromCSS(resp) + return extractLinksFromCSS(resp.Body, nil) default: return nil } } -func extractLinksFromHTML(resp *http.Response) []rawOutlink { - var outlinks []rawOutlink - // Use goquery to extract links from the parsed HTML - // contents (query patterns are described in the - // linkMatches table). - doc, err := goquery.NewDocumentFromReader(resp.Body) +func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use goquery to extract links from the parsed HTML contents + // (query patterns are described in the linkMatches table). + doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil } - for _, lm := range linkMatches { doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) { val, _ := s.Attr(lm.attr) outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag}) }) } + + // Find the inline <style> sections and parse them separately as CSS. + doc.Find("style").Each(func(i int, s *goquery.Selection) { + outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks) + }) + return outlinks } -func extractLinksFromCSS(resp *http.Response) []rawOutlink { - // Use a simple (and actually quite bad) regular - // expression to extract "url()" links from CSS. 
- var outlinks []rawOutlink - if data, err := ioutil.ReadAll(resp.Body); err == nil { +func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink { + // Use a simple (and actually quite bad) regular expression to + // extract "url()" and "@import" links from CSS. + if data, err := ioutil.ReadAll(r); err == nil { for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) { outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated}) } diff --git a/analysis/links_test.go b/analysis/links_test.go new file mode 100644 index 0000000..1bd906b --- /dev/null +++ b/analysis/links_test.go @@ -0,0 +1,80 @@ +package analysis + +import ( + "fmt" + "io/ioutil" + "net/http" + "net/url" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func makeResponse(ctype, body string) *http.Response { + u, _ := url.Parse("https://example.com/") + r := &http.Response{ + Header: make(http.Header), + Body: ioutil.NopCloser(strings.NewReader(body)), + Request: &http.Request{ + URL: u, + }, + } + r.Header.Set("Content-Type", ctype) + return r +} + +type testdata struct { + ctype string + body string + expectedLinks []string +} + +func (td *testdata) runTestCase() error { + links, err := GetLinks(makeResponse(td.ctype, td.body)) + if err != nil { + return fmt.Errorf("GetLinks() error: %v", err) + } + var linkStr []string + for _, l := range links { + linkStr = append(linkStr, l.URL.String()) + } + if diff := cmp.Diff(td.expectedLinks, linkStr); diff != "" { + return fmt.Errorf("unexpected result:\n%s", diff) + } + return nil +} + +var tests = []testdata{ + { + "text/html", + ` +<html><body> +<a href="/link1">link</a> +</body></html> +`, + []string{ + "https://example.com/link1", + }, + }, + { + "text/html", + ` +<html><head><style type="text/css"> +body { background: url('/link1'); } +</style></head> +<body></body></html> +`, + []string{ + "https://example.com/link1", + }, + }, +} + +func TestLinks(t *testing.T) { + for _, tt := range tests { + 
if err := tt.runTestCase(); err != nil { + t.Error(err) + } + } +} @@ -4,10 +4,10 @@ require ( github.com/PuerkitoBio/goquery v1.5.0 github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 - github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4 - github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf + github.com/andybalholm/cascadia v1.0.0 + github.com/golang/snappy v0.0.1 github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 - golang.org/x/text v0.0.0-20190829152558-3d0f7978add9 + golang.org/x/text v0.3.0 ) @@ -0,0 +1,39 @@ +github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk= +github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= +github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 h1:1H3FyRw7YsqIty9WHPOVEGJaFJ1sfGVZ3PPDUw3ob2w= +github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= +github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 
+github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 h1:7G9lvlxEu1ZPLqJnsRY1MuoBaf2Mg4qbtcxNRXKdzFs= +github.com/pborman/uuid v0.0.0-20171128162732-e53336930665/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34= +github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d h1:OgkXbz/O0zsJoaB+z6n/a3bNGCbCWhBPLfGr6qaBprM= +github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs= +golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.0.0-20190829152558-3d0f7978add9/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= |