about summary refs log tree commit diff
diff options
context:
space:
mode:
author ale <ale@gramma> 2019-10-07 15:07:15 +0200
committer ale <ale@gramma> 2019-10-07 15:07:15 +0200
commit 6dc36ab41a50a22ccbc75c55e47326998c9f4bc0 (patch)
tree 65a668ea9507a38fa30b886ab2bf06e1621638a5
parent f4ef521da8711c8c4b5cc144fa275b212fe695e7 (diff)
download crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.tar.gz
crawl-6dc36ab41a50a22ccbc75c55e47326998c9f4bc0.zip
Parse links in inline style blocks
-rw-r--r-- analysis/links.go 31
-rw-r--r-- analysis/links_test.go 80
-rw-r--r-- go.mod 6
-rw-r--r-- go.sum 39
4 files changed, 139 insertions, 17 deletions
diff --git a/analysis/links.go b/analysis/links.go
index 398a9e0..521b8fe 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -4,6 +4,7 @@ package analysis
import (
"fmt"
+ "io"
"io/ioutil"
"net/http"
"regexp"
@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
- return extractLinksFromHTML(resp)
+ return extractLinksFromHTML(resp.Body, nil)
case strings.HasPrefix(ctype, "text/css"):
- return extractLinksFromCSS(resp)
+ return extractLinksFromCSS(resp.Body, nil)
default:
return nil
}
}
-func extractLinksFromHTML(resp *http.Response) []rawOutlink {
- var outlinks []rawOutlink
- // Use goquery to extract links from the parsed HTML
- // contents (query patterns are described in the
- // linkMatches table).
- doc, err := goquery.NewDocumentFromReader(resp.Body)
+func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use goquery to extract links from the parsed HTML contents
+ // (query patterns are described in the linkMatches table).
+ doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil
}
-
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
+
+ // Find the inline <style> sections and parse them separately as CSS.
+ doc.Find("style").Each(func(i int, s *goquery.Selection) {
+ outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
+ })
+
return outlinks
}
-func extractLinksFromCSS(resp *http.Response) []rawOutlink {
- // Use a simple (and actually quite bad) regular
- // expression to extract "url()" links from CSS.
- var outlinks []rawOutlink
- if data, err := ioutil.ReadAll(resp.Body); err == nil {
+func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
+ // Use a simple (and actually quite bad) regular expression to
+ // extract "url()" and "@import" links from CSS.
+ if data, err := ioutil.ReadAll(r); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
diff --git a/analysis/links_test.go b/analysis/links_test.go
new file mode 100644
index 0000000..1bd906b
--- /dev/null
+++ b/analysis/links_test.go
@@ -0,0 +1,80 @@
+package analysis
+
+import (
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "net/url"
+ "strings"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+)
+
+func makeResponse(ctype, body string) *http.Response {
+ u, _ := url.Parse("https://example.com/")
+ r := &http.Response{
+ Header: make(http.Header),
+ Body: ioutil.NopCloser(strings.NewReader(body)),
+ Request: &http.Request{
+ URL: u,
+ },
+ }
+ r.Header.Set("Content-Type", ctype)
+ return r
+}
+
+type testdata struct {
+ ctype string
+ body string
+ expectedLinks []string
+}
+
+func (td *testdata) runTestCase() error {
+ links, err := GetLinks(makeResponse(td.ctype, td.body))
+ if err != nil {
+ return fmt.Errorf("GetLinks() error: %v", err)
+ }
+ var linkStr []string
+ for _, l := range links {
+ linkStr = append(linkStr, l.URL.String())
+ }
+ if diff := cmp.Diff(td.expectedLinks, linkStr); diff != "" {
+ return fmt.Errorf("unexpected result:\n%s", diff)
+ }
+ return nil
+}
+
+var tests = []testdata{
+ {
+ "text/html",
+ `
+<html><body>
+<a href="/link1">link</a>
+</body></html>
+`,
+ []string{
+ "https://example.com/link1",
+ },
+ },
+ {
+ "text/html",
+ `
+<html><head><style type="text/css">
+body { background: url('/link1'); }
+</style></head>
+<body></body></html>
+`,
+ []string{
+ "https://example.com/link1",
+ },
+ },
+}
+
+func TestLinks(t *testing.T) {
+ for _, tt := range tests {
+ if err := tt.runTestCase(); err != nil {
+ t.Error(err)
+ }
+ }
+}
diff --git a/go.mod b/go.mod
index 792b792..3a6c3df 100644
--- a/go.mod
+++ b/go.mod
@@ -4,10 +4,10 @@ require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578
- github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4
- github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf
+ github.com/andybalholm/cascadia v1.0.0
+ github.com/golang/snappy v0.0.1
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17
- golang.org/x/text v0.0.0-20190829152558-3d0f7978add9
+ golang.org/x/text v0.3.0
)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..74b5bff
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,39 @@
+github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
+github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
+github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 h1:1H3FyRw7YsqIty9WHPOVEGJaFJ1sfGVZ3PPDUw3ob2w=
+github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
+github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
+github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
+github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
+github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
+github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
+github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
+github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 h1:7G9lvlxEu1ZPLqJnsRY1MuoBaf2Mg4qbtcxNRXKdzFs=
+github.com/pborman/uuid v0.0.0-20171128162732-e53336930665/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34=
+github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d h1:OgkXbz/O0zsJoaB+z6n/a3bNGCbCWhBPLfGr6qaBprM=
+github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs=
+golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/text v0.0.0-20190829152558-3d0f7978add9/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
+gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
+gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=