aboutsummaryrefslogtreecommitdiff
path: root/analysis
diff options
context:
space:
mode:
author ale <ale@incal.net> 2014-12-20 10:56:40 +0000
committer ale <ale@incal.net> 2014-12-20 10:56:40 +0000
commit efe98903c17a9103d7830361d6ff6f98bb9e0faa (patch)
tree fb24eb56631af2c8deb2e617d85c5cf0f4206c17 /analysis
parent f0c14e5e36d640d0f6801691c69dffb69459fe10 (diff)
download crawl-efe98903c17a9103d7830361d6ff6f98bb9e0faa.tar.gz
crawl-efe98903c17a9103d7830361d6ff6f98bb9e0faa.zip
relax the CSS url() regexp
Diffstat (limited to 'analysis')
-rw-r--r-- analysis/links.go 15
1 file changed, 10 insertions, 5 deletions
diff --git a/analysis/links.go b/analysis/links.go
index 36244c0..22bcb80 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -14,7 +14,7 @@ import (
)
var (
- urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
+ urlcssRx = regexp.MustCompile(`.*:.*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
tag string
@@ -32,6 +32,9 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
+ // Use goquery to extract links from the parsed HTML
+ // contents (query patterns are described in the
+ // linkMatches table).
doc, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
return nil, err
@@ -44,6 +47,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
})
}
} else if strings.HasPrefix(ctype, "text/css") {
+ // Use a simple (and actually quite bad) regular
+ // expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, val[1])
@@ -51,7 +56,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
}
}
- // Uniquify and parse outbound links.
+ // Parse outbound links relative to the request URI, and
+ // return unique results.
var result []*url.URL
links := make(map[string]*url.URL)
for _, val := range outlinks {
@@ -59,9 +65,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
links[linkurl.String()] = linkurl
}
}
- for _, link := range links {
- result = append(result, link)
+ for _, u := range links {
+ result = append(result, u)
}
-
return result, nil
}