diff options
author | ale <ale@incal.net> | 2014-12-20 10:56:40 +0000 |
---|---|---|
committer | ale <ale@incal.net> | 2014-12-20 10:56:40 +0000 |
commit | efe98903c17a9103d7830361d6ff6f98bb9e0faa (patch) | |
tree | fb24eb56631af2c8deb2e617d85c5cf0f4206c17 /analysis | |
parent | f0c14e5e36d640d0f6801691c69dffb69459fe10 (diff) | |
download | crawl-efe98903c17a9103d7830361d6ff6f98bb9e0faa.tar.gz crawl-efe98903c17a9103d7830361d6ff6f98bb9e0faa.zip |
relax the CSS url() regexp
Diffstat (limited to 'analysis')
-rw-r--r-- | analysis/links.go | 15 |
1 file changed, 10 insertions, 5 deletions
```diff
diff --git a/analysis/links.go b/analysis/links.go
index 36244c0..22bcb80 100644
--- a/analysis/links.go
+++ b/analysis/links.go
@@ -14,7 +14,7 @@ import (
 )
 
 var (
-	urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
+	urlcssRx = regexp.MustCompile(`.*:.*url\(["']?([^'"\)]+)["']?\)`)
 
 	linkMatches = []struct {
 		tag  string
@@ -32,6 +32,9 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 
 	ctype := resp.Header.Get("Content-Type")
 	if strings.HasPrefix(ctype, "text/html") {
+		// Use goquery to extract links from the parsed HTML
+		// contents (query patterns are described in the
+		// linkMatches table).
 		doc, err := goquery.NewDocumentFromResponse(resp)
 		if err != nil {
 			return nil, err
@@ -44,6 +47,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 			})
 		}
 	} else if strings.HasPrefix(ctype, "text/css") {
+		// Use a simple (and actually quite bad) regular
+		// expression to extract "url()" links from CSS.
 		if data, err := ioutil.ReadAll(resp.Body); err == nil {
 			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
 				outlinks = append(outlinks, val[1])
@@ -51,7 +56,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 		}
 	}
 
-	// Uniquify and parse outbound links.
+	// Parse outbound links relative to the request URI, and
+	// return unique results.
 	var result []*url.URL
 	links := make(map[string]*url.URL)
 	for _, val := range outlinks {
@@ -59,9 +65,8 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 			links[linkurl.String()] = linkurl
 		}
 	}
-	for _, link := range links {
-		result = append(result, link)
+	for _, u := range links {
+		result = append(result, u)
 	}
-
 	return result, nil
 }
```