diff options
Diffstat (limited to 'vendor')
42 files changed, 6732 insertions, 795 deletions
diff --git a/vendor/github.com/PuerkitoBio/goquery/README.md b/vendor/github.com/PuerkitoBio/goquery/README.md index 84f9af3..c9db736 100644 --- a/vendor/github.com/PuerkitoBio/goquery/README.md +++ b/vendor/github.com/PuerkitoBio/goquery/README.md @@ -138,9 +138,11 @@ func main() { - [Goq][goq], an HTML deserialization and scraping library based on goquery and struct tags. - [andybalholm/cascadia][cascadia], the CSS selector library used by goquery. - [suntong/cascadia][cascadiacli], a command-line interface to the cascadia CSS selector library, useful to test selectors. -- [asciimoo/colly](https://github.com/asciimoo/colly), a lightning fast and elegant Scraping Framework +- [gocolly/colly](https://github.com/gocolly/colly), a lightning fast and elegant Scraping Framework - [gnulnx/goperf](https://github.com/gnulnx/goperf), a website performance test tool that also fetches static assets. - [MontFerret/ferret](https://github.com/MontFerret/ferret), declarative web scraping. +- [tacusci/berrycms](https://github.com/tacusci/berrycms), a modern simple to use CMS with easy to write plugins +- [Dataflow kit](https://github.com/slotix/dataflowkit), Web Scraping framework for Gophers. ## Support diff --git a/vendor/github.com/andybalholm/cascadia/parser.go b/vendor/github.com/andybalholm/cascadia/parser.go index 495db9c..bebf0af 100644 --- a/vendor/github.com/andybalholm/cascadia/parser.go +++ b/vendor/github.com/andybalholm/cascadia/parser.go @@ -7,14 +7,16 @@ import ( "regexp" "strconv" "strings" - - "golang.org/x/net/html" ) // a parser for CSS selectors type parser struct { s string // the source text i int // the current position + + // if `false`, parsing a pseudo-element + // returns an error. + acceptPseudoElements bool } // parseEscape parses a backslash escape. @@ -56,6 +58,26 @@ func (p *parser) parseEscape() (result string, err error) { return result, nil } +// toLowerASCII returns s with all ASCII capital letters lowercased. +func toLowerASCII(s string) string { + var b []byte + for i := 0; i < len(s); i++ { + if c := s[i]; 'A' <= c && c <= 'Z' { + if b == nil { + b = make([]byte, len(s)) + copy(b, s) + } + b[i] = s[i] + ('a' - 'A') + } + } + + if b == nil { + return s + } + + return string(b) +} + func hexDigit(c byte) bool { return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' } @@ -280,92 +302,92 @@ func (p *parser) consumeClosingParenthesis() bool { } // parseTypeSelector parses a type selector (one that matches by tag name). -func (p *parser) parseTypeSelector() (result Selector, err error) { +func (p *parser) parseTypeSelector() (result tagSelector, err error) { tag, err := p.parseIdentifier() if err != nil { - return nil, err + return } - - return typeSelector(tag), nil + return tagSelector{tag: toLowerASCII(tag)}, nil } // parseIDSelector parses a selector that matches by id attribute. -func (p *parser) parseIDSelector() (Selector, error) { +func (p *parser) parseIDSelector() (idSelector, error) { if p.i >= len(p.s) { - return nil, fmt.Errorf("expected id selector (#id), found EOF instead") + return idSelector{}, fmt.Errorf("expected id selector (#id), found EOF instead") } if p.s[p.i] != '#' { - return nil, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i]) + return idSelector{}, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i]) } p.i++ id, err := p.parseName() if err != nil { - return nil, err + return idSelector{}, err } - return attributeEqualsSelector("id", id), nil + return idSelector{id: id}, nil } // parseClassSelector parses a selector that matches by class attribute. -func (p *parser) parseClassSelector() (Selector, error) { +func (p *parser) parseClassSelector() (classSelector, error) { if p.i >= len(p.s) { - return nil, fmt.Errorf("expected class selector (.class), found EOF instead") + return classSelector{}, fmt.Errorf("expected class selector (.class), found EOF instead") } if p.s[p.i] != '.' { - return nil, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i]) + return classSelector{}, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i]) } p.i++ class, err := p.parseIdentifier() if err != nil { - return nil, err + return classSelector{}, err } - return attributeIncludesSelector("class", class), nil + return classSelector{class: class}, nil } // parseAttributeSelector parses a selector that matches by attribute value. -func (p *parser) parseAttributeSelector() (Selector, error) { +func (p *parser) parseAttributeSelector() (attrSelector, error) { if p.i >= len(p.s) { - return nil, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead") + return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead") } if p.s[p.i] != '[' { - return nil, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i]) + return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i]) } p.i++ p.skipWhitespace() key, err := p.parseIdentifier() if err != nil { - return nil, err + return attrSelector{}, err } + key = toLowerASCII(key) p.skipWhitespace() if p.i >= len(p.s) { - return nil, errors.New("unexpected EOF in attribute selector") + return attrSelector{}, errors.New("unexpected EOF in attribute selector") } if p.s[p.i] == ']' { p.i++ - return attributeExistsSelector(key), nil + return attrSelector{key: key, operation: ""}, nil } if p.i+2 >= len(p.s) { - return nil, errors.New("unexpected EOF in attribute selector") + return attrSelector{}, errors.New("unexpected EOF in attribute selector") } op := p.s[p.i : p.i+2] if op[0] == '=' { op = "=" } else if op[1] != '=' { - return nil, fmt.Errorf(`expected equality operator, found "%s" instead`, op) + return attrSelector{}, fmt.Errorf(`expected equality operator, found "%s" instead`, op) } p.i += len(op) p.skipWhitespace() if p.i >= len(p.s) { - return nil, errors.New("unexpected EOF in attribute selector") + return attrSelector{}, errors.New("unexpected EOF in attribute selector") } var val string var rx *regexp.Regexp @@ -380,88 +402,84 @@ func (p *parser) parseAttributeSelector() (Selector, error) { } } if err != nil { - return nil, err + return attrSelector{}, err } p.skipWhitespace() if p.i >= len(p.s) { - return nil, errors.New("unexpected EOF in attribute selector") + return attrSelector{}, errors.New("unexpected EOF in attribute selector") } if p.s[p.i] != ']' { - return nil, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i]) + return attrSelector{}, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i]) } p.i++ switch op { - case "=": - return attributeEqualsSelector(key, val), nil - case "!=": - return attributeNotEqualSelector(key, val), nil - case "~=": - return attributeIncludesSelector(key, val), nil - case "|=": - return attributeDashmatchSelector(key, val), nil - case "^=": - return attributePrefixSelector(key, val), nil - case "$=": - return attributeSuffixSelector(key, val), nil - case "*=": - return attributeSubstringSelector(key, val), nil - case "#=": - return attributeRegexSelector(key, rx), nil - } - - return nil, fmt.Errorf("attribute operator %q is not supported", op) + case "=", "!=", "~=", "|=", "^=", "$=", "*=", "#=": + return attrSelector{key: key, val: val, operation: op, regexp: rx}, nil + default: + return attrSelector{}, fmt.Errorf("attribute operator %q is not supported", op) + } } var errExpectedParenthesis = errors.New("expected '(' but didn't find it") var errExpectedClosingParenthesis = errors.New("expected ')' but didn't find it") var errUnmatchedParenthesis = errors.New("unmatched '('") -// parsePseudoclassSelector parses a pseudoclass selector like :not(p). -func (p *parser) parsePseudoclassSelector() (Selector, error) { +// parsePseudoclassSelector parses a pseudoclass selector like :not(p) or a pseudo-element +// For backwards compatibility, both ':' and '::' prefix are allowed for pseudo-elements. +// https://drafts.csswg.org/selectors-3/#pseudo-elements +// Returning a nil `Sel` (and a nil `error`) means we found a pseudo-element. +func (p *parser) parsePseudoclassSelector() (out Sel, pseudoElement string, err error) { if p.i >= len(p.s) { - return nil, fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead") + return nil, "", fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead") } if p.s[p.i] != ':' { - return nil, fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i]) + return nil, "", fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i]) } p.i++ + var mustBePseudoElement bool + if p.i >= len(p.s) { + return nil, "", fmt.Errorf("got empty pseudoclass (or pseudoelement)") + } + if p.s[p.i] == ':' { // we found a pseudo-element + mustBePseudoElement = true + p.i++ + } + name, err := p.parseIdentifier() if err != nil { - return nil, err + return } name = toLowerASCII(name) + if mustBePseudoElement && (name != "after" && name != "backdrop" && name != "before" && + name != "cue" && name != "first-letter" && name != "first-line" && name != "grammar-error" && + name != "marker" && name != "placeholder" && name != "selection" && name != "spelling-error") { + return out, "", fmt.Errorf("unknown pseudoelement :%s", name) + } switch name { case "not", "has", "haschild": if !p.consumeParenthesis() { - return nil, errExpectedParenthesis + return out, "", errExpectedParenthesis } sel, parseErr := p.parseSelectorGroup() if parseErr != nil { - return nil, parseErr + return out, "", parseErr } if !p.consumeClosingParenthesis() { - return nil, errExpectedClosingParenthesis + return out, "", errExpectedClosingParenthesis } - switch name { - case "not": - return negatedSelector(sel), nil - case "has": - return hasDescendantSelector(sel), nil - case "haschild": - return hasChildSelector(sel), nil - } + out = relativePseudoClassSelector{name: name, match: sel} case "contains", "containsown": if !p.consumeParenthesis() { - return nil, errExpectedParenthesis + return out, "", errExpectedParenthesis } if p.i == len(p.s) { - return nil, errUnmatchedParenthesis + return out, "", errUnmatchedParenthesis } var val string switch p.s[p.i] { @@ -471,95 +489,75 @@ func (p *parser) parsePseudoclassSelector() (Selector, error) { val, err = p.parseIdentifier() } if err != nil { - return nil, err + return out, "", err } val = strings.ToLower(val) p.skipWhitespace() if p.i >= len(p.s) { - return nil, errors.New("unexpected EOF in pseudo selector") + return out, "", errors.New("unexpected EOF in pseudo selector") } if !p.consumeClosingParenthesis() { - return nil, errExpectedClosingParenthesis + return out, "", errExpectedClosingParenthesis } - switch name { - case "contains": - return textSubstrSelector(val), nil - case "containsown": - return ownTextSubstrSelector(val), nil - } + out = containsPseudoClassSelector{own: name == "containsown", value: val} case "matches", "matchesown": if !p.consumeParenthesis() { - return nil, errExpectedParenthesis + return out, "", errExpectedParenthesis } rx, err := p.parseRegex() if err != nil { - return nil, err + return out, "", err } if p.i >= len(p.s) { - return nil, errors.New("unexpected EOF in pseudo selector") + return out, "", errors.New("unexpected EOF in pseudo selector") } if !p.consumeClosingParenthesis() { - return nil, errExpectedClosingParenthesis + return out, "", errExpectedClosingParenthesis } - switch name { - case "matches": - return textRegexSelector(rx), nil - case "matchesown": - return ownTextRegexSelector(rx), nil - } + out = regexpPseudoClassSelector{own: name == "matchesown", regexp: rx} case "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type": if !p.consumeParenthesis() { - return nil, errExpectedParenthesis + return out, "", errExpectedParenthesis } a, b, err := p.parseNth() if err != nil { - return nil, err + return out, "", err } if !p.consumeClosingParenthesis() { - return nil, errExpectedClosingParenthesis - } - if a == 0 { - switch name { - case "nth-child": - return simpleNthChildSelector(b, false), nil - case "nth-of-type": - return simpleNthChildSelector(b, true), nil - case "nth-last-child": - return simpleNthLastChildSelector(b, false), nil - case "nth-last-of-type": - return simpleNthLastChildSelector(b, true), nil - } + return out, "", errExpectedClosingParenthesis } - return nthChildSelector(a, b, - name == "nth-last-child" || name == "nth-last-of-type", - name == "nth-of-type" || name == "nth-last-of-type"), - nil + last := name == "nth-last-child" || name == "nth-last-of-type" + ofType := name == "nth-of-type" || name == "nth-last-of-type" + out = nthPseudoClassSelector{a: a, b: b, last: last, ofType: ofType} case "first-child": - return simpleNthChildSelector(1, false), nil + out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: false} case "last-child": - return simpleNthLastChildSelector(1, false), nil + out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: true} case "first-of-type": - return simpleNthChildSelector(1, true), nil + out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: false} case "last-of-type": - return simpleNthLastChildSelector(1, true), nil + out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: true} case "only-child": - return onlyChildSelector(false), nil + out = onlyChildPseudoClassSelector{ofType: false} case "only-of-type": - return onlyChildSelector(true), nil + out = onlyChildPseudoClassSelector{ofType: true} case "input": - return inputSelector, nil + out = inputPseudoClassSelector{} case "empty": - return emptyElementSelector, nil + out = emptyElementPseudoClassSelector{} case "root": - return rootSelector, nil + out = rootPseudoClassSelector{} + case "after", "backdrop", "before", "cue", "first-letter", "first-line", "grammar-error", "marker", "placeholder", "selection", "spelling-error": + return nil, name, nil + default: + return out, "", fmt.Errorf("unknown pseudoclass or pseudoelement :%s", name) } - - return nil, fmt.Errorf("unknown pseudoclass :%s", name) + return } // parseInteger parses a decimal integer. @@ -705,8 +703,8 @@ invalid: // parseSimpleSelectorSequence parses a selector sequence that applies to // a single element. -func (p *parser) parseSimpleSelectorSequence() (Selector, error) { - var result Selector +func (p *parser) parseSimpleSelectorSequence() (Sel, error) { + var selectors []Sel if p.i >= len(p.s) { return nil, errors.New("expected selector, found EOF instead") @@ -723,13 +721,17 @@ func (p *parser) parseSimpleSelectorSequence() (Selector, error) { if err != nil { return nil, err } - result = r + selectors = append(selectors, r) } + var pseudoElement string loop: for p.i < len(p.s) { - var ns Selector - var err error + var ( + ns Sel + newPseudoElement string + err error + ) switch p.s[p.i] { case '#': ns, err = p.parseIDSelector() @@ -738,44 +740,57 @@ loop: case '[': ns, err = p.parseAttributeSelector() case ':': - ns, err = p.parsePseudoclassSelector() + ns, newPseudoElement, err = p.parsePseudoclassSelector() default: break loop } if err != nil { return nil, err } - if result == nil { - result = ns + // From https://drafts.csswg.org/selectors-3/#pseudo-elements : + // "Only one pseudo-element may appear per selector, and if present + // it must appear after the sequence of simple selectors that + // represents the subjects of the selector."" + if ns == nil { // we found a pseudo-element + if pseudoElement != "" { + return nil, fmt.Errorf("only one pseudo-element is accepted per selector, got %s and %s", pseudoElement, newPseudoElement) + } + if !p.acceptPseudoElements { + return nil, fmt.Errorf("pseudo-element %s found, but pseudo-elements support is disabled", newPseudoElement) + } + pseudoElement = newPseudoElement } else { - result = intersectionSelector(result, ns) + if pseudoElement != "" { + return nil, fmt.Errorf("pseudo-element %s must be at the end of selector", pseudoElement) + } + selectors = append(selectors, ns) } - } - if result == nil { - result = func(n *html.Node) bool { - return n.Type == html.ElementNode - } } - - return result, nil + if len(selectors) == 1 && pseudoElement == "" { // no need wrap the selectors in compoundSelector + return selectors[0], nil + } + return compoundSelector{selectors: selectors, pseudoElement: pseudoElement}, nil } // parseSelector parses a selector that may include combinators. -func (p *parser) parseSelector() (result Selector, err error) { +func (p *parser) parseSelector() (Sel, error) { p.skipWhitespace() - result, err = p.parseSimpleSelectorSequence() + result, err := p.parseSimpleSelectorSequence() if err != nil { - return + return nil, err } for { - var combinator byte + var ( + combinator byte + c Sel + ) if p.skipWhitespace() { combinator = ' ' } if p.i >= len(p.s) { - return + return result, nil } switch p.s[p.i] { @@ -785,51 +800,39 @@ func (p *parser) parseSelector() (result Selector, err error) { p.skipWhitespace() case ',', ')': // These characters can't begin a selector, but they can legally occur after one. - return + return result, nil } if combinator == 0 { - return + return result, nil } - c, err := p.parseSimpleSelectorSequence() + c, err = p.parseSimpleSelectorSequence() if err != nil { return nil, err } - - switch combinator { - case ' ': - result = descendantSelector(result, c) - case '>': - result = childSelector(result, c) - case '+': - result = siblingSelector(result, c, true) - case '~': - result = siblingSelector(result, c, false) - } + result = combinedSelector{first: result, combinator: combinator, second: c} } - - panic("unreachable") } // parseSelectorGroup parses a group of selectors, separated by commas. -func (p *parser) parseSelectorGroup() (result Selector, err error) { - result, err = p.parseSelector() +func (p *parser) parseSelectorGroup() (SelectorGroup, error) { + current, err := p.parseSelector() if err != nil { - return + return nil, err } + result := SelectorGroup{current} for p.i < len(p.s) { if p.s[p.i] != ',' { - return result, nil + break } p.i++ c, err := p.parseSelector() if err != nil { return nil, err } - result = unionSelector(result, c) + result = append(result, c) } - - return + return result, nil } diff --git a/vendor/github.com/andybalholm/cascadia/selector.go b/vendor/github.com/andybalholm/cascadia/selector.go index 9fb05cc..e2a6dc4 100644 --- a/vendor/github.com/andybalholm/cascadia/selector.go +++ b/vendor/github.com/andybalholm/cascadia/selector.go @@ -9,36 +9,60 @@ import ( "golang.org/x/net/html" ) -// the Selector type, and functions for creating them +// Matcher is the interface for basic selector functionality. +// Match returns whether a selector matches n. +type Matcher interface { + Match(n *html.Node) bool +} -// A Selector is a function which tells whether a node matches or not. -type Selector func(*html.Node) bool +// Sel is the interface for all the functionality provided by selectors. +type Sel interface { + Matcher + Specificity() Specificity -// hasChildMatch returns whether n has any child that matches a. -func hasChildMatch(n *html.Node, a Selector) bool { - for c := n.FirstChild; c != nil; c = c.NextSibling { - if a(c) { - return true - } + // Returns a CSS input compiling to this selector. + String() string + + // Returns a pseudo-element, or an empty string. + PseudoElement() string +} + +// Parse parses a selector. Use `ParseWithPseudoElement` +// if you need support for pseudo-elements. +func Parse(sel string) (Sel, error) { + p := &parser{s: sel} + compiled, err := p.parseSelector() + if err != nil { + return nil, err } - return false + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil } -// hasDescendantMatch performs a depth-first search of n's descendants, -// testing whether any of them match a. It returns true as soon as a match is -// found, or false if no match is found. -func hasDescendantMatch(n *html.Node, a Selector) bool { - for c := n.FirstChild; c != nil; c = c.NextSibling { - if a(c) || (c.Type == html.ElementNode && hasDescendantMatch(c, a)) { - return true - } +// ParseWithPseudoElement parses a single selector, +// with support for pseudo-element. +func ParseWithPseudoElement(sel string) (Sel, error) { + p := &parser{s: sel, acceptPseudoElements: true} + compiled, err := p.parseSelector() + if err != nil { + return nil, err } - return false + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil } -// Compile parses a selector and returns, if successful, a Selector object -// that can be used to match against html.Node objects. -func Compile(sel string) (Selector, error) { +// ParseGroup parses a selector, or a group of selectors separated by commas. +// Use `ParseGroupWithPseudoElements` +// if you need support for pseudo-elements. +func ParseGroup(sel string) (SelectorGroup, error) { p := &parser{s: sel} compiled, err := p.parseSelectorGroup() if err != nil { @@ -52,6 +76,39 @@ func Compile(sel string) (Selector, error) { return compiled, nil } +// ParseGroupWithPseudoElements parses a selector, or a group of selectors separated by commas. +// It supports pseudo-elements. +func ParseGroupWithPseudoElements(sel string) (SelectorGroup, error) { + p := &parser{s: sel, acceptPseudoElements: true} + compiled, err := p.parseSelectorGroup() + if err != nil { + return nil, err + } + + if p.i < len(sel) { + return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i) + } + + return compiled, nil +} + +// A Selector is a function which tells whether a node matches or not. +// +// This type is maintained for compatibility; I recommend using the newer and +// more idiomatic interfaces Sel and Matcher. +type Selector func(*html.Node) bool + +// Compile parses a selector and returns, if successful, a Selector object +// that can be used to match against html.Node objects. +func Compile(sel string) (Selector, error) { + compiled, err := ParseGroup(sel) + if err != nil { + return nil, err + } + + return Selector(compiled.Match), nil +} + // MustCompile is like Compile, but panics instead of returning an error. func MustCompile(sel string) Selector { compiled, err := Compile(sel) @@ -79,6 +136,23 @@ func (s Selector) matchAllInto(n *html.Node, storage []*html.Node) []*html.Node return storage } +func queryInto(n *html.Node, m Matcher, storage []*html.Node) []*html.Node { + for child := n.FirstChild; child != nil; child = child.NextSibling { + if m.Match(child) { + storage = append(storage, child) + } + storage = queryInto(child, m, storage) + } + + return storage +} + +// QueryAll returns a slice of all the nodes that match m, from the descendants +// of n. +func QueryAll(n *html.Node, m Matcher) []*html.Node { + return queryInto(n, m, nil) +} + // Match returns true if the node matches the selector. func (s Selector) Match(n *html.Node) bool { return s(n) @@ -99,6 +173,21 @@ func (s Selector) MatchFirst(n *html.Node) *html.Node { return nil } +// Query returns the first node that matches m, from the descendants of n. +// If none matches, it returns nil. +func Query(n *html.Node, m Matcher) *html.Node { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if m.Match(c) { + return c + } + if matched := Query(c, m); matched != nil { + return matched + } + } + + return nil +} + // Filter returns the nodes in nodes that match the selector. func (s Selector) Filter(nodes []*html.Node) (result []*html.Node) { for _, n := range nodes { @@ -109,106 +198,148 @@ func (s Selector) Filter(nodes []*html.Node) (result []*html.Node) { return result |