diff options
author | Thomas Pointhuber <thomas.pointhuber@gmx.at> | 2014-09-02 19:57:01 +0200 |
---|---|---|
committer | Thomas Pointhuber <thomas.pointhuber@gmx.at> | 2014-09-02 19:57:01 +0200 |
commit | 678a80f043d2f57f059236b574cc29fab4f70fe8 (patch) | |
tree | 2319089c4225c5cfdf0360287f53ec483c5af12d /searx/engines/startpage.py | |
parent | a46bbb40422564b5576b81c978fb734dbf45a9ce (diff) | |
download | searxng-678a80f043d2f57f059236b574cc29fab4f70fe8.tar.gz searxng-678a80f043d2f57f059236b574cc29fab4f70fe8.zip |
fix startpage engine and add comments
* add language support
* remove not required code
* improve google-ad detection (no false detection anymore, I hope)
* other improvements
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- | searx/engines/startpage.py | 74 |
1 files changed, 53 insertions, 21 deletions
```diff
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 8d44d05ab..2adbfb3e4 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -1,47 +1,79 @@
+## Startpage (Web)
+#
+# @website https://startpage.com
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo paging
+
 from urllib import urlencode
 from lxml import html
 from cgi import escape
+import re
+
+# engine dependent config
+categories = ['general']
+# there is a mechanism to block "bot" search (probably the parameter qid), require storing of qid's between mulitble search-calls
+#paging = False
+language_support = True
 
-base_url = None
-search_url = None
+# search-url
+base_url = 'https://startpage.com/'
+search_url = base_url + 'do/search'
 
-# TODO paging
-paging = False
-# TODO complete list of country mapping
-country_map = {'en_US': 'eng',
-               'en_UK': 'uk',
-               'nl_NL': 'ned'}
+# specific xpath variables
+# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
+# not ads: div[@class="result"] are the direct childs of div[@id="results"]
+results_xpath = '//div[@class="result"]'
+link_xpath = './/h3/a'
 
 
+# do search-request
 def request(query, params):
+    offset = (params['pageno'] - 1) * 10
     query = urlencode({'q': query})[2:]
+
     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query,
-                      'startat': (params['pageno'] - 1) * 10}  # offset
-    country = country_map.get(params['language'], 'eng')
-    params['cookies']['preferences'] = \
-        'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country)  # noqa
+                      'startat': offset}
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['data']['with_language'] = 'lang_' + params['language'].split('_')[0]
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
-    # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-    for result in dom.xpath('//div[@class="result"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         url = link.attrib.get('href')
-        if url.startswith('http://www.google.')\
-                or url.startswith('https://www.google.'):
-            continue
         title = escape(link.text_content())
 
-        content = ''
+        # block google-ad url's
+        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+            continue
+
         if result.xpath('./p[@class="desc"]'):
             content = escape(result.xpath('./p[@class="desc"]')[0].text_content())
+        else:
+            content = ''
 
-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
 
+    # return results
     return results
```