diff options
author | Noémi Ványi <kvch@users.noreply.github.com> | 2019-01-04 22:27:05 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-01-04 22:27:05 +0100 |
commit | 1ea56576dc4f2fb65b9028c900c2e4fb063a8b9a (patch) | |
tree | e4d0e50c0494ca49a41351edea532c00289386a3 /searx/engines | |
parent | 0e493db2fb06fd78ac05f82abd8b1cc86089684f (diff) | |
parent | 899ba5d6dee82faacb572b4d9bc4c58570628531 (diff) | |
download | searxng-1ea56576dc4f2fb65b9028c900c2e4fb063a8b9a.tar.gz searxng-1ea56576dc4f2fb65b9028c900c2e4fb063a8b9a.zip |
Merge branch 'master' into devel_google_videos
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/bing_images.py | 4 | ||||
-rw-r--r-- | searx/engines/findx.py | 115 | ||||
-rw-r--r-- | searx/engines/startpage.py | 11 |
3 files changed, 5 insertions, 125 deletions
```diff
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 66e14c01f..876011f1d 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -88,9 +88,7 @@ def response(resp):
 
         url = json_data.get('purl')
         img_src = json_data.get('murl')
-
-        thumb_json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad')))
-        thumbnail = thumb_json_data.get('turl')
+        thumbnail = json_data.get('turl')
 
         # append result
         results.append({'template': 'images.html',
diff --git a/searx/engines/findx.py b/searx/engines/findx.py
deleted file mode 100644
index 87c9d503c..000000000
--- a/searx/engines/findx.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""
-FindX (General, Images, Videos)
-
-@website https://www.findx.com
-@provide-api no
-@using-api no
-@results HTML
-@stable no
-@parse url, title, content, embedded, img_src, thumbnail_src
-"""
-
-from dateutil import parser
-from json import loads
-import re
-
-from lxml import html
-
-from searx import logger
-from searx.engines.xpath import extract_text
-from searx.engines.youtube_noapi import base_youtube_url, embedded_url
-from searx.url_utils import urlencode
-
-
-paging = True
-results_xpath = '//script[@id="initial-state"]'
-search_url = 'https://www.findx.com/{category}?{q}'
-type_map = {
-    'none': 'web',
-    'general': 'web',
-    'images': 'images',
-    'videos': 'videos',
-}
-
-
-def request(query, params):
-    params['url'] = search_url.format(
-        category=type_map[params['category']],
-        q=urlencode({
-            'q': query,
-            'page': params['pageno']
-        })
-    )
-    return params
-
-
-def response(resp):
-    dom = html.fromstring(resp.text)
-    results_raw_json = dom.xpath(results_xpath)
-    results_json = loads(extract_text(results_raw_json))
-
-    if len(results_json['web']['results']) > 0:
-        return _general_results(results_json['web']['results']['webSearch']['results'])
-
-    if len(results_json['images']['results']) > 0:
-        return _images_results(results_json['images']['results'])
-
-    if len(results_json['video']['results']) > 0:
-        return _videos_results(results_json['video']['results'])
-
-    return []
-
-
-def _general_results(general_results):
-    results = []
-    for result in general_results:
-        results.append({
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['sum'],
-        })
-    return results
-
-
-def _images_results(image_results):
-    results = []
-    for result in image_results:
-        results.append({
-            'url': result['sourceURL'],
-            'title': result['title'],
-            'content': result['source'],
-            'thumbnail_src': _extract_url(result['assets']['thumb']['url']),
-            'img_src': _extract_url(result['assets']['file']['url']),
-            'template': 'images.html',
-        })
-    return results
-
-
-def _videos_results(video_results):
-    results = []
-    for result in video_results:
-        if not result['kind'].startswith('youtube'):
-            logger.warn('Unknown video kind in findx: {}'.format(result['kind']))
-            continue
-
-        description = result['snippet']['description']
-        if len(description) > 300:
-            description = description[:300] + '...'
-
-        results.append({
-            'url': base_youtube_url + result['id'],
-            'title': result['snippet']['title'],
-            'content': description,
-            'thumbnail': _extract_url(result['snippet']['thumbnails']['default']['url']),
-            'publishedDate': parser.parse(result['snippet']['publishedAt']),
-            'embedded': embedded_url.format(videoid=result['id']),
-            'template': 'videos.html',
-        })
-    return results
-
-
-def _extract_url(url):
-    matching = re.search('(/https?://[^)]+)', url)
-    if matching:
-        return matching.group(0)[1:]
-    return ''
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 3e067597e..55efdc884 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -32,8 +32,9 @@ search_url = base_url + 'do/search'
 # specific xpath variables
 # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
 # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//div[@class="result"]'
+results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
 link_xpath = './/h3/a'
+content_xpath = './p[@class="search-item__body"]'
 
 
 # do search-request
@@ -73,14 +74,10 @@ def response(resp):
         if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
             continue
 
-        # block ixquick search url's
-        if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
-            continue
-
         title = extract_text(link)
 
-        if result.xpath('./p[@class="desc clk"]'):
-            content = extract_text(result.xpath('./p[@class="desc clk"]'))
+        if result.xpath(content_xpath):
+            content = extract_text(result.xpath(content_xpath))
         else:
             content = ''
 
```