summary | refs | log | tree | commit | diff
path: root/searx/engines/google_images.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines/google_images.py')
-rw-r--r-- searx/engines/google_images.py 59
1 files changed, 29 insertions, 30 deletions
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 85963a16f..ca067e14f 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -2,41 +2,42 @@
Google (Images)
@website https://www.google.com
- @provide-api yes (https://developers.google.com/web-search/docs/),
- deprecated!
+ @provide-api yes (https://developers.google.com/custom-search/)
@using-api yes
- @results JSON
- @stable yes (but deprecated)
+ @results HTML chunk
+ @stable no
@parse url, title, img_src
"""
-from urllib import urlencode, unquote
+from urllib import urlencode
+from urlparse import parse_qs
from json import loads
+from lxml import html
# engine dependent config
categories = ['images']
paging = True
safesearch = True
-# search-url
-url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe={safesearch}&filter=off&{query}'
+search_url = 'https://www.google.com/search'\
+ '?{query}'\
+ '&tbm=isch'\
+ '&ijn=1'\
+ '&start={offset}'
# do search-request
def request(query, params):
- offset = (params['pageno'] - 1) * 8
-
- if params['safesearch'] == 0:
- safesearch = 'off'
- else:
- safesearch = 'on'
+ offset = (params['pageno'] - 1) * 100
params['url'] = search_url.format(query=urlencode({'q': query}),
offset=offset,
safesearch=safesearch)
+ if safesearch and params['safesearch']:
+ params['url'] += '&' + urlencode({'safe': 'active'})
+
return params
@@ -44,30 +45,28 @@ def request(query, params):
def response(resp):
results = []
- search_res = loads(resp.text)
-
- # return empty array if there are no results
- if not search_res.get('responseData', {}).get('results'):
- return []
+ dom = html.fromstring(resp.text)
# parse results
- for result in search_res['responseData']['results']:
- href = result['originalContextUrl']
- title = result['title']
- if 'url' not in result:
- continue
- thumbnail_src = result['tbUrl']
+ for result in dom.xpath('//div[@data-ved]'):
+ data_url = result.xpath('./a/@href')[0]
+ data_query = {k: v[0] for k, v in parse_qs(data_url.split('?', 1)[1]).iteritems()}
+
+ metadata = loads(result.xpath('./div[@class="rg_meta"]/text()')[0])
+
+ thumbnail_src = metadata['tu']
# http to https
thumbnail_src = thumbnail_src.replace("http://", "https://")
# append result
- results.append({'url': href,
- 'title': title,
- 'content': result['content'],
- 'thumbnail_src': thumbnail_src,
- 'img_src': unquote(result['url']),
+ results.append({'url': data_query['imgrefurl'],
+ 'title': metadata['pt'],
+ 'content': metadata['s'],
+ 'thumbnail_src': metadata['tu'],
+ 'img_src': data_query['imgurl'],
'template': 'images.html'})
# return results
+ print len(results)
return results