author    | Adam Tauber <asciimoo@gmail.com> | 2016-02-17 17:07:19 +0100
committer | Adam Tauber <asciimoo@gmail.com> | 2016-02-17 17:11:51 +0100
commit    | d06178139f141ae5c6e1908ca70de37371d3572d
tree      | 32fdcc6a06caeb0b3bb4c4a049315b9ef0d96130 /searx/engines/wolframalpha_noapi.py
parent    | 4e5af8d87bc3602fcdb263ad2e1595be91df95c9
[fix] wolframalpha page changes
related issues: #508 #509
Diffstat (limited to 'searx/engines/wolframalpha_noapi.py')
-rw-r--r-- | searx/engines/wolframalpha_noapi.py | 118
1 file changed, 62 insertions, 56 deletions
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 291fee04d..bffbbe466 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -8,79 +8,85 @@
 # @stable no
 # @parse answer
 
-from re import search, sub
+from cgi import escape
 from json import loads
+from time import time
 from urllib import urlencode
-from lxml import html
-import HTMLParser
+
+from searx.poolrequests import get as http_get
 
 # search-url
-url = 'http://www.wolframalpha.com/'
+url = 'https://www.wolframalpha.com/'
 search_url = url + 'input/?{query}'
+search_url = url + 'input/json.jsp'\
+    '?async=true'\
+    '&banners=raw'\
+    '&debuggingdata=false'\
+    '&format=image,plaintext,imagemap,minput,moutput'\
+    '&formattimeout=2'\
+    '&{query}'\
+    '&output=JSON'\
+    '&parsetimeout=2'\
+    '&proxycode={token}'\
+    '&scantimeout=0.5'\
+    '&sponsorcategories=true'\
+    '&statemethod=deploybutton'
+
 
 # xpath variables
 scripts_xpath = '//script'
 title_xpath = '//title'
 failure_xpath = '//p[attribute::class="pfail"]'
+token = {'value': '',
+         'last_updated': None}
+
+
+# seems, wolframalpha resets its token in every hour
+def obtain_token():
+    update_time = time() - (time() % 3600)
+    token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+    token['value'] = loads(token_response.text)['code']
+    token['last_updated'] = update_time
+    return token
+
+
+obtain_token()
 
 
 # do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'i': query}))
+    # obtain token if last update was more than an hour
+    if time() - token['last_updated'] > 3600:
+        obtain_token()
+    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
+    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
 
     return params
 
 
 # get response from search-request
 def response(resp):
-    results = []
-    line = None
-
-    dom = html.fromstring(resp.text)
-    scripts = dom.xpath(scripts_xpath)
-
-    # the answer is inside a js function
-    # answer can be located in different 'pods', although by default it should be in pod_0200
-    possible_locations = ['pod_0200\.push\((.*)',
-                          'pod_0100\.push\((.*)']
-
-    # failed result
-    if dom.xpath(failure_xpath):
-        return results
-
-    # get line that matches the pattern
-    for pattern in possible_locations:
-        for script in scripts:
-            try:
-                line = search(pattern, script.text_content()).group(1)
-                break
-            except AttributeError:
-                continue
-        if line:
-            break
-
-    if line:
-        # extract answer from json
-        answer = line[line.find('{'):line.rfind('}') + 1]
-        try:
-            answer = loads(answer)
-        except Exception:
-            answer = loads(answer.encode('unicode-escape'))
-        answer = answer['stringified']
-
-        # clean plaintext answer
-        h = HTMLParser.HTMLParser()
-        answer = h.unescape(answer.decode('unicode-escape'))
-        answer = sub(r'\\', '', answer)
-
-        results.append({'answer': answer})
-
-    # user input is in first part of title
-    title = dom.xpath(title_xpath)[0].text.encode('utf-8')
-    result_url = request(title[:-16], {})['url']
-
-    # append result
-    results.append({'url': result_url,
-                    'title': title.decode('utf-8')})
-
-    return results
+    resp_json = loads(resp.text)
+
+    if not resp_json['queryresult']['success']:
+        return []
+
+    # TODO handle resp_json['queryresult']['assumptions']
+    result_chunks = []
+    for pod in resp_json['queryresult']['pods']:
+        pod_title = pod.get('title', '')
+        if 'subpods' not in pod:
+            continue
+        for subpod in pod['subpods']:
+            if 'img' in subpod:
+                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
+                                     .format(escape(pod_title or subpod['img']['alt']),
+                                             escape(subpod['img']['src']),
+                                             escape(subpod['img']['alt'])))
+
+    if not result_chunks:
+        return []
+
+    return [{'url': resp.request.headers['Referer'],
+             'title': 'Wolframalpha',
+             'content': ''.join(result_chunks)}]
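A detail worth calling out in the new engine is the hourly token cache: the proxycode token is fetched once, stamped with the start of the current hour, and only re-fetched when it is more than an hour old. Below is a minimal standalone sketch of that caching pattern under stated assumptions; `fetch_token_from_upstream()` is a hypothetical placeholder for the real HTTP call the engine makes to https://www.wolframalpha.com/input/api/v1/code via searx.poolrequests.get.

```python
from time import time

# Hypothetical stand-in for the engine's real token fetch, which requests
# https://www.wolframalpha.com/input/api/v1/code and reads the 'code' field
# from the JSON response.
def fetch_token_from_upstream():
    return 'dummy-token'


# Module-level cache, as in the diff: the token value plus the hour in which
# it was obtained (time() rounded down to the start of the hour).
token = {'value': '', 'last_updated': None}


def obtain_token():
    # Wolfram Alpha seems to rotate the token every hour, so remember the
    # start of the current hour rather than the exact fetch time.
    update_time = time() - (time() % 3600)
    token['value'] = fetch_token_from_upstream()
    token['last_updated'] = update_time
    return token


def get_token():
    # Refresh lazily: only fetch again once the cached token is more than
    # an hour old (or was never fetched at all).
    if token['last_updated'] is None or time() - token['last_updated'] > 3600:
        obtain_token()
    return token['value']
```

The engine itself calls obtain_token() once at import time, so its request() hook can skip the None check; the sketch keeps it so the function is safe to call cold.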