diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2024-02-16 18:17:03 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarIT.de> | 2024-02-18 09:17:22 +0100 |
commit | 76845ea42c2c3484e30c118f07671b13ade07a29 (patch) | |
tree | dc88db7291342fa9888fa6a748d05765844896e0 /searx | |
parent | 3df53d6e503b97533dda33691462d949aa480dda (diff) | |
download | searxng-76845ea42c2c3484e30c118f07671b13ade07a29.tar.gz searxng-76845ea42c2c3484e30c118f07671b13ade07a29.zip |
[mod] engine ask.com - parse JS result to JSON
Parse the result list from ask.com given in the variable named
window.MESON.initialState::
<script nonce="..">
window.MESON = window.MESON || {};
window.MESON.initialState = {"siteConfig": ...
...}};
window.MESON.loadedLang = "en";
</script>
The result list is in field::
json_resp['search']['webResults']['results']
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- | searx/engines/ask.py | 50 |
1 file changed, 34 insertions(+), 16 deletions(-)
# Post-patch state of searx/engines/ask.py (only the units fully visible in
# the diff are reproduced: the import block and the new response() parser;
# the `about` dict and request() are unchanged context cut at the hunk edges).

from urllib.parse import urlencode

# NOTE: the patch as committed did `import dateutil`, but dateutil's package
# __init__ does not import the `parser` submodule, so `dateutil.parser.parse`
# raises AttributeError at runtime. Import the submodule explicitly.
import dateutil.parser

from lxml import html

from searx import utils


def response(resp):
    """Parse the ask.com result page.

    The result list is embedded as a JS object literal in the variable
    ``window.MESON.initialState`` inside the first ``<script>`` element::

        <script nonce="..">
          window.MESON = window.MESON || {};
          window.MESON.initialState = {"siteConfig": ... }};
          window.MESON.loadedLang = "en";
        </script>

    The hits live under ``json_resp['search']['webResults']['results']``.

    :param resp: HTTP response object whose ``.text`` is the result page HTML
    :returns: list of result dicts (url, title, content, publishedDate, metadata)
    """
    start_tag = 'window.MESON.initialState = {'
    end_tag = '}};'

    dom = html.fromstring(resp.text)
    script = utils.eval_xpath_getindex(dom, '//script', 0, default=None)
    if script is None:
        # Layout changed or the request was blocked: no <script> element.
        # The original code dereferenced `.text` on the None default and
        # crashed with AttributeError; return "no results" instead.
        return []
    script = script.text

    # Cut out the JS object literal.  The `- 1` adjustments keep the opening
    # '{' of start_tag and the closing '}}' of end_tag so the extracted text
    # is a complete, balanced object.
    pos = script.index(start_tag) + len(start_tag) - 1
    script = script[pos:]
    pos = script.index(end_tag) + len(end_tag) - 1
    script = script[:pos]

    json_resp = utils.js_variable_to_python(script)

    results = []

    for item in json_resp['search']['webResults']['results']:

        pubdate_original = item.get('pubdate_original')
        if pubdate_original:
            pubdate_original = dateutil.parser.parse(pubdate_original)
        metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]

        results.append(
            {
                "url": item['url'],
                "title": item['title'],
                "content": item['abstract'],
                "publishedDate": pubdate_original,
                # "img_src": item.get('image_url') or None,  # these are not thumbs / too large
                "metadata": ' | '.join(metadata),
            }
        )
    return results