diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2024-02-16 18:17:03 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarIT.de> | 2024-02-18 09:17:22 +0100 |
commit | 76845ea42c2c3484e30c118f07671b13ade07a29 (patch) | |
tree | dc88db7291342fa9888fa6a748d05765844896e0 /searx | |
parent | 3df53d6e503b97533dda33691462d949aa480dda (diff) | |
download | searxng-76845ea42c2c3484e30c118f07671b13ade07a29.tar.gz searxng-76845ea42c2c3484e30c118f07671b13ade07a29.zip |
[mod] engine ask.com - parse JS result to JSON
Parse the result list from ask.com given in the variable named
window.MESON.initialState::
<script nonce="..">
window.MESON = window.MESON || {};
window.MESON.initialState = {"siteConfig": ...
...}};
window.MESON.loadedLang = "en";
</script>
The result list is in field::
json_resp['search']['webResults']['results']
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- | searx/engines/ask.py | 50 |
1 file changed, 34 insertions(+), 16 deletions(-)
# Post-patch state of searx/engines/ask.py (only the units fully visible in
# the diff are reproduced: the import block and the new response() parser;
# the `about` dict and request() are unchanged context cut at the hunk edges).

from urllib.parse import urlencode

# NOTE: the patch as committed did `import dateutil`, but dateutil's package
# __init__ does not import the `parser` submodule, so `dateutil.parser.parse`
# raises AttributeError at runtime. Import the submodule explicitly.
import dateutil.parser

from lxml import html

from searx import utils


def response(resp):
    """Parse the ask.com result page.

    The result list is embedded as a JS object literal in the variable
    ``window.MESON.initialState`` inside the first ``<script>`` element::

        <script nonce="..">
          window.MESON = window.MESON || {};
          window.MESON.initialState = {"siteConfig": ... }};
          window.MESON.loadedLang = "en";
        </script>

    The hits live under ``json_resp['search']['webResults']['results']``.

    :param resp: HTTP response object whose ``.text`` is the result page HTML
    :returns: list of result dicts (url, title, content, publishedDate, metadata)
    """
    start_tag = 'window.MESON.initialState = {'
    end_tag = '}};'

    dom = html.fromstring(resp.text)
    script = utils.eval_xpath_getindex(dom, '//script', 0, default=None)
    if script is None:
        # Layout changed or the request was blocked: no <script> element.
        # The original code dereferenced `.text` on the None default and
        # crashed with AttributeError; return "no results" instead.
        return []
    script = script.text

    # Cut out the JS object literal.  The `- 1` adjustments keep the opening
    # '{' of start_tag and the closing '}}' of end_tag so the extracted text
    # is a complete, balanced object.
    pos = script.index(start_tag) + len(start_tag) - 1
    script = script[pos:]
    pos = script.index(end_tag) + len(end_tag) - 1
    script = script[:pos]

    json_resp = utils.js_variable_to_python(script)

    results = []

    for item in json_resp['search']['webResults']['results']:

        pubdate_original = item.get('pubdate_original')
        if pubdate_original:
            pubdate_original = dateutil.parser.parse(pubdate_original)
        metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]

        results.append(
            {
                "url": item['url'],
                "title": item['title'],
                "content": item['abstract'],
                "publishedDate": pubdate_original,
                # "img_src": item.get('image_url') or None,  # these are not thumbs / too large
                "metadata": ' | '.join(metadata),
            }
        )
    return results