summary | refs | log | tree | commit | diff
path: root/searx/engines
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de>, 2022-09-20 18:04:21 +0200
committer: Markus Heiser <markus.heiser@darmarit.de>, 2022-09-20 20:18:43 +0200
commit: dcf1d408a53a0dbf61e4bd545537508b42153158 (patch)
tree: c2efd498dc7f4d21c03e80984b7783a4b7437f42 /searx/engines
parent: f98ef718de5638f2f1a610d472438c80b51cc139 (diff)
downloadsearxng-dcf1d408a53a0dbf61e4bd545537508b42153158.tar.gz
searxng-dcf1d408a53a0dbf61e4bd545537508b42153158.zip
[fix] google-news: origin result does not have a content area
Google News is being reworked, and the content area of a news item has been removed from the result markup. Closes: https://github.com/searxng/searxng/issues/1790 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- searx/engines/google_news.py 24
1 files changed, 6 insertions, 18 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 8f5a4b104..87867d65a 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -150,24 +150,12 @@ def response(resp):
# the first <h3> tag in the <article> contains the title of the link
title = extract_text(eval_xpath(result, './article/h3[1]'))
- # the first <div> tag in the <article> contains the content of the link
- content = extract_text(eval_xpath(result, './article/div[1]'))
-
- # the second <div> tag contains origin publisher and the publishing date
-
- pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
- pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))
-
- pub_info = []
- if pub_origin:
- pub_info.append(pub_origin)
- if pub_date:
- # The pub_date is mostly a string like 'yesertday', not a real
- # timezone date or time. Therefore we can't use publishedDate.
- pub_info.append(pub_date)
- pub_info = ', '.join(pub_info)
- if pub_info:
- content = pub_info + ': ' + content
+ # The pub_date is mostly a string like 'yesterday', not a real
+ # timezone-aware date or time. Therefore we can't use publishedDate.
+ pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
+ pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
+
+ content = ' / '.join([x for x in [pub_origin, pub_date] if x])
# The image URL is located in a preceding sibling <img> tag, e.g.:
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"