diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2022-09-20 18:04:21 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2022-09-20 20:18:43 +0200 |
commit | dcf1d408a53a0dbf61e4bd545537508b42153158 (patch) | |
tree | c2efd498dc7f4d21c03e80984b7783a4b7437f42 /searx/engines | |
parent | f98ef718de5638f2f1a610d472438c80b51cc139 (diff) | |
download | searxng-dcf1d408a53a0dbf61e4bd545537508b42153158.tar.gz searxng-dcf1d408a53a0dbf61e4bd545537508b42153158.zip |
[fix] google-news: origin result does not have a content area
Google News is being reworked; the content area of a news item has been
removed.
Closes: https://github.com/searxng/searxng/issues/1790
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/google_news.py | 24 |
1 files changed, 6 insertions, 18 deletions
```diff
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 8f5a4b104..87867d65a 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -150,24 +150,12 @@ def response(resp):
         # the first <h3> tag in the <article> contains the title of the link
         title = extract_text(eval_xpath(result, './article/h3[1]'))
 
-        # the first <div> tag in the <article> contains the content of the link
-        content = extract_text(eval_xpath(result, './article/div[1]'))
-
-        # the second <div> tag contains origin publisher and the publishing date
-
-        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
-        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))
-
-        pub_info = []
-        if pub_origin:
-            pub_info.append(pub_origin)
-        if pub_date:
-            # The pub_date is mostly a string like 'yesertday', not a real
-            # timezone date or time. Therefore we can't use publishedDate.
-            pub_info.append(pub_date)
-        pub_info = ', '.join(pub_info)
-        if pub_info:
-            content = pub_info + ': ' + content
+        # The pub_date is mostly a string like 'yesertday', not a real
+        # timezone date or time. Therefore we can't use publishedDate.
+        pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
+        pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
+
+        content = ' / '.join([x for x in [pub_origin, pub_date] if x])
 
         # The image URL is located in a preceding sibling <img> tag, e.g.:
         # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
```