diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2021-03-08 09:41:32 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2021-03-08 11:43:34 +0100 |
commit | d2faea423a33a82867e097505cd88b84c24a7931 (patch) | |
tree | db04150a3ab80b372aa4f855022f5d6ba08bff98 | |
parent | 0d8b369b5b300e8a575d6715fc75067d09db63a5 (diff) | |
download | searxng-d2faea423a33a82867e097505cd88b84c24a7931.tar.gz searxng-d2faea423a33a82867e097505cd88b84c24a7931.zip |
[fix] rewrite Yahoo-News engine
Many things have changed since the last review of this engine. This patch fixes
the XPath selectors, implements suggestions, and is a complete review / rewrite
of the engine.
Signed-off-by: Markus Heiser <markus@darmarit.de>
-rw-r--r-- | Makefile | 1 | ||||
-rw-r--r-- | searx/engines/yahoo_news.py | 153 |
2 files changed, 81 insertions, 73 deletions
@@ -196,6 +196,7 @@ PYLINT_FILES=\ searx/engines/google_images.py \ searx/engines/mediathekviewweb.py \ searx/engines/google_scholar.py \ + searx/engines/yahoo_news.py \ searx_extra/update/update_external_bangs.py test.pylint: pyenvinstall diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index ca17896dc..5f6734cb3 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -1,16 +1,35 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +"""Yahoo (News) + +Yahoo News is "English only" and do not offer localized nor language queries. + """ - Yahoo (News) -""" + +# pylint: disable=invalid-name, missing-function-docstring import re -from datetime import datetime, timedelta from urllib.parse import urlencode -from lxml import html -from searx.engines.yahoo import parse_url, language_aliases -from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import +from datetime import datetime, timedelta from dateutil import parser -from searx.utils import extract_text, extract_url, match_language +from lxml import html + +from searx import logger +from searx.utils import ( + eval_xpath_list, + eval_xpath_getindex, + extract_text, +) + +from searx.engines.yahoo import parse_url + +# pylint: disable=unused-import +from searx.engines.yahoo import ( + _fetch_supported_languages, + supported_languages_url, +) +# pylint: enable=unused-import + +logger = logger.getChild('yahoo_news engine') # about about = { @@ -22,90 +41,78 @@ about = { "results": 'HTML', } -# engine dependent config -categories = ['news'] +language_support = False +time_range_support = False +safesearch = False paging = True +categories = ['news'] # search-url -search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa - -# specific xpath variables -results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' -url_xpath = './/h3/a/@href' 
-title_xpath = './/h3/a' -content_xpath = './/div[@class="compText"]' -publishedDate_xpath = './/span[contains(@class,"tri")]' -suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' - +search_url = ( + 'https://news.search.yahoo.com/search' + '?{query}&b={offset}' + ) + +AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)') +AGO_TIMEDELTA = { + 'minute': timedelta(minutes=1), + 'hour': timedelta(hours=1), + 'day': timedelta(days=1), + 'week': timedelta(days=7), + 'month': timedelta(days=30), + 'year': timedelta(days=365), +} -# do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] == 'all': - language = 'en' - else: - language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] - - params['url'] = search_url.format(offset=offset, - query=urlencode({'p': query}), - lang=language) - - # TODO required? - params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ - .format(lang=language) + params['url'] = search_url.format( + offset = offset, + query = urlencode({'p': query}) + ) + logger.debug("query_url --> %s", params['url']) return params - -def sanitize_url(url): - if ".yahoo.com/" in url: - return re.sub("\\;\\_ylt\\=.+$", "", url) - else: - return url - - -# get response from search-request def response(resp): results = [] - dom = html.fromstring(resp.text) + # parse results - for result in dom.xpath(results_xpath): - urls = result.xpath(url_xpath) - if len(urls) != 1: + for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'): + + url = eval_xpath_getindex(result, './/h4/a/@href', 0, None) + if url is None: continue - url = sanitize_url(parse_url(extract_url(urls, search_url))) - title = extract_text(result.xpath(title_xpath)[0]) - content = extract_text(result.xpath(content_xpath)[0]) - - # parse publishedDate - publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) - - # still useful ? 
- if re.match("^[0-9]+ minute(s|) ago$", publishedDate): - publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) - elif re.match("^[0-9]+ days? ago$", publishedDate): - publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group())) - elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): - timeNumbers = re.findall(r'\d+', publishedDate) - publishedDate = datetime.now()\ - - timedelta(hours=int(timeNumbers[0]))\ - - timedelta(minutes=int(timeNumbers[1])) + url = parse_url(url) + title = extract_text(result.xpath('.//h4/a')) + content = extract_text(result.xpath('.//p')) + img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None) + + item = { + 'url': url, + 'title': title, + 'content': content, + 'img_src' : img_src + } + + pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]')) + ago = AGO_RE.search(pub_date) + if ago: + number = int(ago.group(1)) + delta = AGO_TIMEDELTA[ago.group(2)] + pub_date = datetime.now() - delta * number else: try: - publishedDate = parser.parse(publishedDate) - except: - publishedDate = datetime.now() + pub_date = parser.parse(pub_date) + except parser.ParserError: + pub_date = None - if publishedDate.year == 1900: - publishedDate = publishedDate.replace(year=datetime.now().year) + if pub_date is not None: + item['publishedDate'] = pub_date + results.append(item) - # append result - results.append({'url': url, - 'title': title, - 'content': content, - 'publishedDate': publishedDate}) + for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'): + results.append({'suggestion': extract_text(suggestion)}) - # return results return results |