path: root/searx/engines/bing_news.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-News: description see :py:obj:`searx.engines.bing`.

.. hint::

   Bing News is *different* in some ways!

"""

# pylint: disable=invalid-name

from typing import TYPE_CHECKING
from urllib.parse import urlencode

from lxml import html

from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits


# about
about = {
    "website": 'https://www.bing.com/news',
    "wikidata_id": 'Q2878637',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'RSS',
}

# engine dependent config
categories = ['news']
paging = True
"""If go through the pages and there are actually no new results for another
page, then bing returns the results from the last page again."""

time_range_support = True
time_map = {
    'day': 'interval="4"',
    'week': 'interval="7"',
    'month': 'interval="9"',
}
"""A string '4' means *last hour*.  We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
Bing does not have news range ``year`` / we use ``month`` instead."""
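# Example: a 'week' time range adds qft=interval="7" to the query string,
# which urlencode() emits as qft=interval%3D%227%22 (see request() below).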

base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""


def request(query, params):
    """Assemble a Bing-News request."""

    engine_region = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
    engine_language = traits.get_language(params['searxng_locale'], 'en')  # type: ignore
    set_bing_cookies(params, engine_language, engine_region)

    # build URL query
    #
    # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1

    page = int(params.get('pageno', 1)) - 1
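    # Bing's ajax endpoint counts results, not pages: with 10 results per page,
    # pageno=1 maps to first=1 / SFX=0 and pageno=2 maps to first=11 / SFX=1.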
    query_params = {
        'q': query,
        'InfiniteScroll': 1,
        # to simplify the page count, use the default of 10 results per page
        'first': page * 10 + 1,
        'SFX': page,
        'form': 'PTFTNR',
        'setlang': engine_region.split('-')[0],
        'cc': engine_region.split('-')[-1],
    }

    if params['time_range']:
        query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')

    params['url'] = base_url + '?' + urlencode(query_params)

    return params


def response(resp):
    """Get response from Bing-Video"""
    results = []

    if not resp.ok or not resp.text:
        return results

    dom = html.fromstring(resp.text)

    for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):

        link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))

        metadata = []
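        # collect the source name (taken from the aria-label of the source div)
        # and, if present, the author from the title link's data-author attribute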
        source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
        if source is not None:
            for item in (
                eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
                # eval_xpath_getindex(source, './/a', 0, None),
                # eval_xpath_getindex(source, './div/span', 3, None),
                link.attrib.get('data-author'),
            ):
                if item is not None:
                    t = extract_text(item)
                    if t and t.strip():
                        metadata.append(t.strip())
        metadata = ' | '.join(metadata)

        thumbnail = None
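        # the img src in the result markup is a relative URL; prefix it with
        # the Bing origin to get an absolute thumbnail URL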
        imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
        if imagelink is not None:
            thumbnail = 'https://www.bing.com/' + imagelink.attrib.get('src')

        results.append(
            {
                'url': url,
                'title': title,
                'content': content,
                'img_src': thumbnail,
                'metadata': metadata,
            }
        )

    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-News."""
    # pylint: disable=import-outside-toplevel

    from searx.engines.bing import fetch_traits as _f

    _f(engine_traits)

    # fix market codes not known by bing news:

    # In bing the market code 'zh-cn' exists, but there is no 'news' category in
    # bing for this market.  As a workaround we use the market code from Hong
    # Kong.  Even if this is not correct, it is better than having no hits at
    # all, or sending invalid queries to bing that could raise the suspicion of
    # a bot.

    # HINT: 'en-hk' is the region code; it does not indicate the language 'en'!
    engine_traits.regions['zh-CN'] = 'en-hk'