searx/engines/brave.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).

Configured ``brave`` engines:

.. code:: yaml

  - name: brave
    engine: brave
    ...
    brave_category: search
    time_range_support: true
    paging: true

  - name: brave.images
    engine: brave
    ...
    brave_category: images

  - name: brave.videos
    engine: brave
    ...
    brave_category: videos

  - name: brave.news
    engine: brave
    ...
    brave_category: news


Implementations
===============

"""
# pylint: disable=fixme

from urllib.parse import (
    urlencode,
    urlparse,
    parse_qs,
)

import chompjs
from lxml import html

from searx.utils import (
    extract_text,
    eval_xpath_list,
    eval_xpath_getindex,
)

about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, video search, image and video search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""

time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}


def request(query, params):

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {
        'q': query,
    }
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category == 'search':
        if params.get('pageno', 1) - 1:
            args['offset'] = params.get('pageno', 1) - 1
        if time_range_map.get(params['time_range']):
            args['tf'] = time_range_map.get(params['time_range'])

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set preferences in cookie
    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')

    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
    #       limited and the country handling has it quirks

    eng_locale = params.get('searxng_locale')
    params['cookies']['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
    params['cookies']['summarizer'] = '0'

    if not eng_locale or eng_locale == 'all':
        params['cookies']['country'] = 'all'  # country=all
    else:
        params['cookies']['country'] = eng_locale.split('-')[-1].lower()
        params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()

    # logger.debug("cookies %s", params['cookies'])


def response(resp):

    if brave_category == 'search':
        return _parse_search(resp)

    datastr = ""
    for line in resp.text.split("\n"):
        if "const data = " in line:
            datastr = line.replace("const data = ", "").strip()[:-1]
            break

    json_data = chompjs.parse_js_object(datastr)
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'news':
        json_resp = json_resp['news']
        return _parse_news(json_resp)

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    return []


def _parse_search(resp):

    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    if answer_tag:
        result_list.append({'answer': extract_text(answer_tag)})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
        if not (url and title_tag):
            continue

        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag:

            # In my tests a video tag in the WEB search was mostoften not a
            # video, except the ones from youtube ..

            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            else:
                item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

        result_list.append(item)

    return result_list


def _get_iframe_src(url):
    parsed_url = urlparse(url)
    if parsed_url.path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
        if video_id:
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
    return None


def _parse_news(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
        }
        if result['thumbnail'] != "null":
            item['img_src'] = result['thumbnail']['src']
        result_list.append(item)

    return result_list


def _parse_images(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
            'template': 'images.html',
            'img_format': result['properties']['format'],
            'source': result['source'],
            'img_src': result['properties']['url'],
        }
        result_list.append(item)

    return result_list


def _parse_videos(json_resp):
    result_list = []

    for result in json_resp["results"]:

        url = result['url']
        item = {
            'url': url,
            'title': result['title'],
            'content': result['description'],
            'template': 'videos.html',
            'length': result['video']['duration'],
            'duration': result['video']['duration'],
        }

        if result['thumbnail'] != "null":
            item['thumbnail'] = result['thumbnail']['src']

        iframe_src = _get_iframe_src(url)
        if iframe_src:
            item['iframe_src'] = iframe_src

        result_list.append(item)

    return result_list