summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-08-06 19:35:56 +0200
committerMarkus Heiser <markus.heiser@darmarIT.de>2023-08-08 16:21:45 +0200
commit460bbe5b8114cb5782a6f3e8cb644bbd29829b64 (patch)
tree36a028d350090c3cb7c231a4ebc385e26771b4fc /searx
parentd151497db3ad33efda95b2fc7c1dd0ecf64304b6 (diff)
downloadsearxng-460bbe5b8114cb5782a6f3e8cb644bbd29829b64.tar.gz
searxng-460bbe5b8114cb5782a6f3e8cb644bbd29829b64.zip
[mod] implement brave (WEB) engine to replace XPath configuration
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r--searx/engines/brave.py274
-rw-r--r--searx/settings.yml48
2 files changed, 252 insertions, 70 deletions
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index deb109b8e..c32d0f39e 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -1,10 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Brave supports the categories listed in :py:obj:`brave_category` (General,
+news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
+<time_range_support>` is limited (see remarks).
+
+Configured ``brave`` engines:
+
+.. code:: yaml
+
+ - name: brave
+ engine: brave
+ ...
+ brave_category: search
+ time_range_support: true
+ paging: true
+
+ - name: brave.images
+ engine: brave
+ ...
+ brave_category: images
+
+ - name: brave.videos
+ engine: brave
+ ...
+ brave_category: videos
+
+ - name: brave.news
+ engine: brave
+ ...
+ brave_category: news
+
+
+Implementations
+===============
+
"""
- Brave (General, news, videos, images)
-"""
+# pylint: disable=fixme
+
+from urllib.parse import (
+ urlencode,
+ urlparse,
+ parse_qs,
+)
-from urllib.parse import urlencode
import chompjs
+from lxml import html
+
+from searx.utils import (
+ extract_text,
+ eval_xpath_list,
+ eval_xpath_getindex,
+)
about = {
"website": 'https://search.brave.com/',
@@ -14,41 +60,87 @@ about = {
"require_api_key": False,
"results": 'HTML',
}
+
base_url = "https://search.brave.com/"
+categories = []
+brave_category = 'search'
+"""Brave supports common web-search, video search, image and video search.
+
+- ``search``: Common WEB search
+- ``videos``: search for videos
+- ``images``: search for images
+- ``news``: search for news
+"""
+
+brave_spellcheck = False
+"""Brave supports some kind of spell checking. When activated, Brave tries to
+fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
+the UI of Brave the user gets warned about this, since we can not warn the user
+in SearXNG, the spellchecking is disabled by default.
+"""
+
+send_accept_language_header = True
paging = False
-categories = ['images', 'videos', 'news'] # images, videos, news
+"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
+category All)."""
+
+safesearch = True
+safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
+
+time_range_support = False
+"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
+category All)."""
+
+time_range_map = {
+ 'day': 'pd',
+ 'week': 'pw',
+ 'month': 'pm',
+ 'year': 'py',
+}
def request(query, params):
+
+ # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
+ params['headers']['Accept-Encoding'] = 'gzip, deflate'
+
args = {
'q': query,
- 'spellcheck': 1,
}
- params["url"] = f"{base_url}{categories[0]}?{urlencode(args)}"
-
-
-def get_video_results(json_data):
- results = []
-
- for result in json_data:
- results.append(
- {
- 'template': 'videos.html',
- 'url': result['url'],
- 'thumbnail_src': result['thumbnail']['src'],
- 'img_src': result['properties']['url'],
- 'content': result['description'],
- 'title': result['title'],
- 'source': result['source'],
- 'duration': result['video']['duration'],
- }
- )
+ if brave_spellcheck:
+ args['spellcheck'] = '1'
+
+ if brave_category == 'search':
+ if params.get('pageno', 1) - 1:
+ args['offset'] = params.get('pageno', 1) - 1
+ if time_range_map.get(params['time_range']):
+ args['tf'] = time_range_map.get(params['time_range'])
+
+ params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
+
+ # set preferences in cookie
+ params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
+
+ # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
+ # limited and the country handling has it quirks
+
+ eng_locale = params.get('searxng_locale')
+ params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country'
+ params['cookies']['summarizer'] = '0'
- return results
+ if not eng_locale or eng_locale == 'all':
+ params['cookies']['country'] = 'all' # country=all
+ else:
+ params['cookies']['country'] = eng_locale.split('-')[-1].lower()
+ params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()
+
+ # logger.debug("cookies %s", params['cookies'])
def response(resp):
- results = []
+
+ if brave_category == 'search':
+ return _parse_search(resp)
datastr = ""
for line in resp.text.split("\n"):
@@ -57,10 +149,81 @@ def response(resp):
break
json_data = chompjs.parse_js_object(datastr)
-
json_resp = json_data[1]['data']['body']['response']
- if categories[0] == 'news':
+
+ if brave_category == 'news':
json_resp = json_resp['news']
+ return _parse_news(json_resp)
+
+ if brave_category == 'images':
+ return _parse_images(json_resp)
+ if brave_category == 'videos':
+ return _parse_videos(json_resp)
+
+ return []
+
+
+def _parse_search(resp):
+
+ result_list = []
+ dom = html.fromstring(resp.text)
+
+ answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
+ if answer_tag:
+ result_list.append({'answer': extract_text(answer_tag)})
+
+ # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
+ xpath_results = '//div[contains(@class, "snippet")]'
+
+ for result in eval_xpath_list(dom, xpath_results):
+
+ url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
+ title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
+ if not (url and title_tag):
+ continue
+
+ content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
+ img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')
+
+ item = {
+ 'url': url,
+ 'title': extract_text(title_tag),
+ 'content': extract_text(content_tag),
+ 'img_src': img_src,
+ }
+
+ video_tag = eval_xpath_getindex(
+ result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
+ )
+ if video_tag:
+
+ # In my tests a video tag in the WEB search was mostoften not a
+ # video, except the ones from youtube ..
+
+ iframe_src = _get_iframe_src(url)
+ if iframe_src:
+ item['iframe_src'] = iframe_src
+ item['template'] = 'videos.html'
+ item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
+ else:
+ item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
+
+ result_list.append(item)
+
+ return result_list
+
+
+def _get_iframe_src(url):
+ parsed_url = urlparse(url)
+ if parsed_url.path == '/watch' and parsed_url.query:
+ video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
+ if video_id:
+ return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
+ return None
+
+
+def _parse_news(json_resp):
+ result_list = []
for result in json_resp["results"]:
item = {
@@ -69,17 +232,52 @@ def response(resp):
'content': result['description'],
}
if result['thumbnail'] != "null":
+ item['img_src'] = result['thumbnail']['src']
+ result_list.append(item)
+
+ return result_list
+
+
+def _parse_images(json_resp):
+ result_list = []
+
+ for result in json_resp["results"]:
+ item = {
+ 'url': result['url'],
+ 'title': result['title'],
+ 'content': result['description'],
+ 'template': 'images.html',
+ 'img_format': result['properties']['format'],
+ 'source': result['source'],
+ 'img_src': result['properties']['url'],
+ }
+ result_list.append(item)
+
+ return result_list
+
+
+def _parse_videos(json_resp):
+ result_list = []
+
+ for result in json_resp["results"]:
+
+ url = result['url']
+ item = {
+ 'url': url,
+ 'title': result['title'],
+ 'content': result['description'],
+ 'template': 'videos.html',
+ 'length': result['video']['duration'],
+ 'duration': result['video']['duration'],
+ }
+
+ if result['thumbnail'] != "null":
item['thumbnail'] = result['thumbnail']['src']
- if categories[0] == 'images':
- item['template'] = 'images.html'
- item['img_format'] = result['properties']['format']
- item['source'] = result['source']
- item['img_src'] = result['properties']['url']
- elif categories[0] == 'videos':
- item['template'] = 'videos.html'
- item['length'] = result['video']['duration']
+ iframe_src = _get_iframe_src(url)
+ if iframe_src:
+ item['iframe_src'] = iframe_src
- results.append(item)
+ result_list.append(item)
- return results
+ return result_list
diff --git a/searx/settings.yml b/searx/settings.yml
index 87bf381eb..04ec3e466 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1816,50 +1816,34 @@ engines:
timeout: 9.0
- name: brave
- shortcut: brave
- engine: xpath
- paging: true
+ engine: brave
+ shortcut: br
time_range_support: true
- first_page_num: 0
- time_range_url: "&tf={time_range_val}"
- search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range}
- url_xpath: //a[@class="result-header"]/@href
- title_xpath: //span[@class="snippet-title"]
- content_xpath: //p[1][@class="snippet-description"]
- suggestion_xpath: //div[@class="text-gray h6"]/a
- time_range_map:
- day: 'pd'
- week: 'pw'
- month: 'pm'
- year: 'py'
+ paging: true
categories: [general, web]
- disabled: true
- headers:
- Accept-Encoding: gzip, deflate
- about:
- website: https://brave.com/search/
- wikidata_id: Q107355971
- use_official_api: false
- require_api_key: false
- results: HTML
+ brave_category: search
+ # brave_spellcheck: true
- name: brave.images
- shortcut: braveimg
engine: brave
- categories: images
- disabled: true
+ network: brave
+ shortcut: brimg
+ categories: [images, web]
+ brave_category: images
- name: brave.videos
- shortcut: bravevid
engine: brave
- categories: videos
- disabled: true
+ network: brave
+ shortcut: brvid
+ categories: [videos, web]
+ brave_category: videos
- name: brave.news
- shortcut: bravenews
engine: brave
+ network: brave
+ shortcut: brnews
categories: news
- disabled: true
+ brave_category: news
- name: petalsearch
shortcut: pts