summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--searx/engines/brave.py7
-rw-r--r--searx/engines/duckduckgo.py17
-rw-r--r--searx/engines/qwant.py7
-rw-r--r--searx/engines/vimeo.py9
-rw-r--r--searx/engines/youtube_noapi.py6
-rw-r--r--searx/utils.py32
6 files changed, 53 insertions, 25 deletions
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 04c2931f9..c5780a02c 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -132,6 +132,7 @@ from lxml import html
from searx import locales
from searx.utils import (
extract_text,
+ extr,
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
@@ -252,11 +253,7 @@ def response(resp):
if brave_category in ('search', 'goggles'):
return _parse_search(resp)
- datastr = ""
- for line in resp.text.split("\n"):
- if "const data = " in line:
- datastr = line.replace("const data = ", "").strip()[:-1]
- break
+ datastr = extr(resp.text, "const data = ", ";\n").strip()
json_data = js_variable_to_python(datastr)
json_resp = json_data[1]['data']['body']['response']
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index b874ca2f8..fced014c1 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -392,7 +392,9 @@ def fetch_traits(engine_traits: EngineTraits):
SearXNG's locale.
"""
- # pylint: disable=too-many-branches, too-many-statements
+ # pylint: disable=too-many-branches, too-many-statements, import-outside-toplevel
+ from searx.utils import extr, js_variable_to_python
+
# fetch regions
engine_traits.all_locale = 'wt-wt'
@@ -403,11 +405,9 @@ def fetch_traits(engine_traits: EngineTraits):
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
- pos = resp.text.find('regions:{') + 8 # type: ignore
- js_code = resp.text[pos:] # type: ignore
- pos = js_code.find('}') + 1
- regions = json.loads(js_code[:pos])
+ js_code = extr(resp.text, 'regions:', ',snippetLengths')
+ regions = json.loads(js_code)
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
@@ -439,12 +439,9 @@ def fetch_traits(engine_traits: EngineTraits):
engine_traits.custom['lang_region'] = {}
- pos = resp.text.find('languages:{') + 10 # type: ignore
- js_code = resp.text[pos:] # type: ignore
- pos = js_code.find('}') + 1
- js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
- languages = json.loads(js_code)
+ js_code = extr(resp.text, 'languages:', ',regions')
+ languages = js_variable_to_python(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index facd47bb9..989fe1445 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -312,13 +312,12 @@ def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import region_tag
+ from searx.utils import extr
resp = network.get(about['website'])
- text = resp.text
- text = text[text.find('INITIAL_PROPS') :]
- text = text[text.find('{') : text.find('</script>')]
+ json_string = extr(resp.text, 'INITIAL_PROPS = ', '</script>')
- q_initial_props = loads(text)
+ q_initial_props = loads(json_string)
q_locales = q_initial_props.get('locales')
eng_tag_list = set()
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index 2449345e6..d46468d8d 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -7,6 +7,8 @@ from urllib.parse import urlencode
from json import loads
from dateutil import parser
+from searx.utils import extr
+
# about
about = {
"website": 'https://vimeo.com/',
@@ -23,7 +25,7 @@ paging = True
# search-url
base_url = 'https://vimeo.com/'
-search_url = base_url + '/search/page:{pageno}?{query}'
+search_url = base_url + 'search/page:{pageno}?{query}'
# do search-request
@@ -36,9 +38,8 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
- data_start_pos = resp.text.find('{"filtered"')
- data_end_pos = resp.text.find(';\n', data_start_pos + 1)
- data = loads(resp.text[data_start_pos:data_end_pos])
+
+ data = loads(extr(resp.text, 'var data = ', ';\n'))
# parse results
for result in data['filtered']['data']:
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index b015dff8d..c2136c3ca 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -7,6 +7,8 @@ from functools import reduce
from json import loads, dumps
from urllib.parse import quote_plus
+from searx.utils import extr
+
# about
about = {
"website": 'https://www.youtube.com/',
@@ -109,8 +111,8 @@ def parse_next_page_response(response_text):
def parse_first_page_response(response_text):
results = []
- results_data = response_text[response_text.find('ytInitialData') :]
- results_data = results_data[results_data.find('{') : results_data.find(';</script>')]
+ results_data = extr(response_text, 'ytInitialData = ', ';</script>')
+
results_json = loads(results_data) if results_data else {}
sections = (
results_json.get('contents', {})
diff --git a/searx/utils.py b/searx/utils.py
index f50618ea2..58ff72bb9 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -2,6 +2,9 @@
"""Utility functions for the engines
"""
+
+from __future__ import annotations
+
import re
import importlib
import importlib.util
@@ -371,6 +374,35 @@ def convert_str_to_int(number_str: str) -> int:
return 0
+def extr(txt: str, begin: str, end: str, default: str = ""):
+ """Extract the string between ``begin`` and ``end`` from ``txt``
+
+ :param txt: String to search in
+ :param begin: First string to be searched for
+ :param end: Second string to be searched for after ``begin``
+ :param default: Default value if one of ``begin`` or ``end`` is not
+ found. Defaults to an empty string.
+ :return: The string between the two search-strings ``begin`` and ``end``.
+ If at least one of ``begin`` or ``end`` is not found, the value of
+ ``default`` is returned.
+
+ Examples:
+ >>> extr("abcde", "a", "e")
+ 'bcd'
+ >>> extr("abcde", "a", "z", default="nothing")
+ 'nothing'
+
+ """
+
+ # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
+
+ try:
+ first = txt.index(begin) + len(begin)
+ return txt[first : txt.index(end, first)]
+ except ValueError:
+ return default
+
+
def int_or_zero(num: Union[List[str], str]) -> int:
"""Convert num to int or 0. num can be either a str or a list.
If num is a list, the first element is converted to int (or return 0 if the list is empty).