summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2023-09-09 10:18:39 +0000
committer: Alexandre Flament <alex@al-f.net> 2023-09-09 13:02:36 +0200
commit: d07c006aed12631a7e0f9a5c82288a8ef7c5bfad (patch)
tree: 3d44039eb5302fc7db7d398f6ec92f03635142c3
parent: 8e45ac42717da130017b53fd332afa416e36b194 (diff)
download: searxng-d07c006aed12631a7e0f9a5c82288a8ef7c5bfad.tar.gz
searxng-d07c006aed12631a7e0f9a5c82288a8ef7c5bfad.zip
Replace chompjs with pure Python code
The new implementation is good enough for the current usage (brave)
-rw-r--r-- requirements.txt 1
-rw-r--r-- searx/engines/brave.py 4
-rw-r--r-- searx/utils.py 73
3 files changed, 75 insertions, 3 deletions
diff --git a/requirements.txt b/requirements.txt
index 6088f6338..a88f7adc0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,3 @@ markdown-it-py==3.0.0
typing_extensions==4.7.1
fasttext-predict==0.9.2.1
pytomlpp==1.0.13
-chompjs==1.2.2
\ No newline at end of file
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index d5e2d3e9f..e9d4af762 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -104,7 +104,6 @@ from urllib.parse import (
parse_qs,
)
-import chompjs
from lxml import html
from searx import locales
@@ -112,6 +111,7 @@ from searx.utils import (
extract_text,
eval_xpath_list,
eval_xpath_getindex,
+ js_variable_to_python,
)
from searx.enginelib.traits import EngineTraits
@@ -215,7 +215,7 @@ def response(resp):
datastr = line.replace("const data = ", "").strip()[:-1]
break
- json_data = chompjs.parse_js_object(datastr)
+ json_data = js_variable_to_python(datastr)
json_resp = json_data[1]['data']['body']['response']
if brave_category == 'news':
diff --git a/searx/utils.py b/searx/utils.py
index 7ddd2305a..9457ca87d 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -7,6 +7,7 @@
import re
import importlib
import importlib.util
+import json
import types
from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
@@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
+_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
+_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
+
_STORAGE_UNIT_VALUE: Dict[str, int] = {
'TB': 1024 * 1024 * 1024 * 1024,
'GB': 1024 * 1024 * 1024,
@@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
return None
return language
return None
+
+
def js_variable_to_python(js_variable):
    """Convert a JavaScript literal into JSON and load it as a Python value.

    The conversion is textual and deliberately limited (chompjs has a more
    complete implementation):

    - single-quoted JS strings are rewritten as double-quoted JSON strings,
    - ``void 0`` / ``void(0)`` outside of strings is replaced by ``null``,
    - unquoted object keys are wrapped in double quotes.

    :param js_variable: source text of the JavaScript value.
    :return: the value produced by ``json.loads`` on the converted text.
    :raises json.JSONDecodeError: when the converted text is still not valid
        JSON (unsupported syntax, unterminated string, ...).
    """
    # Cut the text on every quote character and keep the quotes:
    #   r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    #   ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    # Text parts sit at even indexes, quote characters at odd indexes.
    parts = re.split(r'(["\'])', js_variable)
    # Quote character that opened the current JS string, None outside strings.
    in_string = None
    for i, p in enumerate(parts):
        if p not in ('"', "'"):
            if in_string:
                # Hide colons found inside strings behind a temporary
                # character so the key-quoting regex cannot match them.
                parts[i] = p.replace(':', chr(1))
            else:
                # Outside of strings: replace "void 0" by null.
                # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
                parts[i] = _JS_VOID_RE.sub("null", p)
            continue

        if in_string is None:
            # Start of a new string; JSON only supports double quotes.
            parts[i] = '"'
            in_string = p
            continue

        # A quote character inside a string. It is escaped only when it is
        # preceded by an odd number of backslashes: in "x\\" the backslash is
        # itself escaped and the quote really closes the string.
        before = parts[i - 1]
        escaped = (len(before) - len(before.rstrip('\\'))) % 2 == 1
        if p == in_string and not escaped:
            # End of the string.
            parts[i] = '"'
            in_string = None
        elif p == "'":
            # A single quote never needs escaping in a JSON string and \' is
            # not a valid JSON escape: drop the escaping backslash, if any.
            if escaped:
                parts[i - 1] = before[:-1]
        elif not escaped:
            # Bare double quote inside a single-quoted JS string: it must be
            # escaped once the string is delimited by double quotes.
            parts[i] = r'\"'
        # Otherwise: an already escaped double quote, valid JSON as it is.

    s = ''.join(parts)
    # Add quotes around the keys:
    #   { a: 12 }
    # becomes
    #   { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # Restore the colons that were hidden inside the strings.
    s = s.replace(chr(1), ':')
    return json.loads(s)