summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2023-09-16 13:45:15 +0000
committer: Alexandre Flament <alex@al-f.net> 2023-09-16 18:30:57 +0000
commit: 72f5e7cfb8b1fc7be862bbe96e9e0123de252e5d (patch)
tree: 31ad90bec9e13992e5d709498874b03e72dfc9a5
parent: ec540a967a66156baa06797183cc64c4a3e345be (diff)
downloadsearxng-update_js_ariable_to_python.tar.gz
searxng-update_js_ariable_to_python.zip
js_variable_to_python: add tests, handle more JS syntax (branch: update_js_ariable_to_python)
The tests from chompjs are copied. The commented-out tests do not pass. The implementation of js_variable_to_python has been updated:
* in the main loop, try to make the four different cases clearer
* handle decimal numbers like "-.5", "5." or "- 5" (without the double quotes)
* the character ` is treated as a string delimiter, as intended in JS
* identifiers follow the JS specification ($, _, letters and numbers)
-rw-r--r-- requirements-dev.txt 1
-rw-r--r-- searx/utils.py 148
-rw-r--r-- tests/unit/test_js_variable_to_python.py 283
3 files changed, 389 insertions, 43 deletions
diff --git a/requirements-dev.txt b/requirements-dev.txt
index cde368479..0b66e6886 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -21,3 +21,4 @@ aiounittest==1.4.2
yamllint==1.32.0
wlc==1.13
coloredlogs==15.0.1
+parameterized==0.9.0
\ No newline at end of file
diff --git a/searx/utils.py b/searx/utils.py
index 458cef7ea..08e7e8f77 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -38,9 +38,14 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
-_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
-_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
-_JS_DECIMAL_RE = re.compile(r":\s*\.")
+_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
+_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
+_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
+_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
+_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
+_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
+_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)')
+_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
_STORAGE_UNIT_VALUE: Dict[str, int] = {
'TB': 1024 * 1024 * 1024 * 1024,
@@ -652,12 +657,45 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
return None
+def _j2p_process_escape(match):
+ # deal with ECMA escape characters
+ escape = match.group(1) or match.group(2)
+ return (
+ Rf'\{escape}'
+ if escape in _JSON_PASSTHROUGH_ESCAPES
+ else R'\u00'
+ if escape == 'x'
+ else ''
+ if escape == '\n'
+ else escape
+ )
+
+
+def _j2p_decimal(match):
+ return (
+ match.group(1)
+ + match.group(2)
+ + (match.group(3).replace("_", "") or "0")
+ + "."
+ + (match.group(4).replace("_", "") or "0")
+ )
+
+
+def _j2p_decimal2(match):
+ return match.group(1) + match.group(2) + match.group(3).replace("_", "")
+
+
def js_variable_to_python(js_variable):
"""Convert a javascript variable into JSON and then load the value
It does not deal with all cases, but it is good enough for now.
chompjs has a better implementation.
"""
+ if not isinstance(js_variable, str):
+ raise ValueError("js_variable must be of type str")
+ if js_variable == "":
+ raise ValueError("js_variable can't be an empty string")
+
# when in_string is not None, it contains the character that has opened the string
# either simple quote or double quote
in_string = None
@@ -665,49 +703,68 @@ def js_variable_to_python(js_variable):
# r"""{ a:"f\"irst", c:'sec"ond'}"""
# becomes
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
- parts = re.split(r'(["\'])', js_variable)
- # previous part (to check the escape character antislash)
- previous_p = ""
+ parts = _JS_STRING_DELIMITERS.split(js_variable)
+ # does the previous part ends with a backslash?
+ blackslash_just_before = False
for i, p in enumerate(parts):
- # parse characters inside a ECMA string
- if in_string:
- # we are in a JS string: replace the colon by a temporary character
- # so quote_keys_regex doesn't have to deal with colon inside the JS strings
- parts[i] = parts[i].replace(':', chr(1))
- if in_string == "'":
- # the JS string is delimited by simple quote.
- # This is not supported by JSON.
- # simple quote delimited string are converted to double quote delimited string
- # here, inside a JS string, we escape the double quote
- parts[i] = parts[i].replace('"', r'\"')
-
- # deal with delimieters and escape character
- if not in_string and p in ('"', "'"):
- # we are not in string
- # but p is double or simple quote
- # that's the start of a new string
- # replace simple quote by double quote
- # (JSON doesn't support simple quote)
+ if p == in_string and not blackslash_just_before:
+ # * the current part matches the character which has opened the string
+ # * there is no antislash just before
+ # --> the current part close the current string
+ in_string = None
+ # replace simple quote and ` by double quote
+ # since JSON supports only double quote for string
parts[i] = '"'
+
+ elif in_string:
+ # --> we are in a JS string
+ # replace the colon by a temporary character
+ # so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
+ p = p.replace(':', chr(1))
+ # replace JS escape sequences by JSON escape sequences
+ p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
+ # the JS string is delimited by simple quote.
+ # This is not supported by JSON.
+ # simple quote delimited string are converted to double quote delimited string
+ # here, inside a JS string, we escape the double quote
+ if in_string == "'":
+ p = p.replace('"', r'\"')
+ parts[i] = p
+ # deal with the sequence blackslash then quote
+ # since js_variable splits on quote, we detect this case:
+ # * the previous part ends with a black slash
+ # * the current part is a single quote
+ # when detected the blackslash is removed on the previous part
+ if blackslash_just_before and p[:1] == "'":
+ parts[i - 1] = parts[i - 1][:-1]
+
+ elif in_string is None and p in ('"', "'", "`"):
+ # we are not in string but p is string delimiter
+ # --> that's the start of a new string
in_string = p
- continue
- if p == in_string:
- # we are in a string and the current part MAY close the string
- if len(previous_p) > 0 and previous_p[-1] == '\\':
- # there is an antislash just before: the ECMA string continue
- continue
- # the current p close the string
# replace simple quote by double quote
+ # since JSON supports only double quote for string
parts[i] = '"'
- in_string = None
- if not in_string:
- # replace void 0 by null
+ elif in_string is None:
+ # we are not in a string
+ # replace by null these values:
+ # * void 0
+ # * void(0)
+ # * undefined
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
- # we are sure there is no string in p
- parts[i] = _JS_VOID_RE.sub("null", p)
- # update previous_p
- previous_p = p
+ p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
+ # make sure there is a leading zero in front of float
+ p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
+ p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
+ # remove extra coma in a list or an object
+ # for example [1,2,3,] becomes [1,2,3]
+ p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
+ parts[i] = p
+
+ # update for the next iteration
+ blackslash_just_before = len(p) > 0 and p[-1] == '\\'
+
# join the string
s = ''.join(parts)
# add quote arround the key
@@ -715,8 +772,13 @@ def js_variable_to_python(js_variable):
# becomes
# { "a": 12 }
s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
- s = _JS_DECIMAL_RE.sub(":0.", s)
- # replace the surogate character by colon
- s = s.replace(chr(1), ':')
+ # replace the surogate character by colon and strip whitespaces
+ s = s.replace(chr(1), ':').strip()
# load the JSON and return the result
- return json.loads(s)
+ if s == "":
+ raise ValueError("js_variable can't be an empty string")
+ try:
+ return json.loads(s)
+ except json.JSONDecodeError as e:
+ logger.debug("Internal error: js_variable_to_python creates invalid JSON:\n%s", s)
+ raise ValueError("js_variable_to_python creates invalid JSON") from e
diff --git a/tests/unit/test_js_variable_to_python.py b/tests/unit/test_js_variable_to_python.py
new file mode 100644
index 000000000..634749b2b
--- /dev/null
+++ b/tests/unit/test_js_variable_to_python.py
@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for the function searx.utils.js_variable_to_python
+
+The tests are copied from https://github.com/Nykakin/chompjs/blob/c1501b5cd82c0044539875331745b820e7bfd067/chompjs/test_parser.py
+
+Comment out tests do not pass
+"""
+import math
+
+from parameterized import parameterized
+
+from searx.utils import js_variable_to_python
+
+from tests import SearxTestCase
+
+
+class TestParser(SearxTestCase):
+ @parameterized.expand(
+ [
+ ("{'hello': 'world'}", {'hello': 'world'}),
+ ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}),
+ (
+ "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}",
+ {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'},
+ ),
+ ("{}", {}),
+ ]
+ )
+ def test_parse_object(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ ("[]", []),
+ ("[[[]]]", [[[]]]),
+ ("[[[1]]]", [[[1]]]),
+ ("[1]", [1]),
+ ("[1, 2, 3, 4]", [1, 2, 3, 4]),
+ ("['h', 'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']),
+ ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]),
+ ]
+ )
+ def test_parse_list(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}),
+ ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}),
+ ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]),
+ (
+ "[false, {'true': true, `pies`: \"kot\"}, false,]",
+ [False, {"true": True, 'pies': 'kot'}, False],
+ ),
+ (
+ "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}",
+ {k: 1 for k in 'abcdefghij'},
+ ),
+ (
+ "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
+ {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]},
+ ),
+ ]
+ )
+ def test_parse_mixed(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}),
+ ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]),
+ ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}),
+ ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}),
+ ("[\"\\uD834\\uDD1E\"]", ['𝄞']),
+ ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}),
+ ("['\u00E9']", ['é']),
+ ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}),
+ ('{"a": 3.125e7}', {'a': 3.125e7}),
+ ('''{"a": "b\\'"}''', {'a': "b'"}),
+ ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -0.1}),
+ ('["/* ... */", "// ..."]', ["/* ... */", "// ..."]),
+ ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}),
+ ]
+ )
+ def test_parse_standard_values(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ def test_parse_nan(self):
+ js = '{"A": NaN}'
+ py = js_variable_to_python(js)
+ self.assertTrue(math.isnan(py["A"]))
+
+ @parameterized.expand(
+ [
+ ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}),
+ ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}),
+ # (
+ # "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}",
+ # {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"}
+ # ),
+ # (
+ # "{\"abc\": function() {return '])))))))))))))))';}}",
+ # {"abc": "function() {return '])))))))))))))))';}"},
+ # ),
+ ('{"a": undefined}', {"a": None}), # chompjs returns {"a": "undefined"}
+ ('[undefined, undefined]', [None, None]), # chompjs returns ["undefined", "undefined"]
+ ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}),
+ # ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}),
+ # ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}),
+ ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}),
+ ("{'a': 121.}", {'a': 121.0}),
+ ]
+ )
+ def test_parse_strange_values(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ # ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}),
+ # ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}),
+ ('{"a":\r\n10}', {'a': 10}),
+ ("{'foo': 0,\r\n}", {'foo': 0}),
+ ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}),
+ ]
+ )
+ def test_strange_input(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ ("[0]", [0]),
+ ("[1]", [1]),
+ ("[12]", [12]),
+ ("[12_12]", [1212]),
+ # ("[0x12]", [18]),
+ # ("[0xab]", [171]),
+ # ("[0xAB]", [171]),
+ # ("[0X12]", [18]),
+ # ("[0Xab]", [171]),
+ # ("[0XAB]", [171]),
+ # ("[01234]", [668]),
+ # ("[0o1234]", [668]),
+ # ("[0O1234]", [668]),
+ # ("[0b1111]", [15]),
+ # ("[0B1111]", [15]),
+ ("[-0]", [-0]),
+ ("[-1]", [-1]),
+ ("[-12]", [-12]),
+ ("[-12_12]", [-1212]),
+ # ("[-0x12]", [-18]),
+ # ("[-0xab]", [-171]),
+ # ("[-0xAB]", [-171]),
+ # ("[-0X12]", [-18]),
+ # ("[-0Xab]", [-171]),
+ # ("[-0XAB]", [-171]),
+ # ("[-01234]", [-668]),
+ # ("[-0o1234]", [-668]),
+ # ("[-0O1234]", [-668]),
+ # ("[-0b1111]", [-15]),
+ # ("[-0B1111]", [-15]),
+ ]
+ )
+ def test_integer_numeric_values(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ @parameterized.expand(
+ [
+ ("[0.32]", [0.32]),
+ ("[-0.32]", [-0.32]),
+ ("[.32]", [0.32]),
+ ("[-.32]", [-0.32]),
+ ("[12.]", [12.0]),
+ ("[-12.]", [-12.0]),
+ ("[12.32]", [12.32]),
+ ("[-12.12]", [-12.12]),
+ ("[3.1415926]", [3.1415926]),
+ ("[.123456789]", [0.123456789]),
+ ("[.0123]", [0.0123]),
+ ("[0.0123]", [0.0123]),
+ ("[-.0123]", [-0.0123]),
+ ("[-0.0123]", [-0.0123]),
+ ("[3.1E+12]", [3.1e12]),
+ ("[3.1e+12]", [3.1e12]),
+ ("[.1e-23]", [0.1e-23]),
+ ("[.1e-23]", [0.1e-23]),
+ ]
+ )
+ def test_float_numeric_values(self, js, expected_py):
+ py = js_variable_to_python(js)
+ self.assertEqual(py, expected_py)
+
+ # @parameterized.expand([
+ # ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]),
+ # ])
+ # def test_jsonlines(self, js, expected_py):
+ # py = js_variable_to_python(js)
+ # self.assertEqual(py, expected_py)
+
+
+class TestParserExceptions(SearxTestCase):
+ @parameterized.expand(
+ [
+ ('}{', ValueError),
+ ('', ValueError),
+ (None, ValueError),
+ ]
+ )
+ def test_exceptions(self, js, expected_exception):
+ with self.assertRaises(expected_exception):
+ js_variable_to_python(js)
+
+ @parameterized.expand(
+ [
+ ("{whose: 's's', category_name: '>'}", ValueError),
+ ]
+ )
+ def test_malformed_input(self, in_data, expected_exception):
+ with self.assertRaises(expected_exception):
+ js_variable_to_python(in_data)
+
+ @parameterized.expand(
+ [
+ (
+ '{"test": """}',
+ ValueError,
+ 'js_variable_to_python creates invalid JSON',
+ ),
+ ]
+ )
+ def test_error_messages(self, js, expected_exception, expected_exception_text):
+ with self.assertRaisesRegex(expected_exception, expected_exception_text):
+ js_variable_to_python(js)
+
+
+# class TestOptions(SearxTestCase):
+# @parameterized.expand(
+# [
+# ('{\\\"a\\\": 12}', {'a': 12}),
+# ]
+# )
+# def test_unicode_escape(self, js, expected_py):
+# py = js_variable_to_python(js)
+# self.assertEqual(py, expected_py)
+
+
+class TestParseJsonObjects(SearxTestCase):
+ @parameterized.expand(
+ [
+ # ("", []),
+ # ("aaaaaaaaaaaaaaaa", []),
+ # (" ", []),
+ (" {'a': 12}", [{'a': 12}]),
+ # ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]),
+ # ("[12] [13] [14]", [[12], [13], [14]]),
+ # ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]),
+ # ("[1][1][1]", [[1], [1], [1]]),
+ # ("[1] [2] {'a': ", [[1], [2]]),
+ # ("[]", [[]]),
+ # ("[][][][]", [[], [], [], []]),
+ ("{}", [{}]),
+ # ("{}{}{}{}", [{}, {}, {}, {}]),
+ # ("{{}}{{}}", []),
+ # ("[[]][[]]", [[[]], [[]]]),
+ # ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]),
+ # (
+ # 'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])',
+ # [{}, {'a': 12}, [1, 2, 3]],
+ # ),
+ # ('{"a": 12, broken}{"c": 100}', [{'c': 100}]),
+ # ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]),
+ ]
+ )
+ def test_parse_json_objects(self, js, expected_py):
+ py_in_list = [js_variable_to_python(js)]
+ self.assertEqual(py_in_list, expected_py)