From 72f5e7cfb8b1fc7be862bbe96e9e0123de252e5d Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sat, 16 Sep 2023 13:45:15 +0000 Subject: [PATCH] js_variable_to_python: add tests, handle more JS syntax The tests from chompjs are copied. The commented-out tests do not pass. The implementation of js_variable_to_python has been updated: * in the main loop, try to make the four different cases clearer * handle decimal numbers like "-.5", "5." or "- 5" (without the double quotes) * the character ` is seen as a string delimiter, as intended in JS * the identifiers follow the JS specification ($, _, letters and numbers) --- requirements-dev.txt | 1 + searx/utils.py | 154 ++++++++---- tests/unit/test_js_variable_to_python.py | 283 +++++++++++++++++++++++ 3 files changed, 392 insertions(+), 46 deletions(-) create mode 100644 tests/unit/test_js_variable_to_python.py diff --git a/requirements-dev.txt b/requirements-dev.txt index cde368479..0b66e6886 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,3 +21,4 @@ aiounittest==1.4.2 yamllint==1.32.0 wlc==1.13 coloredlogs==15.0.1 +parameterized==0.9.0 \ No newline at end of file diff --git a/searx/utils.py b/searx/utils.py index 458cef7ea..08e7e8f77 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -38,9 +38,14 @@ _BLOCKED_TAGS = ('script', 'style') _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) -_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)') -_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)') -_JS_DECIMAL_RE = re.compile(r":\s*\.") +_JS_STRING_DELIMITERS = re.compile(r'(["\'`])') +_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)') +_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined') +_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)") +_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)") +_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])") 
+_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)') +_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' _STORAGE_UNIT_VALUE: Dict[str, int] = { 'TB': 1024 * 1024 * 1024 * 1024, @@ -652,12 +657,45 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo return None +def _j2p_process_escape(match): + # deal with ECMA escape characters + escape = match.group(1) or match.group(2) + return ( + Rf'\{escape}' + if escape in _JSON_PASSTHROUGH_ESCAPES + else R'\u00' + if escape == 'x' + else '' + if escape == '\n' + else escape + ) + + +def _j2p_decimal(match): + return ( + match.group(1) + + match.group(2) + + (match.group(3).replace("_", "") or "0") + + "." + + (match.group(4).replace("_", "") or "0") + ) + + +def _j2p_decimal2(match): + return match.group(1) + match.group(2) + match.group(3).replace("_", "") + + def js_variable_to_python(js_variable): """Convert a javascript variable into JSON and then load the value It does not deal with all cases, but it is good enough for now. chompjs has a better implementation. """ + if not isinstance(js_variable, str): + raise ValueError("js_variable must be of type str") + if js_variable == "": + raise ValueError("js_variable can't be an empty string") + # when in_string is not None, it contains the character that has opened the string # either simple quote or double quote in_string = None @@ -665,49 +703,68 @@ def js_variable_to_python(js_variable): # r"""{ a:"f\"irst", c:'sec"ond'}""" # becomes # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}'] - parts = re.split(r'(["\'])', js_variable) - # previous part (to check the escape character antislash) - previous_p = "" + parts = _JS_STRING_DELIMITERS.split(js_variable) + # does the previous part ends with a backslash? 
+ blackslash_just_before = False for i, p in enumerate(parts): - # parse characters inside a ECMA string - if in_string: - # we are in a JS string: replace the colon by a temporary character - # so quote_keys_regex doesn't have to deal with colon inside the JS strings - parts[i] = parts[i].replace(':', chr(1)) - if in_string == "'": - # the JS string is delimited by simple quote. - # This is not supported by JSON. - # simple quote delimited string are converted to double quote delimited string - # here, inside a JS string, we escape the double quote - parts[i] = parts[i].replace('"', r'\"') - - # deal with delimieters and escape character - if not in_string and p in ('"', "'"): - # we are not in string - # but p is double or simple quote - # that's the start of a new string - # replace simple quote by double quote - # (JSON doesn't support simple quote) - parts[i] = '"' - in_string = p - continue - if p == in_string: - # we are in a string and the current part MAY close the string - if len(previous_p) > 0 and previous_p[-1] == '\\': - # there is an antislash just before: the ECMA string continue - continue - # the current p close the string - # replace simple quote by double quote - parts[i] = '"' + if p == in_string and not blackslash_just_before: + # * the current part matches the character which has opened the string + # * there is no antislash just before + # --> the current part close the current string in_string = None + # replace simple quote and ` by double quote + # since JSON supports only double quote for string + parts[i] = '"' - if not in_string: - # replace void 0 by null + elif in_string: + # --> we are in a JS string + # replace the colon by a temporary character + # so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings + p = p.replace(':', chr(1)) + # replace JS escape sequences by JSON escape sequences + p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p) + # the JS string is delimited by simple quote. 
+ # This is not supported by JSON. + # simple quote delimited string are converted to double quote delimited string + # here, inside a JS string, we escape the double quote + if in_string == "'": + p = p.replace('"', r'\"') + parts[i] = p + # deal with the sequence blackslash then quote + # since js_variable splits on quote, we detect this case: + # * the previous part ends with a black slash + # * the current part is a single quote + # when detected the blackslash is removed on the previous part + if blackslash_just_before and p[:1] == "'": + parts[i - 1] = parts[i - 1][:-1] + + elif in_string is None and p in ('"', "'", "`"): + # we are not in string but p is string delimiter + # --> that's the start of a new string + in_string = p + # replace simple quote by double quote + # since JSON supports only double quote for string + parts[i] = '"' + + elif in_string is None: + # we are not in a string + # replace by null these values: + # * void 0 + # * void(0) + # * undefined # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void - # we are sure there is no string in p - parts[i] = _JS_VOID_RE.sub("null", p) - # update previous_p - previous_p = p + p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p) + # make sure there is a leading zero in front of float + p = _JS_DECIMAL_RE.sub(_j2p_decimal, p) + p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p) + # remove extra coma in a list or an object + # for example [1,2,3,] becomes [1,2,3] + p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p) + parts[i] = p + + # update for the next iteration + blackslash_just_before = len(p) > 0 and p[-1] == '\\' + # join the string s = ''.join(parts) # add quote arround the key @@ -715,8 +772,13 @@ def js_variable_to_python(js_variable): # becomes # { "a": 12 } s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s) - s = _JS_DECIMAL_RE.sub(":0.", s) - # replace the surogate character by colon - s = s.replace(chr(1), ':') + # replace the surogate character by colon and strip whitespaces 
+ s = s.replace(chr(1), ':').strip() # load the JSON and return the result - return json.loads(s) + if s == "": + raise ValueError("js_variable can't be an empty string") + try: + return json.loads(s) + except json.JSONDecodeError as e: + logger.debug("Internal error: js_variable_to_python creates invalid JSON:\n%s", s) + raise ValueError("js_variable_to_python creates invalid JSON") from e diff --git a/tests/unit/test_js_variable_to_python.py b/tests/unit/test_js_variable_to_python.py new file mode 100644 index 000000000..634749b2b --- /dev/null +++ b/tests/unit/test_js_variable_to_python.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- +""" +Tests for the function searx.utils.js_variable_to_python + +The tests are copied from https://github.com/Nykakin/chompjs/blob/c1501b5cd82c0044539875331745b820e7bfd067/chompjs/test_parser.py + +Comment out tests do not pass +""" +import math + +from parameterized import parameterized + +from searx.utils import js_variable_to_python + +from tests import SearxTestCase + + +class TestParser(SearxTestCase): + @parameterized.expand( + [ + ("{'hello': 'world'}", {'hello': 'world'}), + ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}), + ( + "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}", + {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}, + ), + ("{}", {}), + ] + ) + def test_parse_object(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + ("[]", []), + ("[[[]]]", [[[]]]), + ("[[[1]]]", [[[1]]]), + ("[1]", [1]), + ("[1, 2, 3, 4]", [1, 2, 3, 4]), + ("['h', 'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']), + ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]), + ] + ) + def test_parse_list(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}), + ("{'hello': [1, 
2, 3, 4]}", {'hello': [1, 2, 3, 4]}), + ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]), + ( + "[false, {'true': true, `pies`: \"kot\"}, false,]", + [False, {"true": True, 'pies': 'kot'}, False], + ), + ( + "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}", + {k: 1 for k in 'abcdefghij'}, + ), + ( + "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}", + {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]}, + ), + ] + ) + def test_parse_mixed(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}), + ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]), + ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}), + ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}), + ("[\"\\uD834\\uDD1E\"]", ['𝄞']), + ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}), + ("['\u00E9']", ['é']), + ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}), + ('{"a": 3.125e7}', {'a': 3.125e7}), + ('''{"a": "b\\'"}''', {'a': "b'"}), + ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -0.1}), + ('["/* ... */", "// ..."]', ["/* ... 
*/", "// ..."]), + ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}), + ] + ) + def test_parse_standard_values(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + def test_parse_nan(self): + js = '{"A": NaN}' + py = js_variable_to_python(js) + self.assertTrue(math.isnan(py["A"])) + + @parameterized.expand( + [ + ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}), + ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}), + # ( + # "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}", + # {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"} + # ), + # ( + # "{\"abc\": function() {return '])))))))))))))))';}}", + # {"abc": "function() {return '])))))))))))))))';}"}, + # ), + ('{"a": undefined}', {"a": None}), # chompjs returns {"a": "undefined"} + ('[undefined, undefined]', [None, None]), # chompjs returns ["undefined", "undefined"] + ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}), + # ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}), + # ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}), + ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}), + ("{'a': 121.}", {'a': 121.0}), + ] + ) + def test_parse_strange_values(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + # ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}), + # ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}), + ('{"a":\r\n10}', {'a': 10}), + ("{'foo': 0,\r\n}", {'foo': 0}), + ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}), + ] + ) + def test_strange_input(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + ("[0]", [0]), + ("[1]", [1]), + ("[12]", [12]), + ("[12_12]", [1212]), + # ("[0x12]", [18]), + # ("[0xab]", [171]), + # ("[0xAB]", [171]), + 
# ("[0X12]", [18]), + # ("[0Xab]", [171]), + # ("[0XAB]", [171]), + # ("[01234]", [668]), + # ("[0o1234]", [668]), + # ("[0O1234]", [668]), + # ("[0b1111]", [15]), + # ("[0B1111]", [15]), + ("[-0]", [-0]), + ("[-1]", [-1]), + ("[-12]", [-12]), + ("[-12_12]", [-1212]), + # ("[-0x12]", [-18]), + # ("[-0xab]", [-171]), + # ("[-0xAB]", [-171]), + # ("[-0X12]", [-18]), + # ("[-0Xab]", [-171]), + # ("[-0XAB]", [-171]), + # ("[-01234]", [-668]), + # ("[-0o1234]", [-668]), + # ("[-0O1234]", [-668]), + # ("[-0b1111]", [-15]), + # ("[-0B1111]", [-15]), + ] + ) + def test_integer_numeric_values(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + @parameterized.expand( + [ + ("[0.32]", [0.32]), + ("[-0.32]", [-0.32]), + ("[.32]", [0.32]), + ("[-.32]", [-0.32]), + ("[12.]", [12.0]), + ("[-12.]", [-12.0]), + ("[12.32]", [12.32]), + ("[-12.12]", [-12.12]), + ("[3.1415926]", [3.1415926]), + ("[.123456789]", [0.123456789]), + ("[.0123]", [0.0123]), + ("[0.0123]", [0.0123]), + ("[-.0123]", [-0.0123]), + ("[-0.0123]", [-0.0123]), + ("[3.1E+12]", [3.1e12]), + ("[3.1e+12]", [3.1e12]), + ("[.1e-23]", [0.1e-23]), + ("[.1e-23]", [0.1e-23]), + ] + ) + def test_float_numeric_values(self, js, expected_py): + py = js_variable_to_python(js) + self.assertEqual(py, expected_py) + + # @parameterized.expand([ + # ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]), + # ]) + # def test_jsonlines(self, js, expected_py): + # py = js_variable_to_python(js) + # self.assertEqual(py, expected_py) + + +class TestParserExceptions(SearxTestCase): + @parameterized.expand( + [ + ('}{', ValueError), + ('', ValueError), + (None, ValueError), + ] + ) + def test_exceptions(self, js, expected_exception): + with self.assertRaises(expected_exception): + js_variable_to_python(js) + + @parameterized.expand( + [ + ("{whose: 's's', category_name: '>'}", ValueError), + ] + ) + def test_malformed_input(self, in_data, expected_exception): + with 
self.assertRaises(expected_exception): + js_variable_to_python(in_data) + + @parameterized.expand( + [ + ( + '{"test": """}', + ValueError, + 'js_variable_to_python creates invalid JSON', + ), + ] + ) + def test_error_messages(self, js, expected_exception, expected_exception_text): + with self.assertRaisesRegex(expected_exception, expected_exception_text): + js_variable_to_python(js) + + +# class TestOptions(SearxTestCase): +# @parameterized.expand( +# [ +# ('{\\\"a\\\": 12}', {'a': 12}), +# ] +# ) +# def test_unicode_escape(self, js, expected_py): +# py = js_variable_to_python(js) +# self.assertEqual(py, expected_py) + + +class TestParseJsonObjects(SearxTestCase): + @parameterized.expand( + [ + # ("", []), + # ("aaaaaaaaaaaaaaaa", []), + # (" ", []), + (" {'a': 12}", [{'a': 12}]), + # ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]), + # ("[12] [13] [14]", [[12], [13], [14]]), + # ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]), + # ("[1][1][1]", [[1], [1], [1]]), + # ("[1] [2] {'a': ", [[1], [2]]), + # ("[]", [[]]), + # ("[][][][]", [[], [], [], []]), + ("{}", [{}]), + # ("{}{}{}{}", [{}, {}, {}, {}]), + # ("{{}}{{}}", []), + # ("[[]][[]]", [[[]], [[]]]), + # ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]), + # ( + # 'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])', + # [{}, {'a': 12}, [1, 2, 3]], + # ), + # ('{"a": 12, broken}{"c": 100}', [{'c': 100}]), + # ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]), + ] + ) + def test_parse_json_objects(self, js, expected_py): + py_in_list = [js_variable_to_python(js)] + self.assertEqual(py_in_list, expected_py)