js_variable_to_python: add tests, handle more JS syntax

The tests from chompjs are copied.
The commented-out tests do not pass.
The implementation of js_variable_to_python has been updated:
* in the main loop, try to make the four different cases clearer
* handle decimal numbers like "-.5", "5." or "- 5" (without the double quotes)
* the character ` is treated as a string delimiter, as intended in JS
* the identifiers follow the JS specification ($, _, letters and numbers)
This commit is contained in:
Alexandre Flament 2023-09-16 13:45:15 +00:00
parent ec540a967a
commit 72f5e7cfb8
3 changed files with 392 additions and 46 deletions

View file

@ -21,3 +21,4 @@ aiounittest==1.4.2
yamllint==1.32.0 yamllint==1.32.0
wlc==1.13 wlc==1.13
coloredlogs==15.0.1 coloredlogs==15.0.1
parameterized==0.9.0

View file

@ -38,9 +38,14 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)') _JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)') _JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
_JS_DECIMAL_RE = re.compile(r":\s*\.") _JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
# Number written with a dot, located right after "[", "," or ":".
# Groups: (1) the delimiter before the number, (2) an optional minus sign,
# (3) the integer digits, (4) the fractional digits.  Either digit group may be
# empty (".5", "5.") and both may contain "_" digit separators.
_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
# Run of digits (with optional "_" separators) right after "[", "," or ":".
# Groups: (1) the delimiter, (2) an optional minus sign, (3) the digits.
_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
# Trailing comma before a closing "]" or "}"; group 1 is the closing bracket.
_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
# Backslash followed by any single character; group 1 is that character.
# re.DOTALL is required so that a backslash followed by a newline (a JS line
# continuation) is matched too: without it "." never matches "\n" and the
# line-continuation branch of _j2p_process_escape can never be reached.
_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)', re.DOTALL)
# Characters that, when preceded by a backslash, are kept as a backslash escape
# by _j2p_process_escape (quote, backslash, b, f, n, r, t, u).
_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
_STORAGE_UNIT_VALUE: Dict[str, int] = { _STORAGE_UNIT_VALUE: Dict[str, int] = {
'TB': 1024 * 1024 * 1024 * 1024, 'TB': 1024 * 1024 * 1024 * 1024,
@ -652,12 +657,45 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
return None return None
def _j2p_process_escape(match):
    """Return the replacement text for one backslash escape of a JS string.

    ``match`` is a regex match whose group 1 (or, failing that, group 2) holds
    the character that follows the backslash.

    * a character listed in ``_JSON_PASSTHROUGH_ESCAPES`` is kept together
      with its backslash,
    * ``x`` is replaced by ``\\u00`` (the text that follows the match is not
      touched by this function),
    * a newline is dropped,
    * any other character is returned without the backslash.
    """
    escape = match.group(1) or match.group(2)

    if escape in _JSON_PASSTHROUGH_ESCAPES:
        # keep the escape sequence unchanged
        return Rf'\{escape}'
    if escape == 'x':
        # \x.. is rewritten with a \u00 prefix
        return R'\u00'
    if escape == '\n':
        # backslash + newline produces nothing
        return ''
    # every other escaped character stands for itself
    return escape
def _j2p_decimal(match):
    """Rebuild a dotted number from a four-group regex match.

    Group 1 (text before the number) and group 2 (sign) are copied verbatim.
    Groups 3 and 4 (digits before and after the dot) have their ``_``
    characters removed, and an empty digit group is replaced by ``"0"``.
    """
    prefix, sign, int_digits, frac_digits = match.group(1, 2, 3, 4)
    int_part = int_digits.replace("_", "")
    frac_part = frac_digits.replace("_", "")
    if not int_part:
        int_part = "0"
    if not frac_part:
        frac_part = "0"
    return prefix + sign + int_part + "." + frac_part
def _j2p_decimal2(match):
    """Rebuild an integer from a three-group regex match.

    Group 1 (text before the number) and group 2 (sign) are copied verbatim;
    the ``_`` characters are removed from group 3 (the digits).
    """
    prefix, sign, digits = match.group(1, 2, 3)
    return prefix + sign + digits.replace("_", "")
def js_variable_to_python(js_variable): def js_variable_to_python(js_variable):
"""Convert a javascript variable into JSON and then load the value """Convert a javascript variable into JSON and then load the value
It does not deal with all cases, but it is good enough for now. It does not deal with all cases, but it is good enough for now.
chompjs has a better implementation. chompjs has a better implementation.
""" """
if not isinstance(js_variable, str):
raise ValueError("js_variable must be of type str")
if js_variable == "":
raise ValueError("js_variable can't be an empty string")
# when in_string is not None, it contains the character that has opened the string # when in_string is not None, it contains the character that has opened the string
# either simple quote or double quote # either simple quote or double quote
in_string = None in_string = None
@ -665,49 +703,68 @@ def js_variable_to_python(js_variable):
# r"""{ a:"f\"irst", c:'sec"ond'}""" # r"""{ a:"f\"irst", c:'sec"ond'}"""
# becomes # becomes
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}'] # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
parts = re.split(r'(["\'])', js_variable) parts = _JS_STRING_DELIMITERS.split(js_variable)
# previous part (to check the escape character antislash) # does the previous part ends with a backslash?
previous_p = "" blackslash_just_before = False
for i, p in enumerate(parts): for i, p in enumerate(parts):
# parse characters inside a ECMA string if p == in_string and not blackslash_just_before:
if in_string: # * the current part matches the character which has opened the string
# we are in a JS string: replace the colon by a temporary character # * there is no antislash just before
# so quote_keys_regex doesn't have to deal with colon inside the JS strings # --> the current part close the current string
parts[i] = parts[i].replace(':', chr(1))
if in_string == "'":
# the JS string is delimited by simple quote.
# This is not supported by JSON.
# simple quote delimited string are converted to double quote delimited string
# here, inside a JS string, we escape the double quote
parts[i] = parts[i].replace('"', r'\"')
# deal with delimieters and escape character
if not in_string and p in ('"', "'"):
# we are not in string
# but p is double or simple quote
# that's the start of a new string
# replace simple quote by double quote
# (JSON doesn't support simple quote)
parts[i] = '"'
in_string = p
continue
if p == in_string:
# we are in a string and the current part MAY close the string
if len(previous_p) > 0 and previous_p[-1] == '\\':
# there is an antislash just before: the ECMA string continue
continue
# the current p close the string
# replace simple quote by double quote
parts[i] = '"'
in_string = None in_string = None
# replace simple quote and ` by double quote
# since JSON supports only double quote for string
parts[i] = '"'
if not in_string: elif in_string:
# replace void 0 by null # --> we are in a JS string
# replace the colon by a temporary character
# so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
p = p.replace(':', chr(1))
# replace JS escape sequences by JSON escape sequences
p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
# the JS string is delimited by simple quote.
# This is not supported by JSON.
# simple quote delimited string are converted to double quote delimited string
# here, inside a JS string, we escape the double quote
if in_string == "'":
p = p.replace('"', r'\"')
parts[i] = p
# deal with the sequence blackslash then quote
# since js_variable splits on quote, we detect this case:
# * the previous part ends with a black slash
# * the current part is a single quote
# when detected the blackslash is removed on the previous part
if blackslash_just_before and p[:1] == "'":
parts[i - 1] = parts[i - 1][:-1]
elif in_string is None and p in ('"', "'", "`"):
# we are not in string but p is string delimiter
# --> that's the start of a new string
in_string = p
# replace simple quote by double quote
# since JSON supports only double quote for string
parts[i] = '"'
elif in_string is None:
# we are not in a string
# replace by null these values:
# * void 0
# * void(0)
# * undefined
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
# we are sure there is no string in p p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
parts[i] = _JS_VOID_RE.sub("null", p) # make sure there is a leading zero in front of float
# update previous_p p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
previous_p = p p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
# remove extra coma in a list or an object
# for example [1,2,3,] becomes [1,2,3]
p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
parts[i] = p
# update for the next iteration
blackslash_just_before = len(p) > 0 and p[-1] == '\\'
# join the string # join the string
s = ''.join(parts) s = ''.join(parts)
# add quote arround the key # add quote arround the key
@ -715,8 +772,13 @@ def js_variable_to_python(js_variable):
# becomes # becomes
# { "a": 12 } # { "a": 12 }
s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s) s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
s = _JS_DECIMAL_RE.sub(":0.", s) # replace the surogate character by colon and strip whitespaces
# replace the surogate character by colon s = s.replace(chr(1), ':').strip()
s = s.replace(chr(1), ':')
# load the JSON and return the result # load the JSON and return the result
return json.loads(s) if s == "":
raise ValueError("js_variable can't be an empty string")
try:
return json.loads(s)
except json.JSONDecodeError as e:
logger.debug("Internal error: js_variable_to_python creates invalid JSON:\n%s", s)
raise ValueError("js_variable_to_python creates invalid JSON") from e

View file

@ -0,0 +1,283 @@
# -*- coding: utf-8 -*-
"""
Tests for the function searx.utils.js_variable_to_python
The tests are copied from https://github.com/Nykakin/chompjs/blob/c1501b5cd82c0044539875331745b820e7bfd067/chompjs/test_parser.py
The commented-out tests do not pass.
"""
import math
from parameterized import parameterized
from searx.utils import js_variable_to_python
from tests import SearxTestCase
class TestParser(SearxTestCase):
    """Check the Python value returned by js_variable_to_python for valid input.

    Each case is a pair (JS source, expected Python value).
    The commented-out cases are not supported by js_variable_to_python.
    """

    # objects with quoted keys and string values
    @parameterized.expand(
        [
            ("{'hello': 'world'}", {'hello': 'world'}),
            ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}),
            (
                "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}",
                {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'},
            ),
            ("{}", {}),
        ]
    )
    def test_parse_object(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # flat and nested lists
    @parameterized.expand(
        [
            ("[]", []),
            ("[[[]]]", [[[]]]),
            ("[[[1]]]", [[[1]]]),
            ("[1]", [1]),
            ("[1, 2, 3, 4]", [1, 2, 3, 4]),
            ("['h', 'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']),
            ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]),
        ]
    )
    def test_parse_list(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # lists and objects nested in each other
    @parameterized.expand(
        [
            ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}),
            ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}),
            ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]),
            (
                "[false, {'true': true, `pies`: \"kot\"}, false,]",
                [False, {"true": True, 'pies': 'kot'}, False],
            ),
            (
                "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}",
                {k: 1 for k in 'abcdefghij'},
            ),
            (
                "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
                {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]},
            ),
        ]
    )
    def test_parse_mixed(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # numbers, booleans, null, escape sequences, comment-like strings
    @parameterized.expand(
        [
            ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}),
            ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]),
            ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}),
            ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}),
            ("[\"\\uD834\\uDD1E\"]", ['𝄞']),
            ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}),
            ("['\u00E9']", ['é']),
            ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}),
            ('{"a": 3.125e7}', {'a': 3.125e7}),
            ('''{"a": "b\\'"}''', {'a': "b'"}),
            ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -0.1}),
            ('["/* ... */", "// ..."]', ["/* ... */", "// ..."]),
            ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}),
        ]
    )
    def test_parse_standard_values(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # NaN is not equal to itself, so it can't be checked with assertEqual
    def test_parse_nan(self):
        result = js_variable_to_python('{"A": NaN}')
        self.assertTrue(math.isnan(result["A"]))

    # unquoted keys, undefined, numeric keys, number with a trailing dot
    @parameterized.expand(
        [
            ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}),
            ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}),
            # (
            #     "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}",
            #     {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"}
            # ),
            # (
            #     "{\"abc\": function() {return '])))))))))))))))';}}",
            #     {"abc": "function() {return '])))))))))))))))';}"},
            # ),
            ('{"a": undefined}', {"a": None}),  # chompjs returns {"a": "undefined"}
            ('[undefined, undefined]', [None, None]),  # chompjs returns ["undefined", "undefined"]
            ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}),
            # ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}),
            # ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}),
            ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}),
            ("{'a': 121.}", {'a': 121.0}),
        ]
    )
    def test_parse_strange_values(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # line breaks inside the value, keys that start with a JSON keyword
    @parameterized.expand(
        [
            # ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}),
            # ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}),
            ('{"a":\r\n10}', {'a': 10}),
            ("{'foo': 0,\r\n}", {'foo': 0}),
            ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}),
        ]
    )
    def test_strange_input(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # integers, including the "_" digit separator
    @parameterized.expand(
        [
            ("[0]", [0]),
            ("[1]", [1]),
            ("[12]", [12]),
            ("[12_12]", [1212]),
            # ("[0x12]", [18]),
            # ("[0xab]", [171]),
            # ("[0xAB]", [171]),
            # ("[0X12]", [18]),
            # ("[0Xab]", [171]),
            # ("[0XAB]", [171]),
            # ("[01234]", [668]),
            # ("[0o1234]", [668]),
            # ("[0O1234]", [668]),
            # ("[0b1111]", [15]),
            # ("[0B1111]", [15]),
            ("[-0]", [-0]),
            ("[-1]", [-1]),
            ("[-12]", [-12]),
            ("[-12_12]", [-1212]),
            # ("[-0x12]", [-18]),
            # ("[-0xab]", [-171]),
            # ("[-0xAB]", [-171]),
            # ("[-0X12]", [-18]),
            # ("[-0Xab]", [-171]),
            # ("[-0XAB]", [-171]),
            # ("[-01234]", [-668]),
            # ("[-0o1234]", [-668]),
            # ("[-0O1234]", [-668]),
            # ("[-0b1111]", [-15]),
            # ("[-0B1111]", [-15]),
        ]
    )
    def test_integer_numeric_values(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)

    # floats: missing leading or trailing digits, exponent notation
    @parameterized.expand(
        [
            ("[0.32]", [0.32]),
            ("[-0.32]", [-0.32]),
            ("[.32]", [0.32]),
            ("[-.32]", [-0.32]),
            ("[12.]", [12.0]),
            ("[-12.]", [-12.0]),
            ("[12.32]", [12.32]),
            ("[-12.12]", [-12.12]),
            ("[3.1415926]", [3.1415926]),
            ("[.123456789]", [0.123456789]),
            ("[.0123]", [0.0123]),
            ("[0.0123]", [0.0123]),
            ("[-.0123]", [-0.0123]),
            ("[-0.0123]", [-0.0123]),
            ("[3.1E+12]", [3.1e12]),
            ("[3.1e+12]", [3.1e12]),
            ("[.1e-23]", [0.1e-23]),
            ("[.1e-23]", [0.1e-23]),
        ]
    )
    def test_float_numeric_values(self, js, expected_py):
        self.assertEqual(js_variable_to_python(js), expected_py)
# @parameterized.expand([
# ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]),
# ])
# def test_jsonlines(self, js, expected_py):
# py = js_variable_to_python(js)
# self.assertEqual(py, expected_py)
class TestParserExceptions(SearxTestCase):
    """Check the exception raised by js_variable_to_python for invalid input."""

    # unbalanced braces, empty string, value which is not a str
    @parameterized.expand(
        [
            ('}{', ValueError),
            ('', ValueError),
            (None, ValueError),
        ]
    )
    def test_exceptions(self, js, expected_exception):
        with self.assertRaises(expected_exception):
            js_variable_to_python(js)

    # a quote inside a string delimited by the same quote
    @parameterized.expand(
        [
            ("{whose: 's's', category_name: '>'}", ValueError),
        ]
    )
    def test_malformed_input(self, in_data, expected_exception):
        with self.assertRaises(expected_exception):
            js_variable_to_python(in_data)

    # the exception message is checked too
    @parameterized.expand(
        [
            (
                '{"test": """}',
                ValueError,
                'js_variable_to_python creates invalid JSON',
            ),
        ]
    )
    def test_error_messages(self, js, expected_exception, expected_exception_text):
        with self.assertRaisesRegex(expected_exception, expected_exception_text):
            js_variable_to_python(js)
# class TestOptions(SearxTestCase):
# @parameterized.expand(
# [
# ('{\\\"a\\\": 12}', {'a': 12}),
# ]
# )
# def test_unicode_escape(self, js, expected_py):
# py = js_variable_to_python(js)
# self.assertEqual(py, expected_py)
class TestParseJsonObjects(SearxTestCase):
    """Cases taken from the chompjs tests that expect a list of parsed values.

    js_variable_to_python returns a single value: the test wraps it in a list
    before the comparison. The commented-out cases do not pass.
    """

    @parameterized.expand(
        [
            # ("", []),
            # ("aaaaaaaaaaaaaaaa", []),
            # ("         ", []),
            ("      {'a': 12}", [{'a': 12}]),
            # ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]),
            # ("[12] [13] [14]", [[12], [13], [14]]),
            # ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]),
            # ("[1][1][1]", [[1], [1], [1]]),
            # ("[1] [2] {'a': ", [[1], [2]]),
            # ("[]", [[]]),
            # ("[][][][]", [[], [], [], []]),
            ("{}", [{}]),
            # ("{}{}{}{}", [{}, {}, {}, {}]),
            # ("{{}}{{}}", []),
            # ("[[]][[]]", [[[]], [[]]]),
            # ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]),
            # (
            #     'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])',
            #     [{}, {'a': 12}, [1, 2, 3]],
            # ),
            # ('{"a": 12, broken}{"c": 100}', [{'c': 100}]),
            # ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]),
        ]
    )
    def test_parse_json_objects(self, js, expected_py):
        wrapped = [js_variable_to_python(js)]
        self.assertEqual(wrapped, expected_py)