mirror of
https://github.com/searxng/searxng.git
synced 2024-11-25 20:31:00 +00:00
Replace chompjs with pure Python code
The new implementation is good enough for the current usage (brave)
This commit is contained in:
parent
8e45ac4271
commit
d07c006aed
3 changed files with 75 additions and 3 deletions
|
@ -17,4 +17,3 @@ markdown-it-py==3.0.0
|
||||||
typing_extensions==4.7.1
|
typing_extensions==4.7.1
|
||||||
fasttext-predict==0.9.2.1
|
fasttext-predict==0.9.2.1
|
||||||
pytomlpp==1.0.13
|
pytomlpp==1.0.13
|
||||||
chompjs==1.2.2
|
|
|
@ -104,7 +104,6 @@ from urllib.parse import (
|
||||||
parse_qs,
|
parse_qs,
|
||||||
)
|
)
|
||||||
|
|
||||||
import chompjs
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
from searx import locales
|
from searx import locales
|
||||||
|
@ -112,6 +111,7 @@ from searx.utils import (
|
||||||
extract_text,
|
extract_text,
|
||||||
eval_xpath_list,
|
eval_xpath_list,
|
||||||
eval_xpath_getindex,
|
eval_xpath_getindex,
|
||||||
|
js_variable_to_python,
|
||||||
)
|
)
|
||||||
from searx.enginelib.traits import EngineTraits
|
from searx.enginelib.traits import EngineTraits
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ def response(resp):
|
||||||
datastr = line.replace("const data = ", "").strip()[:-1]
|
datastr = line.replace("const data = ", "").strip()[:-1]
|
||||||
break
|
break
|
||||||
|
|
||||||
json_data = chompjs.parse_js_object(datastr)
|
json_data = js_variable_to_python(datastr)
|
||||||
json_resp = json_data[1]['data']['body']['response']
|
json_resp = json_data[1]['data']['body']['response']
|
||||||
|
|
||||||
if brave_category == 'news':
|
if brave_category == 'news':
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
import re
|
import re
|
||||||
import importlib
|
import importlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
import json
|
||||||
import types
|
import types
|
||||||
|
|
||||||
from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
|
from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
|
||||||
|
@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style')
|
||||||
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
|
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
|
||||||
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
|
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
|
||||||
|
|
||||||
|
# Unquoted JS object key: a word that follows '{', whitespace or ',' and is
# immediately followed by ':'.  Group 2 is the key name; js_variable_to_python()
# re-emits the three groups with the key wrapped in double quotes.
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
|
||||||
|
# JS ``void <digits>`` or ``void(<digits>)`` expression; js_variable_to_python()
# replaces every match found outside of a JS string with JSON ``null``.
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
|
||||||
|
|
||||||
_STORAGE_UNIT_VALUE: Dict[str, int] = {
|
_STORAGE_UNIT_VALUE: Dict[str, int] = {
|
||||||
'TB': 1024 * 1024 * 1024 * 1024,
|
'TB': 1024 * 1024 * 1024 * 1024,
|
||||||
'GB': 1024 * 1024 * 1024,
|
'GB': 1024 * 1024 * 1024,
|
||||||
|
@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
|
||||||
return None
|
return None
|
||||||
return language
|
return language
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def js_variable_to_python(js_variable: str) -> Any:
    """Convert a JavaScript literal into JSON text and load the value.

    The conversion is deliberately small: it does not deal with all cases, but
    it is good enough for the current usage.  chompjs has a better
    implementation.

    Supported rewrites:

    - single quote delimited strings become double quote delimited strings,
    - unquoted object keys (``{ a: 12 }``) get double quotes,
    - ``void 0`` / ``void(0)`` outside of strings becomes ``null``.

    :param js_variable: source text of the JavaScript value.
    :return: the Python value produced by :py:func:`json.loads`.
    :raises json.JSONDecodeError: when the rewritten text is still not valid
        JSON (unsupported JavaScript syntax, unterminated string, ...).
    """
    # Splitting on a captured group keeps the delimiters: text parts and quote
    # characters alternate, so a quote part is always preceded by a text part
    # (possibly empty).
    #   r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    #   ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)

    # When not None, the quote character (simple or double) that opened the
    # JS string we are currently in.
    in_string = None

    for i, p in enumerate(parts):
        if p not in ('"', "'"):
            # text part
            if in_string is None:
                # outside of a string: replace void 0 by null
                # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
                parts[i] = _JS_VOID_RE.sub("null", p)
            else:
                # inside a string: hide the colons behind a temporary character
                # so _JS_QUOTE_KEYS_RE never matches inside a JS string
                parts[i] = p.replace(':', chr(1))
            continue

        if in_string is None:
            # start of a new string; JSON only knows double quote delimiters
            parts[i] = '"'
            in_string = p
            continue

        # p is a quote character inside a string.  It is escaped when it is
        # preceded by an odd number of backslashes (an even number is a run of
        # escaped backslashes and does not escape the quote).
        before = parts[i - 1]
        backslashes = len(before) - len(before.rstrip('\\'))
        if backslashes % 2 == 1:
            if p == "'":
                # \' is not a valid JSON escape: drop the backslash and keep
                # the simple quote as a plain character
                parts[i - 1] = before[:-1]
            # \" is already a valid JSON escape: keep it as it is
            continue

        if p == in_string:
            # unescaped delimiter: the string ends here
            parts[i] = '"'
            in_string = None
        elif p == '"':
            # bare double quote inside a simple quote delimited string:
            # escape it since the string is now delimited by double quotes
            parts[i] = r'\"'
        # else: bare simple quote inside a double quote delimited string,
        # nothing to do

    s = ''.join(parts)
    # add quotes around the keys:
    #   { a: 12 }
    # becomes
    #   { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # replace the surrogate character by the colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)
|
||||||
|
|
Loading…
Reference in a new issue