mirror of
https://github.com/searxng/searxng.git
synced 2024-11-18 08:51:06 +00:00
[enh] add re-usable func to filter text
This commit is contained in:
parent
0fb3f0e4ae
commit
0fa81fc782
6 changed files with 53 additions and 25 deletions
|
@ -132,6 +132,7 @@ from lxml import html
|
||||||
from searx import locales
|
from searx import locales
|
||||||
from searx.utils import (
|
from searx.utils import (
|
||||||
extract_text,
|
extract_text,
|
||||||
|
extr,
|
||||||
eval_xpath,
|
eval_xpath,
|
||||||
eval_xpath_list,
|
eval_xpath_list,
|
||||||
eval_xpath_getindex,
|
eval_xpath_getindex,
|
||||||
|
@ -252,11 +253,7 @@ def response(resp):
|
||||||
if brave_category in ('search', 'goggles'):
|
if brave_category in ('search', 'goggles'):
|
||||||
return _parse_search(resp)
|
return _parse_search(resp)
|
||||||
|
|
||||||
datastr = ""
|
datastr = extr(resp.text, "const data = ", ";\n").strip()
|
||||||
for line in resp.text.split("\n"):
|
|
||||||
if "const data = " in line:
|
|
||||||
datastr = line.replace("const data = ", "").strip()[:-1]
|
|
||||||
break
|
|
||||||
|
|
||||||
json_data = js_variable_to_python(datastr)
|
json_data = js_variable_to_python(datastr)
|
||||||
json_resp = json_data[1]['data']['body']['response']
|
json_resp = json_data[1]['data']['body']['response']
|
||||||
|
|
|
@ -392,7 +392,9 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||||
SearXNG's locale.
|
SearXNG's locale.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# pylint: disable=too-many-branches, too-many-statements
|
# pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel
|
||||||
|
from searx.utils import extr, js_variable_to_python
|
||||||
|
|
||||||
# fetch regions
|
# fetch regions
|
||||||
|
|
||||||
engine_traits.all_locale = 'wt-wt'
|
engine_traits.all_locale = 'wt-wt'
|
||||||
|
@ -403,11 +405,9 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||||
if not resp.ok: # type: ignore
|
if not resp.ok: # type: ignore
|
||||||
print("ERROR: response from DuckDuckGo is not OK.")
|
print("ERROR: response from DuckDuckGo is not OK.")
|
||||||
|
|
||||||
pos = resp.text.find('regions:{') + 8 # type: ignore
|
js_code = extr(resp.text, 'regions:', ',snippetLengths')
|
||||||
js_code = resp.text[pos:] # type: ignore
|
|
||||||
pos = js_code.find('}') + 1
|
|
||||||
regions = json.loads(js_code[:pos])
|
|
||||||
|
|
||||||
|
regions = json.loads(js_code)
|
||||||
for eng_tag, name in regions.items():
|
for eng_tag, name in regions.items():
|
||||||
|
|
||||||
if eng_tag == 'wt-wt':
|
if eng_tag == 'wt-wt':
|
||||||
|
@ -439,12 +439,9 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||||
|
|
||||||
engine_traits.custom['lang_region'] = {}
|
engine_traits.custom['lang_region'] = {}
|
||||||
|
|
||||||
pos = resp.text.find('languages:{') + 10 # type: ignore
|
js_code = extr(resp.text, 'languages:', ',regions')
|
||||||
js_code = resp.text[pos:] # type: ignore
|
|
||||||
pos = js_code.find('}') + 1
|
|
||||||
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
|
|
||||||
languages = json.loads(js_code)
|
|
||||||
|
|
||||||
|
languages = js_variable_to_python(js_code)
|
||||||
for eng_lang, name in languages.items():
|
for eng_lang, name in languages.items():
|
||||||
|
|
||||||
if eng_lang == 'wt_WT':
|
if eng_lang == 'wt_WT':
|
||||||
|
|
|
@ -312,13 +312,12 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||||
# pylint: disable=import-outside-toplevel
|
# pylint: disable=import-outside-toplevel
|
||||||
from searx import network
|
from searx import network
|
||||||
from searx.locales import region_tag
|
from searx.locales import region_tag
|
||||||
|
from searx.utils import extr
|
||||||
|
|
||||||
resp = network.get(about['website'])
|
resp = network.get(about['website'])
|
||||||
text = resp.text
|
json_string = extr(resp.text, 'INITIAL_PROPS = ', '</script>')
|
||||||
text = text[text.find('INITIAL_PROPS') :]
|
|
||||||
text = text[text.find('{') : text.find('</script>')]
|
|
||||||
|
|
||||||
q_initial_props = loads(text)
|
q_initial_props = loads(json_string)
|
||||||
q_locales = q_initial_props.get('locales')
|
q_locales = q_initial_props.get('locales')
|
||||||
eng_tag_list = set()
|
eng_tag_list = set()
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,8 @@ from urllib.parse import urlencode
|
||||||
from json import loads
|
from json import loads
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
|
|
||||||
|
from searx.utils import extr
|
||||||
|
|
||||||
# about
|
# about
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://vimeo.com/',
|
"website": 'https://vimeo.com/',
|
||||||
|
@ -23,7 +25,7 @@ paging = True
|
||||||
|
|
||||||
# search-url
|
# search-url
|
||||||
base_url = 'https://vimeo.com/'
|
base_url = 'https://vimeo.com/'
|
||||||
search_url = base_url + '/search/page:{pageno}?{query}'
|
search_url = base_url + 'search/page:{pageno}?{query}'
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
# do search-request
|
||||||
|
@ -36,9 +38,8 @@ def request(query, params):
|
||||||
# get response from search-request
|
# get response from search-request
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
data_start_pos = resp.text.find('{"filtered"')
|
|
||||||
data_end_pos = resp.text.find(';\n', data_start_pos + 1)
|
data = loads(extr(resp.text, 'var data = ', ';\n'))
|
||||||
data = loads(resp.text[data_start_pos:data_end_pos])
|
|
||||||
|
|
||||||
# parse results
|
# parse results
|
||||||
for result in data['filtered']['data']:
|
for result in data['filtered']['data']:
|
||||||
|
|
|
@ -7,6 +7,8 @@ from functools import reduce
|
||||||
from json import loads, dumps
|
from json import loads, dumps
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
|
from searx.utils import extr
|
||||||
|
|
||||||
# about
|
# about
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://www.youtube.com/',
|
"website": 'https://www.youtube.com/',
|
||||||
|
@ -109,8 +111,8 @@ def parse_next_page_response(response_text):
|
||||||
|
|
||||||
def parse_first_page_response(response_text):
|
def parse_first_page_response(response_text):
|
||||||
results = []
|
results = []
|
||||||
results_data = response_text[response_text.find('ytInitialData') :]
|
results_data = extr(response_text, 'ytInitialData = ', ';</script>')
|
||||||
results_data = results_data[results_data.find('{') : results_data.find(';</script>')]
|
|
||||||
results_json = loads(results_data) if results_data else {}
|
results_json = loads(results_data) if results_data else {}
|
||||||
sections = (
|
sections = (
|
||||||
results_json.get('contents', {})
|
results_json.get('contents', {})
|
||||||
|
|
|
@ -2,6 +2,9 @@
|
||||||
"""Utility functions for the engines
|
"""Utility functions for the engines
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import importlib
|
import importlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
@ -371,6 +374,35 @@ def convert_str_to_int(number_str: str) -> int:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def extr(txt: str, begin: str, end: str, default: str = "") -> str:
    """Extract the string between ``begin`` and ``end`` from ``txt``

    Only the first occurrence of ``begin`` is considered, and ``end`` is
    searched for starting right after that occurrence.  Neither ``begin`` nor
    ``end`` is part of the returned string.

    :param txt:     String to search in
    :param begin:   First string to be searched for
    :param end:     Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
        found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
        If at least one of ``begin`` or ``end`` is not found, the value of
        ``default`` is returned.

    Examples:
      >>> extr("abcde", "a", "e")
      'bcd'
      >>> extr("abcde", "a", "z", default="nothing")
      'nothing'

    """

    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129

    try:
        # str.index() raises ValueError when the needle is missing, so a
        # single except clause covers both a missing ``begin`` and a missing
        # ``end``.
        first = txt.index(begin) + len(begin)
        return txt[first : txt.index(end, first)]
    except ValueError:
        return default
|
||||||
|
|
||||||
|
|
||||||
def int_or_zero(num: Union[List[str], str]) -> int:
|
def int_or_zero(num: Union[List[str], str]) -> int:
|
||||||
"""Convert num to int or 0. num can be either a str or a list.
|
"""Convert num to int or 0. num can be either a str or a list.
|
||||||
If num is a list, the first element is converted to int (or return 0 if the list is empty).
|
If num is a list, the first element is converted to int (or return 0 if the list is empty).
|
||||||
|
|
Loading…
Reference in a new issue