mirror of
https://github.com/searxng/searxng.git
synced 2024-05-17 19:12:39 +00:00
[mod] lower memory footprint by lazy loading JSON data
This patch implements lazy loading of the JSON data. Motivation: in most requests not all of the JSON data is needed, yet all of it is loaded. For example, these four JSON files: - currencies.json ~550KB - engine_descriptions.json ~1.3MB - external_bangs.json ~1.3MB - osm_keys_tags.json ~2.2MB are most often not used, yet they consume a lot of memory and, by the way, they also extend the time required to instantiate a worker. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
e45a7cc063
commit
82fd0dac60
|
@ -20,13 +20,20 @@ __all__ = [
|
|||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from searx import logger
|
||||
|
||||
data_dir = Path(__file__).parent
|
||||
logger = logger.getChild('data')
|
||||
|
||||
|
||||
def _load(filename):
    """Decode the JSON file *filename* located in ``data_dir``.

    The file is opened as UTF-8 text and the parsed JSON object is returned.
    """
    json_path = data_dir / filename
    with open(json_path, encoding='utf-8') as json_file:
        return json.load(json_file)
|
||||
CURRENCIES: dict
|
||||
USER_AGENTS: dict
|
||||
EXTERNAL_URLS: dict
|
||||
WIKIDATA_UNITS: dict
|
||||
EXTERNAL_BANGS: dict
|
||||
OSM_KEYS_TAGS: dict
|
||||
ENGINE_DESCRIPTIONS: dict
|
||||
ENGINE_TRAITS: dict
|
||||
LOCALES: dict
|
||||
|
||||
|
||||
def ahmia_blacklist_loader():
|
||||
|
@ -42,12 +49,27 @@ def ahmia_blacklist_loader():
|
|||
return f.read().split()
|
||||
|
||||
|
||||
CURRENCIES = _load('currencies.json')
|
||||
USER_AGENTS = _load('useragents.json')
|
||||
EXTERNAL_URLS = _load('external_urls.json')
|
||||
WIKIDATA_UNITS = _load('wikidata_units.json')
|
||||
EXTERNAL_BANGS = _load('external_bangs.json')
|
||||
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
|
||||
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
|
||||
ENGINE_TRAITS = _load('engine_traits.json')
|
||||
LOCALES = _load('locales.json')
|
||||
NAME_TO_JSON_FILE = {
|
||||
'CURRENCIES': 'currencies.json',
|
||||
'USER_AGENTS': 'useragents.json',
|
||||
'EXTERNAL_URLS': 'external_urls.json',
|
||||
'WIKIDATA_UNITS': 'wikidata_units.json',
|
||||
'EXTERNAL_BANGS': 'external_bangs.json',
|
||||
'OSM_KEYS_TAGS': 'osm_keys_tags.json',
|
||||
'ENGINE_DESCRIPTIONS': 'engine_descriptions.json',
|
||||
'ENGINE_TRAITS': 'engine_traits.json',
|
||||
'LOCALES': 'locales.json',
|
||||
}
|
||||
|
||||
|
||||
def __getattr__(name: str):
    """Lazily load the JSON data behind the module attribute *name*.

    Python calls a module-level ``__getattr__`` only when the normal
    attribute lookup fails.  The first access to one of the names listed in
    ``NAME_TO_JSON_FILE`` therefore ends up here: the matching JSON file is
    read from ``data_dir`` and the decoded object is stored in the module
    globals under *name*, so later accesses find it directly.

    Raises :py:obj:`AttributeError` for any name that has no JSON file
    registered in ``NAME_TO_JSON_FILE``.
    """
    json_name = NAME_TO_JSON_FILE.get(name)
    if not json_name:
        # not one of the lazy JSON globals: behave like a normal missing attribute
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    json_path = data_dir / json_name
    logger.debug("init global %s from JSON file %s", name, json_path)
    with open(json_path, encoding='utf-8') as json_file:
        loaded = json.load(json_file)

    # cache in the module namespace so the file is not read again for this name
    globals()[name] = loaded
    return loaded
|
||||
|
|
|
@ -16,7 +16,7 @@ import types
|
|||
from typing import Dict, Literal, Iterable, Union, Callable, Optional, TYPE_CHECKING
|
||||
|
||||
from searx import locales
|
||||
from searx.data import data_dir, ENGINE_TRAITS
|
||||
from searx import data
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from . import Engine
|
||||
|
@ -193,7 +193,7 @@ class EngineTraits:
|
|||
class EngineTraitsMap(Dict[str, EngineTraits]):
|
||||
"""A python dictionary to map :class:`EngineTraits` by engine name."""
|
||||
|
||||
ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
|
||||
ENGINE_TRAITS_FILE = (data.data_dir / 'engine_traits.json').resolve()
|
||||
"""File with persistence of the :py:obj:`EngineTraitsMap`."""
|
||||
|
||||
def save_data(self):
|
||||
|
@ -205,7 +205,7 @@ class EngineTraitsMap(Dict[str, EngineTraits]):
|
|||
def from_data(cls) -> 'EngineTraitsMap':
|
||||
"""Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
|
||||
obj = cls()
|
||||
for k, v in ENGINE_TRAITS.items():
|
||||
for k, v in data.ENGINE_TRAITS.items():
|
||||
obj[k] = EngineTraits(**v)
|
||||
return obj
|
||||
|
||||
|
|
|
@ -37,9 +37,9 @@ from typing import List, Dict, Any, Optional
|
|||
from urllib.parse import quote
|
||||
from lxml import html
|
||||
|
||||
from searx import data
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.data import ENGINE_TRAITS
|
||||
|
||||
# about
|
||||
about: Dict[str, Any] = {
|
||||
|
@ -86,7 +86,7 @@ aa_ext: str = ''
|
|||
|
||||
def init(engine_settings=None): # pylint: disable=unused-argument
|
||||
"""Check of engine's settings."""
|
||||
traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
|
||||
traits = EngineTraits(**data.ENGINE_TRAITS['annas archive'])
|
||||
|
||||
if aa_content and aa_content not in traits.custom['content']:
|
||||
raise ValueError(f'invalid setting content: {aa_content}')
|
||||
|
|
|
@ -12,6 +12,7 @@ import babel
|
|||
import lxml.html
|
||||
|
||||
from searx import (
|
||||
data,
|
||||
locales,
|
||||
redislib,
|
||||
external_bang,
|
||||
|
@ -230,7 +231,7 @@ def quote_ddg_bangs(query):
|
|||
for val in re.split(r'(\s+)', query):
|
||||
if not val.strip():
|
||||
continue
|
||||
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
|
||||
if val.startswith('!') and external_bang.get_node(data.EXTERNAL_BANGS, val[1:]):
|
||||
val = f"'{val}'"
|
||||
query_parts.append(val)
|
||||
return ' '.join(query_parts)
|
||||
|
|
|
@ -18,7 +18,7 @@ from typing import TYPE_CHECKING
|
|||
from urllib.parse import urlencode, urlparse, urljoin
|
||||
from lxml import html
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx import data
|
||||
from searx.utils import extract_text, html_to_text, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
|
||||
|
@ -238,7 +238,7 @@ def unit_to_str(unit):
|
|||
for prefix in WIKIDATA_PREFIX:
|
||||
if unit.startswith(prefix):
|
||||
wikidata_entity = unit[len(prefix) :]
|
||||
real_unit = WIKIDATA_UNITS.get(wikidata_entity)
|
||||
real_unit = data.WIKIDATA_UNITS.get(wikidata_entity)
|
||||
if real_unit is None:
|
||||
return unit
|
||||
return real_unit['symbol']
|
||||
|
|
|
@ -10,7 +10,7 @@ from functools import partial
|
|||
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.data import OSM_KEYS_TAGS, CURRENCIES
|
||||
from searx import data as searx_data
|
||||
from searx.utils import searx_useragent
|
||||
from searx.external_urls import get_external_url
|
||||
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
|
||||
|
@ -435,7 +435,7 @@ def get_label(labels, lang):
|
|||
def get_tag_label(tag_category, tag_name, lang):
|
||||
"""Get tag label from OSM_KEYS_TAGS"""
|
||||
tag_name = '' if tag_name is None else tag_name
|
||||
tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
|
||||
tag_labels = searx_data.OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
|
||||
return get_label(tag_labels, lang)
|
||||
|
||||
|
||||
|
@ -449,12 +449,12 @@ def get_key_label(key_name, lang):
|
|||
# https://taginfo.openstreetmap.org/keys/currency#values
|
||||
currency = key_name.split(':')
|
||||
if len(currency) > 1:
|
||||
o = CURRENCIES['iso4217'].get(currency[1])
|
||||
o = searx_data.CURRENCIES['iso4217'].get(currency[1])
|
||||
if o:
|
||||
return get_label(o, lang).lower()
|
||||
return currency[1]
|
||||
|
||||
labels = OSM_KEYS_TAGS['keys']
|
||||
labels = searx_data.OSM_KEYS_TAGS['keys']
|
||||
for k in key_name.split(':') + ['*']:
|
||||
labels = labels.get(k)
|
||||
if labels is None:
|
||||
|
|
|
@ -13,7 +13,7 @@ from json import loads
|
|||
from dateutil.parser import isoparse
|
||||
from babel.dates import format_datetime, format_date, format_time, get_datetime_format
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx import data
|
||||
from searx.network import post, get
|
||||
from searx.utils import searx_useragent, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
|
@ -762,7 +762,7 @@ def debug_explain_wikidata_query(query, method='GET'):
|
|||
|
||||
def init(engine_settings=None): # pylint: disable=unused-argument
|
||||
# WIKIDATA_PROPERTIES : add unit symbols
|
||||
WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)
|
||||
WIKIDATA_PROPERTIES.update(data.WIKIDATA_UNITS)
|
||||
|
||||
# WIKIDATA_PROPERTIES : add property labels
|
||||
wikidata_property_names = []
|
||||
|
|
|
@ -40,9 +40,9 @@ from urllib.parse import quote
|
|||
from lxml import html
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx import data
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.data import ENGINE_TRAITS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import httpx
|
||||
|
@ -80,7 +80,7 @@ zlib_ext: str = ""
|
|||
|
||||
def init(engine_settings=None) -> None: # pylint: disable=unused-argument
|
||||
"""Check of engine's settings."""
|
||||
traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])
|
||||
traits: EngineTraits = EngineTraits(**data.ENGINE_TRAITS["z-library"])
|
||||
|
||||
if zlib_ext and zlib_ext not in traits.custom["ext"]:
|
||||
raise ValueError(f"invalid setting ext: {zlib_ext}")
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# pylint: disable=missing-module-docstring
|
||||
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from searx.data import EXTERNAL_BANGS
|
||||
from searx import data
|
||||
|
||||
LEAF_KEY = chr(16)
|
||||
|
||||
|
@ -56,7 +56,7 @@ def resolve_bang_definition(bang_definition, query):
|
|||
|
||||
def get_bang_definition_and_autocomplete(bang, external_bangs_db=None): # pylint: disable=invalid-name
|
||||
if external_bangs_db is None:
|
||||
external_bangs_db = EXTERNAL_BANGS
|
||||
external_bangs_db = data.EXTERNAL_BANGS
|
||||
|
||||
bang_definition, bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang)
|
||||
|
||||
|
@ -90,7 +90,7 @@ def get_bang_url(search_query, external_bangs_db=None):
|
|||
ret_val = None
|
||||
|
||||
if external_bangs_db is None:
|
||||
external_bangs_db = EXTERNAL_BANGS
|
||||
external_bangs_db = data.EXTERNAL_BANGS
|
||||
|
||||
if search_query.external_bang:
|
||||
bang_definition, _ = get_bang_definition_and_ac(external_bangs_db, search_query.external_bang)
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import math
|
||||
|
||||
from searx.data import EXTERNAL_URLS
|
||||
from searx import data
|
||||
|
||||
|
||||
IMDB_PREFIX_TO_URL_ID = {
|
||||
|
@ -43,7 +43,7 @@ def get_external_url(url_id, item_id, alternative="default"):
|
|||
elif url_id == 'wikimedia_image':
|
||||
item_id = get_wikimedia_image_id(item_id)
|
||||
|
||||
url_description = EXTERNAL_URLS.get(url_id)
|
||||
url_description = data.EXTERNAL_URLS.get(url_id)
|
||||
if url_description:
|
||||
url_template = url_description["urls"].get(alternative)
|
||||
if url_template is not None:
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# pylint: disable=missing-module-docstring
|
||||
|
||||
from hashlib import md5
|
||||
from searx.data import ahmia_blacklist_loader
|
||||
from searx import data
|
||||
|
||||
name = "Ahmia blacklist"
|
||||
description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)"
|
||||
|
@ -24,5 +24,5 @@ def init(_app, settings):
|
|||
if not settings['outgoing']['using_tor_proxy']:
|
||||
# disable the plugin
|
||||
return False
|
||||
ahmia_blacklist = ahmia_blacklist_loader()
|
||||
ahmia_blacklist = data.ahmia_blacklist_loader()
|
||||
return True
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx import data
|
||||
|
||||
name = "Unit converter plugin"
|
||||
description = gettext("Convert between units")
|
||||
|
@ -38,7 +38,7 @@ def _parse_text_and_convert(search, splitted_query):
|
|||
from_unit = None
|
||||
to_unit = None
|
||||
|
||||
for unit in WIKIDATA_UNITS.values():
|
||||
for unit in data.WIKIDATA_UNITS.values():
|
||||
if unit['symbol'] == from_unit_key:
|
||||
from_unit = unit
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
import unicodedata
|
||||
import re
|
||||
|
||||
from searx.data import CURRENCIES
|
||||
from searx import data
|
||||
from .online import OnlineProcessor
|
||||
|
||||
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
|
||||
|
@ -20,14 +20,14 @@ def normalize_name(name):
|
|||
|
||||
def name_to_iso4217(name):
|
||||
name = normalize_name(name)
|
||||
currency = CURRENCIES['names'].get(name, [name])
|
||||
currency = data.CURRENCIES['names'].get(name, [name])
|
||||
if isinstance(currency, str):
|
||||
return currency
|
||||
return currency[0]
|
||||
|
||||
|
||||
def iso4217_to_name(iso4217, language):
|
||||
return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
|
||||
return data.CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
|
||||
|
||||
|
||||
class OnlineCurrencyProcessor(OnlineProcessor):
|
||||
|
|
|
@ -21,7 +21,7 @@ from lxml import html
|
|||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
|
||||
|
||||
from searx import settings
|
||||
from searx.data import USER_AGENTS, data_dir
|
||||
from searx import data as searx_data
|
||||
from searx.version import VERSION_TAG
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||
|
@ -81,7 +81,9 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
|
|||
|
||||
See searx/data/useragents.json
|
||||
"""
|
||||
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
|
||||
return searx_data.USER_AGENTS['ua'].format(
|
||||
os=os_string or choice(searx_data.USER_AGENTS['os']), version=choice(searx_data.USER_AGENTS['versions'])
|
||||
)
|
||||
|
||||
|
||||
class _HTMLTextExtractorException(Exception):
|
||||
|
@ -600,7 +602,7 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
|
|||
|
||||
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
|
||||
fasttext.FastText.eprint = lambda x: None
|
||||
_FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
|
||||
_FASTTEXT_MODEL = fasttext.load_model(str(searx_data.data_dir / 'lid.176.ftz'))
|
||||
return _FASTTEXT_MODEL
|
||||
|
||||
|
||||
|
|
|
@ -58,7 +58,7 @@ from searx import infopage
|
|||
from searx import limiter
|
||||
from searx.botdetection import link_token
|
||||
|
||||
from searx.data import ENGINE_DESCRIPTIONS
|
||||
from searx import data
|
||||
from searx.results import Timing
|
||||
from searx.settings_defaults import OUTPUT_FORMATS
|
||||
from searx.settings_loader import get_default_settings_path
|
||||
|
@ -1102,14 +1102,14 @@ def image_proxy():
|
|||
@app.route('/engine_descriptions.json', methods=['GET'])
|
||||
def engine_descriptions():
|
||||
locale = get_locale().split('_')[0]
|
||||
result = ENGINE_DESCRIPTIONS['en'].copy()
|
||||
result = data.ENGINE_DESCRIPTIONS['en'].copy()
|
||||
if locale != 'en':
|
||||
for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
|
||||
for engine, description in data.ENGINE_DESCRIPTIONS.get(locale, {}).items():
|
||||
result[engine] = description
|
||||
for engine, description in result.items():
|
||||
if len(description) == 2 and description[1] == 'ref':
|
||||
ref_engine, ref_lang = description[0].split(':')
|
||||
description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
|
||||
description = data.ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
|
||||
if isinstance(description, str):
|
||||
description = [description, 'wikipedia']
|
||||
result[engine] = description
|
||||
|
|
Loading…
Reference in a new issue