[mod] lower memory footprint by lazy loading JSON data

This patch implements lazy loading of the JSON data.

Motivation: in most requests, not all of the JSON data is needed, yet all of
it is loaded. For example, these four JSON files:

- currencies.json ~550 KB
- engine_descriptions.json ~1.3 MB
- external_bangs.json ~1.3 MB
- osm_keys_tags.json ~2.2 MB

are most often not used, yet they consume a lot of memory and also extend
the time required to instantiate a worker.
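
The mechanism is a module-level __getattr__ (PEP 562, Python >= 3.7): the
first attribute access reads the JSON file and caches the result in the
module's globals, so every later access is a plain attribute lookup. A
minimal, self-contained sketch of the pattern (not the patch itself; reduced
to a single file for illustration):

    # sketch: lazy, cached JSON loading via module-level __getattr__ (PEP 562)
    import json
    from pathlib import Path

    data_dir = Path(__file__).parent
    NAME_TO_JSON_FILE = {'CURRENCIES': 'currencies.json'}

    def __getattr__(name: str):
        # called only when `name` is not (yet) a module global
        filename = NAME_TO_JSON_FILE.get(name)
        if filename is None:
            raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
        with open(data_dir / filename, encoding='utf-8') as f:
            globals()[name] = json.load(f)  # cached: next lookup bypasses __getattr__
        return globals()[name]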

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2024-04-29 18:36:26 +02:00
parent e45a7cc063
commit 82fd0dac60
15 changed files with 73 additions and 48 deletions

View file

@@ -20,13 +20,20 @@ __all__ = [
 import json
 from pathlib import Path
 
+from searx import logger
+
 data_dir = Path(__file__).parent
+logger = logger.getChild('data')
 
-
-def _load(filename):
-    with open(data_dir / filename, encoding='utf-8') as f:
-        return json.load(f)
+CURRENCIES: dict
+USER_AGENTS: dict
+EXTERNAL_URLS: dict
+WIKIDATA_UNITS: dict
+EXTERNAL_BANGS: dict
+OSM_KEYS_TAGS: dict
+ENGINE_DESCRIPTIONS: dict
+ENGINE_TRAITS: dict
+LOCALES: dict
 
 
 def ahmia_blacklist_loader():
@@ -42,12 +49,27 @@ def ahmia_blacklist_loader():
         return f.read().split()
 
 
-CURRENCIES = _load('currencies.json')
-USER_AGENTS = _load('useragents.json')
-EXTERNAL_URLS = _load('external_urls.json')
-WIKIDATA_UNITS = _load('wikidata_units.json')
-EXTERNAL_BANGS = _load('external_bangs.json')
-OSM_KEYS_TAGS = _load('osm_keys_tags.json')
-ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
-ENGINE_TRAITS = _load('engine_traits.json')
-LOCALES = _load('locales.json')
+NAME_TO_JSON_FILE = {
+    'CURRENCIES': 'currencies.json',
+    'USER_AGENTS': 'useragents.json',
+    'EXTERNAL_URLS': 'external_urls.json',
+    'WIKIDATA_UNITS': 'wikidata_units.json',
+    'EXTERNAL_BANGS': 'external_bangs.json',
+    'OSM_KEYS_TAGS': 'osm_keys_tags.json',
+    'ENGINE_DESCRIPTIONS': 'engine_descriptions.json',
+    'ENGINE_TRAITS': 'engine_traits.json',
+    'LOCALES': 'locales.json',
+}
+
+
+def __getattr__(name: str):
+    # lazy load of JSON files ..
+    filename = NAME_TO_JSON_FILE.get(name)
+    if filename:
+        filename = data_dir / filename
+        logger.debug("init global %s from JSON file %s", name, filename)
+        with open(filename, encoding='utf-8') as f:
+            globals()[name] = json.load(f)
+        return globals()[name]
+    else:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
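
This is also why the call sites in the files below change: a plain
`from searx.data import CURRENCIES` resolves the attribute at import time,
which would read the JSON file immediately and defeat the laziness.
Consumers therefore import the module and defer the attribute access, e.g.
(taken from the currency processor hunk below):

    from searx import data  # cheap: no JSON file is read here

    def iso4217_to_name(iso4217, language):
        # currencies.json is read on first access to data.CURRENCIES, then cached
        return data.CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)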

View file

@@ -16,7 +16,7 @@ import types
 from typing import Dict, Literal, Iterable, Union, Callable, Optional, TYPE_CHECKING
 
 from searx import locales
-from searx.data import data_dir, ENGINE_TRAITS
+from searx import data
 
 if TYPE_CHECKING:
     from . import Engine
@@ -193,7 +193,7 @@ class EngineTraits:
 class EngineTraitsMap(Dict[str, EngineTraits]):
     """A python dictionary to map :class:`EngineTraits` by engine name."""
 
-    ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
+    ENGINE_TRAITS_FILE = (data.data_dir / 'engine_traits.json').resolve()
     """File with persistence of the :py:obj:`EngineTraitsMap`."""
 
     def save_data(self):
@@ -205,7 +205,7 @@ class EngineTraitsMap(Dict[str, EngineTraits]):
     def from_data(cls) -> 'EngineTraitsMap':
        """Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
        obj = cls()
-        for k, v in ENGINE_TRAITS.items():
+        for k, v in data.ENGINE_TRAITS.items():
            obj[k] = EngineTraits(**v)
        return obj

View file

@@ -37,9 +37,9 @@ from typing import List, Dict, Any, Optional
 from urllib.parse import quote
 from lxml import html
 
+from searx import data
 from searx.utils import extract_text, eval_xpath, eval_xpath_list
 from searx.enginelib.traits import EngineTraits
-from searx.data import ENGINE_TRAITS
 
 # about
 about: Dict[str, Any] = {
@@ -86,7 +86,7 @@ aa_ext: str = ''
 def init(engine_settings=None):  # pylint: disable=unused-argument
     """Check of engine's settings."""
 
-    traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
+    traits = EngineTraits(**data.ENGINE_TRAITS['annas archive'])
 
     if aa_content and aa_content not in traits.custom['content']:
         raise ValueError(f'invalid setting content: {aa_content}')

View file

@@ -12,6 +12,7 @@ import babel
 import lxml.html
 
 from searx import (
+    data,
     locales,
     redislib,
     external_bang,
@@ -230,7 +231,7 @@ def quote_ddg_bangs(query):
     for val in re.split(r'(\s+)', query):
         if not val.strip():
             continue
-        if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
+        if val.startswith('!') and external_bang.get_node(data.EXTERNAL_BANGS, val[1:]):
             val = f"'{val}'"
         query_parts.append(val)
     return ' '.join(query_parts)

View file

@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlencode, urlparse, urljoin
 from lxml import html
 
-from searx.data import WIKIDATA_UNITS
+from searx import data
 from searx.utils import extract_text, html_to_text, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
@@ -238,7 +238,7 @@ def unit_to_str(unit):
     for prefix in WIKIDATA_PREFIX:
         if unit.startswith(prefix):
             wikidata_entity = unit[len(prefix) :]
-            real_unit = WIKIDATA_UNITS.get(wikidata_entity)
+            real_unit = data.WIKIDATA_UNITS.get(wikidata_entity)
             if real_unit is None:
                 return unit
             return real_unit['symbol']

View file

@@ -10,7 +10,7 @@ from functools import partial
 
 from flask_babel import gettext
 
-from searx.data import OSM_KEYS_TAGS, CURRENCIES
+from searx import data as searx_data
 from searx.utils import searx_useragent
 from searx.external_urls import get_external_url
 from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@@ -435,7 +435,7 @@ def get_label(labels, lang):
 def get_tag_label(tag_category, tag_name, lang):
     """Get tag label from OSM_KEYS_TAGS"""
     tag_name = '' if tag_name is None else tag_name
-    tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
+    tag_labels = searx_data.OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
     return get_label(tag_labels, lang)
@@ -449,12 +449,12 @@ def get_key_label(key_name, lang):
         # https://taginfo.openstreetmap.org/keys/currency#values
         currency = key_name.split(':')
         if len(currency) > 1:
-            o = CURRENCIES['iso4217'].get(currency[1])
+            o = searx_data.CURRENCIES['iso4217'].get(currency[1])
             if o:
                 return get_label(o, lang).lower()
             return currency[1]
 
-    labels = OSM_KEYS_TAGS['keys']
+    labels = searx_data.OSM_KEYS_TAGS['keys']
     for k in key_name.split(':') + ['*']:
         labels = labels.get(k)
         if labels is None:

View file

@@ -13,7 +13,7 @@ from json import loads
 from dateutil.parser import isoparse
 from babel.dates import format_datetime, format_date, format_time, get_datetime_format
 
-from searx.data import WIKIDATA_UNITS
+from searx import data
 from searx.network import post, get
 from searx.utils import searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
@@ -762,7 +762,7 @@ def debug_explain_wikidata_query(query, method='GET'):
 def init(engine_settings=None):  # pylint: disable=unused-argument
 
     # WIKIDATA_PROPERTIES : add unit symbols
-    WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)
+    WIKIDATA_PROPERTIES.update(data.WIKIDATA_UNITS)
 
     # WIKIDATA_PROPERTIES : add property labels
     wikidata_property_names = []

View file

@@ -40,9 +40,9 @@ from urllib.parse import quote
 from lxml import html
 from flask_babel import gettext
 
+from searx import data
 from searx.utils import extract_text, eval_xpath, eval_xpath_list
 from searx.enginelib.traits import EngineTraits
-from searx.data import ENGINE_TRAITS
 
 if TYPE_CHECKING:
     import httpx
@@ -80,7 +80,7 @@ zlib_ext: str = ""
 def init(engine_settings=None) -> None:  # pylint: disable=unused-argument
     """Check of engine's settings."""
 
-    traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])
+    traits: EngineTraits = EngineTraits(**data.ENGINE_TRAITS["z-library"])
 
     if zlib_ext and zlib_ext not in traits.custom["ext"]:
         raise ValueError(f"invalid setting ext: {zlib_ext}")

View file

@@ -2,7 +2,7 @@
 # pylint: disable=missing-module-docstring
 
 from urllib.parse import quote_plus, urlparse
-from searx.data import EXTERNAL_BANGS
+from searx import data
 
 LEAF_KEY = chr(16)
@@ -56,7 +56,7 @@ def resolve_bang_definition(bang_definition, query):
 def get_bang_definition_and_autocomplete(bang, external_bangs_db=None):  # pylint: disable=invalid-name
     if external_bangs_db is None:
-        external_bangs_db = EXTERNAL_BANGS
+        external_bangs_db = data.EXTERNAL_BANGS
 
     bang_definition, bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang)
@@ -90,7 +90,7 @@ def get_bang_url(search_query, external_bangs_db=None):
     ret_val = None
 
     if external_bangs_db is None:
-        external_bangs_db = EXTERNAL_BANGS
+        external_bangs_db = data.EXTERNAL_BANGS
 
     if search_query.external_bang:
         bang_definition, _ = get_bang_definition_and_ac(external_bangs_db, search_query.external_bang)

View file

@@ -3,7 +3,7 @@
 
 import math
 
-from searx.data import EXTERNAL_URLS
+from searx import data
 
 IMDB_PREFIX_TO_URL_ID = {
@@ -43,7 +43,7 @@ def get_external_url(url_id, item_id, alternative="default"):
     elif url_id == 'wikimedia_image':
         item_id = get_wikimedia_image_id(item_id)
 
-    url_description = EXTERNAL_URLS.get(url_id)
+    url_description = data.EXTERNAL_URLS.get(url_id)
     if url_description:
         url_template = url_description["urls"].get(alternative)
         if url_template is not None:

View file

@@ -2,7 +2,7 @@
 # pylint: disable=missing-module-docstring
 
 from hashlib import md5
-from searx.data import ahmia_blacklist_loader
+from searx import data
 
 name = "Ahmia blacklist"
 description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)"
@@ -24,5 +24,5 @@ def init(_app, settings):
     if not settings['outgoing']['using_tor_proxy']:
         # disable the plugin
         return False
-    ahmia_blacklist = ahmia_blacklist_loader()
+    ahmia_blacklist = data.ahmia_blacklist_loader()
     return True

View file

@@ -4,7 +4,7 @@
 
 from flask_babel import gettext
 
-from searx.data import WIKIDATA_UNITS
+from searx import data
 
 name = "Unit converter plugin"
 description = gettext("Convert between units")
@@ -38,7 +38,7 @@ def _parse_text_and_convert(search, splitted_query):
     from_unit = None
     to_unit = None
 
-    for unit in WIKIDATA_UNITS.values():
+    for unit in data.WIKIDATA_UNITS.values():
         if unit['symbol'] == from_unit_key:
             from_unit = unit

View file

@@ -6,7 +6,7 @@
 import unicodedata
 import re
 
-from searx.data import CURRENCIES
+from searx import data
 from .online import OnlineProcessor
 
 parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
@@ -20,14 +20,14 @@ def normalize_name(name):
 
 def name_to_iso4217(name):
     name = normalize_name(name)
-    currency = CURRENCIES['names'].get(name, [name])
+    currency = data.CURRENCIES['names'].get(name, [name])
     if isinstance(currency, str):
         return currency
     return currency[0]
 
 
 def iso4217_to_name(iso4217, language):
-    return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
+    return data.CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
 
 
 class OnlineCurrencyProcessor(OnlineProcessor):
class OnlineCurrencyProcessor(OnlineProcessor): class OnlineCurrencyProcessor(OnlineProcessor):

View file

@@ -21,7 +21,7 @@ from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
 
 from searx import settings
-from searx.data import USER_AGENTS, data_dir
+from searx import data as searx_data
 from searx.version import VERSION_TAG
 from searx.sxng_locales import sxng_locales
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -81,7 +81,9 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
 
     See searx/data/useragents.json
     """
-    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
+    return searx_data.USER_AGENTS['ua'].format(
+        os=os_string or choice(searx_data.USER_AGENTS['os']), version=choice(searx_data.USER_AGENTS['versions'])
+    )
 
 
 class _HTMLTextExtractorException(Exception):
@@ -600,7 +602,7 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":  # type: ignore
 
         # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
         fasttext.FastText.eprint = lambda x: None
-        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+        _FASTTEXT_MODEL = fasttext.load_model(str(searx_data.data_dir / 'lid.176.ftz'))
     return _FASTTEXT_MODEL

View file

@@ -58,7 +58,7 @@ from searx import infopage
 from searx import limiter
 from searx.botdetection import link_token
 
-from searx.data import ENGINE_DESCRIPTIONS
+from searx import data
 from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
@@ -1102,14 +1102,14 @@ def image_proxy():
 @app.route('/engine_descriptions.json', methods=['GET'])
 def engine_descriptions():
     locale = get_locale().split('_')[0]
-    result = ENGINE_DESCRIPTIONS['en'].copy()
+    result = data.ENGINE_DESCRIPTIONS['en'].copy()
     if locale != 'en':
-        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
+        for engine, description in data.ENGINE_DESCRIPTIONS.get(locale, {}).items():
             result[engine] = description
     for engine, description in result.items():
         if len(description) == 2 and description[1] == 'ref':
             ref_engine, ref_lang = description[0].split(':')
-            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
+            description = data.ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
             if isinstance(description, str):
                 description = [description, 'wikipedia']
             result[engine] = description