diff --git a/docs/admin/settings/settings_search.rst b/docs/admin/settings/settings_search.rst index eb63ab684..860a94af9 100644 --- a/docs/admin/settings/settings_search.rst +++ b/docs/admin/settings/settings_search.rst @@ -43,7 +43,8 @@ - ``wikipedia`` ``favicon_resolver``: - Favicon resolver, leave blank to turn off the feature by default. + :ref:`Favicon resolver <favicons>`, leave blank to turn off the feature by + default. - ``allesedv`` - ``duckduckgo`` diff --git a/docs/conf.py b/docs/conf.py index fec9eb64b..27881a4e0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -127,6 +127,7 @@ extensions = [ "sphinx_tabs.tabs", # https://github.com/djungelorm/sphinx-tabs 'myst_parser', # https://www.sphinx-doc.org/en/master/usage/markdown.html 'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page + 'sphinxcontrib.autodoc_pydantic', # https://github.com/mansenfranzen/autodoc_pydantic ] autodoc_default_options = { diff --git a/docs/src/searx.favicons.rst b/docs/src/searx.favicons.rst new file mode 100644 index 000000000..6b98d5b8e --- /dev/null +++ b/docs/src/searx.favicons.rst @@ -0,0 +1,48 @@ +.. _favicons: + +======== +Favicons +======== + +.. contents:: + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.favicons + :members: + +.. _favicons.config: + +Favicons Config +=============== + +.. automodule:: searx.favicons.config + :members: + +.. _favicons.proxy: + +Favicons Proxy +============== + +.. automodule:: searx.favicons.proxy + :members: + +.. _favicons.resolver: + +Favicons Resolver +================= + +.. automodule:: searx.favicons.resolvers + :members: + +.. _favicons.cache: + +Favicons Cache +============== + +.. 
automodule:: searx.favicons.cache + :members: + + + diff --git a/requirements-dev.txt b/requirements-dev.txt index 1ff5920cc..ee12666f8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,3 +21,4 @@ wlc==1.15 coloredlogs==15.0.1 docutils>=0.21.2 parameterized==0.9.0 +autodoc_pydantic==2.2.0 diff --git a/requirements.txt b/requirements.txt index 9e6b515fa..94fc2090f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,6 @@ redis==5.0.8 markdown-it-py==3.0.0 fasttext-predict==0.9.2.2 pytomlpp==1.0.13; python_version < '3.11' +pydantic==2.8.2 +eval_type_backport; python_version < '3.9' +typer-slim==0.12.5 diff --git a/searx/compat.py b/searx/compat.py new file mode 100644 index 000000000..035726469 --- /dev/null +++ b/searx/compat.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Compatibility with older versions""" + +# pylint: disable=unused-import + +__all__ = [ + "tomllib", +] + +import sys + +# TOML (lib) compatibility +# ------------------------ + +if sys.version_info >= (3, 11): + import tomllib +else: + import tomli as tomllib diff --git a/searx/favicon_resolver.py b/searx/favicon_resolver.py deleted file mode 100644 index d292d4ce7..000000000 --- a/searx/favicon_resolver.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -"""This module implements functions needed for the favicon resolver. 
- -""" -# pylint: disable=use-dict-literal - -from httpx import HTTPError - -from searx import settings - -from searx.network import get as http_get, post as http_post -from searx.exceptions import SearxEngineResponseException - - -def update_kwargs(**kwargs): - if 'timeout' not in kwargs: - kwargs['timeout'] = settings['outgoing']['request_timeout'] - kwargs['raise_for_httperror'] = False - - -def get(*args, **kwargs): - update_kwargs(**kwargs) - return http_get(*args, **kwargs) - - -def post(*args, **kwargs): - update_kwargs(**kwargs) - return http_post(*args, **kwargs) - - -def allesedv(domain): - """Favicon Resolver from allesedv.com""" - - url = 'https://f1.allesedv.com/32/{domain}' - - # will just return a 200 regardless of the favicon existing or not - # sometimes will be correct size, sometimes not - response = get(url.format(domain=domain)) - - # returns image/gif if the favicon does not exist - if response.headers['Content-Type'] == 'image/gif': - return [] - - return response.content - - -def duckduckgo(domain): - """Favicon Resolver from duckduckgo.com""" - - url = 'https://icons.duckduckgo.com/ip2/{domain}.ico' - - # will return a 404 if the favicon does not exist and a 200 if it does, - response = get(url.format(domain=domain)) - - # api will respond with a 32x32 png image - if response.status_code == 200: - return response.content - return [] - - -def google(domain): - """Favicon Resolver from google.com""" - - url = 'https://www.google.com/s2/favicons?sz=32&domain={domain}' - - # will return a 404 if the favicon does not exist and a 200 if it does, - response = get(url.format(domain=domain)) - - # api will respond with a 32x32 png image - if response.status_code == 200: - return response.content - return [] - - -def yandex(domain): - """Favicon Resolver from yandex.com""" - - url = 'https://favicon.yandex.net/favicon/{domain}' - - # will always return 200 - response = get(url.format(domain=domain)) - - # api will respond with a 16x16 png image, if 
it doesn't exist, it will be a 1x1 png image (70 bytes) - if response.status_code == 200: - if len(response.content) > 70: - return response.content - return [] - - -backends = { - 'allesedv': allesedv, - 'duckduckgo': duckduckgo, - 'google': google, - 'yandex': yandex, -} - - -def search_favicon(backend_name, domain): - backend = backends.get(backend_name) - if backend is None: - return [] - try: - return backend(domain) - except (HTTPError, SearxEngineResponseException): - return [] diff --git a/searx/favicons/__init__.py b/searx/favicons/__init__.py new file mode 100644 index 000000000..2a9893932 --- /dev/null +++ b/searx/favicons/__init__.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementations for providing the favicons in SearXNG""" + +from __future__ import annotations + +__all__ = ["init", "favicon_url", "favicon_proxy"] + +import pathlib +from searx import logger +from searx import get_setting +from .proxy import favicon_url, favicon_proxy + +logger = logger.getChild('favicons') + + +def is_active(): + return bool(get_setting("search.favicon_resolver", False)) + + +def init(): + + # pylint: disable=import-outside-toplevel + + from . import config, cache, proxy + + cfg_file = pathlib.Path("/etc/searxng/favicons.toml") + if not cfg_file.exists(): + if is_active(): + logger.error(f"missing favicon config: {cfg_file}") + cfg_file = config.DEFAULT_CFG_TOML + + logger.debug(f"load favicon config: {cfg_file}") + cfg = config.FaviconConfig.from_toml_file(cfg_file, use_cache=True) + cache.init(cfg.cache) + proxy.init(cfg.proxy) + + del cache, config, proxy, cfg diff --git a/searx/favicons/__main__.py b/searx/favicons/__main__.py new file mode 100644 index 000000000..c515edfea --- /dev/null +++ b/searx/favicons/__main__.py @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Command line implementation""" + +import typer + +from . import cache +from . 
import init + +init() +app = typer.Typer() +app.add_typer(cache.app, name="cache", help="commands related to the cache") +app() diff --git a/searx/favicons/cache.py b/searx/favicons/cache.py new file mode 100644 index 000000000..4b8276154 --- /dev/null +++ b/searx/favicons/cache.py @@ -0,0 +1,476 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementations for caching favicons. + +:py:obj:`FaviconCacheConfig`: + Configuration of the favicon cache + +:py:obj:`FaviconCache`: + Abstract base class for the implementation of a favicon cache. + +:py:obj:`FaviconCacheSQLite`: + Favicon cache that manages the favicon BLOBs in a SQLite DB. + +:py:obj:`FaviconCacheNull`: + Fallback solution if the configured cache cannot be used for system reasons. + +---- + +""" + +from __future__ import annotations +from typing import Literal + +import abc +import dataclasses +import hashlib +import logging +import pathlib +import sqlite3 +import tempfile +import time +import typer + +from pydantic import BaseModel + +from searx import sqlitedb +from searx import logger +from searx.utils import humanize_bytes, humanize_number + +CACHE: "FaviconCache" +FALLBACK_ICON = b"FALLBACK_ICON" + +logger = logger.getChild('favicons.cache') +app = typer.Typer() + + +@app.command() +def state(): + """show state of the cache""" + print(CACHE.state().report()) + + +@app.command() +def maintenance(force: bool = True, debug: bool = False): + """perform maintenance of the cache""" + root_log = logging.getLogger() + if debug: + root_log.setLevel(logging.DEBUG) + else: + root_log.handlers = [] + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + + state_t0 = CACHE.state() + CACHE.maintenance(force=force) + state_t1 = CACHE.state() + state_delta = state_t0 - state_t1 + print("The cache has been reduced by:") + print(state_delta.report("\n- {descr}: {val}").lstrip("\n")) + + +def init(cfg: 
"FaviconCacheConfig"): + """Initialization of a global ``CACHE``""" + + global CACHE # pylint: disable=global-statement + if cfg.db_type == "sqlite": + if sqlite3.sqlite_version_info <= (3, 35): + logger.critical( + "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)", + sqlite3.sqlite_version, + ) + CACHE = FaviconCacheNull(cfg) + else: + CACHE = FaviconCacheSQLite(cfg) + elif cfg.db_type == "mem": + logger.error("Favicons are cached in memory, don't use this in production!") + CACHE = FaviconCacheMEM(cfg) + else: + raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown") + + +class FaviconCacheConfig(BaseModel): + """Configuration of the favicon cache.""" + + db_type: Literal["sqlite", "mem"] = "sqlite" + """Type of the database: + + ``sqlite``: + :py:obj:`.cache.FaviconCacheSQLite` + + ``mem``: + :py:obj:`.cache.FaviconCacheMEM` (not recommended) + """ + + db_url: pathlib.Path = pathlib.Path(tempfile.gettempdir()) / "faviconcache.db" + """URL of the SQLite DB, the path to the database file.""" + + HOLD_TIME: int = 60 * 60 * 24 * 30 # 30 days + """Hold time (default in sec.), after which a BLOB is removed from the cache.""" + + LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50 # 50 MB + """Maximum of bytes (default) stored in the cache of all blobs. Note: The + limit is only reached at each maintenance interval after which the oldest + BLOBs are deleted; the limit is exceeded during the maintenance period. If + the maintenance period is *too long* or maintenance is switched off + completely, the cache grows uncontrollably.""" + + BLOB_MAX_BYTES: int = 1024 * 20 # 20 KB + """The maximum BLOB size in bytes that a favicon may have so that it can be + saved in the cache. 
If the favicon is larger, it is not saved in the cache + and must be requested by the client via the proxy.""" + + MAINTENANCE_PERIOD: int = 60 * 60 + """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to + ``auto``.""" + + MAINTENANCE_MODE: Literal["auto", "off"] = "auto" + """Type of maintenance mode + + ``auto``: + Maintenance is carried out automatically as part of the maintenance + intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required. + + ``off``: + Maintenance is switched off and must be carried out by an external process + if required. + """ + + +@dataclasses.dataclass +class FaviconCacheStats: + """Dataclass which provides information on the status of the cache.""" + + favicons: int | None = None + bytes: int | None = None + domains: int | None = None + resolvers: int | None = None + + field_descr = ( + ("favicons", "number of favicons in cache", humanize_number), + ("bytes", "total size (approx. bytes) of cache", humanize_bytes), + ("domains", "total number of domains in cache", humanize_number), + ("resolvers", "number of resolvers", str), + ) + + def __sub__(self, other) -> FaviconCacheStats: + if not isinstance(other, self.__class__): + raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'") + kwargs = {} + for field, _, _ in self.field_descr: + self_val, other_val = getattr(self, field), getattr(other, field) + if None in (self_val, other_val): + continue + if isinstance(self_val, int): + kwargs[field] = self_val - other_val + else: + kwargs[field] = self_val + return self.__class__(**kwargs) + + def report(self, fmt: str = "{descr}: {val}\n"): + s = [] + for field, descr, cast in self.field_descr: + val = getattr(self, field) + if val is None: + val = "--" + else: + val = cast(val) + s.append(fmt.format(descr=descr, val=val)) + return "".join(s) + + +class FaviconCache(abc.ABC): + """Abstract base class for the implementation of a favicon cache.""" + + @abc.abstractmethod 
+ def __init__(self, cfg: FaviconCacheConfig): + """An instance of the favicon cache is build up from the configuration.""" + + @abc.abstractmethod + def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]: + """Returns ``None`` or the tuple of ``(data, mime)`` that has been + registered in the cache. The ``None`` indicates that there was no entry + in the cache.""" + + @abc.abstractmethod + def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool: + """Set data and mime-type in the cache. If data is None, the + :py:obj:`FALLBACK_ICON` is registered. in the cache.""" + + @abc.abstractmethod + def state(self) -> FaviconCacheStats: + """Returns a :py:obj:`FaviconCacheStats` (key/values) with information + on the state of the cache.""" + + @abc.abstractmethod + def maintenance(self, force=False): + """Performs maintenance on the cache""" + + +class FaviconCacheNull(FaviconCache): + """A dummy favicon cache that caches nothing / a fallback solution. The + NullCache is used when more efficient caches such as the + :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite + library is only available in an old version and does not meet the + requirements.""" + + def __init__(self, cfg: FaviconCacheConfig): + return None + + def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]: + return None + + def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool: + return False + + def state(self): + return FaviconCacheStats(favicons=0) + + def maintenance(self, force=False): + pass + + +class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache): + """Favicon cache that manages the favicon BLOBs in a SQLite DB. The DB + model in the SQLite DB is implemented using the abstract class + :py:obj:`sqlitedb.SQLiteAppl`. 
+ + The following configurations are required / supported: + + - :py:obj:`FaviconCacheConfig.db_url` + - :py:obj:`FaviconCacheConfig.HOLD_TIME` + - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES` + - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES` + - :py:obj:`MAINTENANCE_PERIOD` + - :py:obj:`MAINTENANCE_MODE` + """ + + DB_SCHEMA = 1 + + DDL_BLOBS = """\ +CREATE TABLE IF NOT EXISTS blobs ( + sha256 TEXT, + bytes_c INTEGER, + mime TEXT NOT NULL, + data BLOB NOT NULL, + PRIMARY KEY (sha256))""" + + """Table to store BLOB objects by their sha256 hash values.""" + + DDL_BLOB_MAP = """\ +CREATE TABLE IF NOT EXISTS blob_map ( + m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec. + sha256 TEXT, + resolver TEXT, + authority TEXT, + PRIMARY KEY (resolver, authority))""" + + """Table to map from (resolver, authority) to sha256 hash values.""" + + DDL_CREATE_TABLES = { + "blobs": DDL_BLOBS, + "blob_map": DDL_BLOB_MAP, + } + + SQL_DROP_LEFTOVER_BLOBS = ( + "DELETE FROM blobs WHERE sha256 IN (" + " SELECT b.sha256" + " FROM blobs b" + " LEFT JOIN blob_map bm" + " ON b.sha256 = bm.sha256" + " WHERE bm.sha256 IS NULL)" + ) + """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256.""" + + SQL_ITER_BLOBS_SHA256_BYTES_C = ( + "SELECT b.sha256, b.bytes_c FROM blobs b" + " JOIN blob_map bm " + " ON b.sha256 = bm.sha256" + " ORDER BY bm.m_time ASC" + ) + + SQL_INSERT_BLOBS = ( + "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)" + " ON CONFLICT (sha256) DO NOTHING" + ) # fmt: skip + + SQL_INSERT_BLOB_MAP = ( + "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)" + " ON CONFLICT DO UPDATE " + " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')" + ) + + def __init__(self, cfg: FaviconCacheConfig): + """An instance of the favicon cache is build up from the configuration.""" # + + if cfg.db_url == ":memory:": + logger.critical("don't use SQLite DB in :memory: in production!!") + super().__init__(cfg.db_url) + 
self.cfg = cfg + + def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]: + + sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?" + res = self.DB.execute(sql, (resolver, authority)).fetchone() + if res is None: + return None + + data, mime = (None, None) + sha256 = res[0] + if sha256 == FALLBACK_ICON: + return data, mime + + sql = "SELECT data, mime FROM blobs WHERE sha256 = ?" + res = self.DB.execute(sql, (sha256,)).fetchone() + if res is not None: + data, mime = res + return data, mime + + def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool: + + if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time: + # Should automatic maintenance be moved to a new thread? + self.maintenance() + + if data is not None and mime is None: + logger.error( + "favicon resolver %s tries to cache mime-type None for authority %s", + resolver, + authority, + ) + return False + + bytes_c = len(data or b"") + if bytes_c > self.cfg.BLOB_MAX_BYTES: + logger.info( + "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) " % (resolver, authority, bytes_c) + ) + return False + + if data is None: + sha256 = FALLBACK_ICON + else: + sha256 = hashlib.sha256(data).hexdigest() + + with self.connect() as conn: + if sha256 != FALLBACK_ICON: + conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data)) + conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority)) + + return True + + @property + def next_maintenance_time(self) -> int: + """Returns (unix epoch) time of the next maintenance.""" + + return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE") + + def maintenance(self, force=False): + + # Prevent parallel DB maintenance cycles from other DB connections + # (e.g. in multi thread or process environments). 
+ + if not force and int(time.time()) < self.next_maintenance_time: + logger.debug("no maintenance required yet, next maintenance interval is in the future") + return + self.properties.set("LAST_MAINTENANCE", "") # hint: this (also) sets the m_time of the property! + + # do maintenance tasks + + with self.connect() as conn: + + # drop items not in HOLD time + res = conn.execute( + f"DELETE FROM blob_map" + f" WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - {self.cfg.HOLD_TIME}" + ) + logger.debug("dropped %s obsolete blob_map items from db", res.rowcount) + res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS) + logger.debug("dropped %s obsolete BLOBS from db", res.rowcount) + + # drop old items to be in LIMIT_TOTAL_BYTES + total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0 + if total_bytes > self.cfg.LIMIT_TOTAL_BYTES: + + x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES + c = 0 + sha_list = [] + for row in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C): + sha256, bytes_c = row + sha_list.append(sha256) + c += bytes_c + if c > x: + break + if sha_list: + conn.execute("DELETE FROM blobs WHERE sha256 IN ('%s')" % "','".join(sha_list)) + conn.execute("DELETE FROM blob_map WHERE sha256 IN ('%s')" % "','".join(sha_list)) + logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c) + + def _query_val(self, sql, default=None): + val = self.DB.execute(sql).fetchone() + if val is not None: + val = val[0] + if val is None: + val = default + return val + + def state(self) -> FaviconCacheStats: + return FaviconCacheStats( + favicons=self._query_val("SELECT count(*) FROM blobs", 0), + bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0), + domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0), + resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0), + ) + + +class FaviconCacheMEM(FaviconCache): + """Favicon 
cache in process' memory. Its just a POC that stores the + favicons in the memory of the process. + + .. attention:: + + Don't use it in production, it will blow up your memory!! + + """ + + def __init__(self, cfg): + + self.cfg = cfg + self._data = {} + self._sha_mime = {} + + def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]: + + sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None)) + if sha is None: + return None + data = self._data.get(sha) + if data == FALLBACK_ICON: + data = None + return data, mime + + def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool: + + if data is None: + data = FALLBACK_ICON + mime = None + + elif mime is None: + logger.error( + "favicon resolver %s tries to cache mime-type None for authority %s", + resolver, + authority, + ) + return False + + digest = hashlib.sha256(data).hexdigest() + self._data[digest] = data + self._sha_mime[f"{resolver}:{authority}"] = (digest, mime) + return True + + def state(self): + return FaviconCacheStats(favicons=len(self._data.keys())) + + def maintenance(self, force=False): + pass diff --git a/searx/favicons/config.py b/searx/favicons/config.py new file mode 100644 index 000000000..1c18b1631 --- /dev/null +++ b/searx/favicons/config.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pylint: disable=missing-module-docstring + +from __future__ import annotations + +import pathlib +from pydantic import BaseModel + +from searx.compat import tomllib +from .cache import FaviconCacheConfig +from .proxy import FaviconProxyConfig + +CONFIG_SCHEMA: int = 1 +"""Version of the configuration schema.""" + +TOML_CACHE: dict[str, "FaviconConfig"] = {} +"""Cache config objects by TOML's filename.""" + +DEFAULT_CFG_TOML = pathlib.Path(__file__).parent / "favicons.toml" + + +class FaviconConfig(BaseModel): + """The class aggregates configurations of the favicon tools""" + + cfg_schema: int + """Config's 
schema version. The specification of the version of the schema + is mandatory, currently only version :py:obj:`CONFIG_SCHEMA` is supported. + By specifying a version, it is possible to ensure downward compatibility in + the event of future changes to the configuration schema""" + + cache: FaviconCacheConfig = FaviconCacheConfig() + """Setup of the :py:obj:`.cache.FaviconCacheConfig`.""" + + proxy: FaviconProxyConfig = FaviconProxyConfig() + """Setup of the :py:obj:`.proxy.FaviconProxyConfig`.""" + + @classmethod + def from_toml_file(cls, cfg_file: pathlib.Path, use_cache: bool) -> "FaviconConfig": + """Create a config object from a TOML file, the ``use_cache`` argument + specifies whether :py:obj:`TOML_CACHE` is read and updated. Raises ``ValueError`` on a schema mismatch. + """ + + cached = TOML_CACHE.get(str(cfg_file.resolve())) + if use_cache and cached: + return cached + + with cfg_file.open("rb") as f: + + cfg = tomllib.load(f) + cfg = cfg.get("favicons", cfg) + + schema = cfg.get("cfg_schema") + if schema != CONFIG_SCHEMA: + raise ValueError( + f"config schema version {CONFIG_SCHEMA} is needed, version {schema} is given in {cfg_file}" + ) + + cfg = cls(**cfg) + if use_cache: # store under the same (resolved) key that is used for the lookup + TOML_CACHE[str(cfg_file.resolve())] = cfg + + return cfg diff --git a/searx/favicons/favicons.toml b/searx/favicons/favicons.toml new file mode 100644 index 000000000..0e433d3aa --- /dev/null +++ b/searx/favicons/favicons.toml @@ -0,0 +1,25 @@ +[favicons] + +cfg_schema = 1 # config's schema version no. + +[favicons.proxy] + +# max_age = 5184000 # 60 days / default: 7 days (604800 sec) + +# [favicons.proxy.resolver_map] +# +# The available favicon resolvers are registered here. 
+# +# "duckduckgo" = "searx.favicons.resolvers.duckduckgo" +# "allesedv" = "searx.favicons.resolvers.allesedv" +# "google" = "searx.favicons.resolvers.google" +# "yandex" = "searx.favicons.resolvers.yandex" + +[favicons.cache] + +# db_url = "/var/cache/searxng/faviconcache.db" # default: "/tmp/faviconcache.db" +# HOLD_TIME = 5184000 # 60 days / default: 30 days +# LIMIT_TOTAL_BYTES = 2147483648 # 2 GB / default: 50 MB +# BLOB_MAX_BYTES = 40960 # 40 KB / default 20 KB +# MAINTENANCE_MODE = "off" # default: "auto" +# MAINTENANCE_PERIOD = 600 # 10min / default: 1h \ No newline at end of file diff --git a/searx/favicons/proxy.py b/searx/favicons/proxy.py new file mode 100644 index 000000000..8cefe6c59 --- /dev/null +++ b/searx/favicons/proxy.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementations for a favicon proxy""" + +from __future__ import annotations + +from typing import Callable + +import importlib +import base64 +import pathlib +import urllib.parse + +import flask +from httpx import HTTPError +from pydantic import BaseModel + +from searx import get_setting + +from searx.webutils import new_hmac, is_hmac_of +from searx.exceptions import SearxEngineResponseException + +from .resolvers import DEFAULT_RESOLVER_MAP +from . import cache + +DEFAULT_FAVICON_URL = {} +CFG: FaviconProxyConfig = None # type: ignore + + +def init(cfg: FaviconProxyConfig): + global CFG # pylint: disable=global-statement + CFG = cfg + + +def _initial_resolver_map(): + d = {} + name: str = get_setting("search.favicon_resolver", None) # type: ignore + if name: + func = DEFAULT_RESOLVER_MAP.get(name) + if func: + d = {name: f"searx.favicons.resolvers.{func.__name__}"} + return d + + +class FaviconProxyConfig(BaseModel): + """Configuration of the favicon proxy.""" + + max_age: int = 60 * 60 * 24 * 7 # seven days + """HTTP header Cache-Control_ ``max-age`` + + .. 
_Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control + """ + + secret_key: str = get_setting("server.secret_key") # type: ignore + """By default, the value from :ref:`server.secret_key ` + setting is used.""" + + resolver_timeout: int = get_setting("outgoing.request_timeout") # type: ignore + """Timeout which the resolvers should not exceed, is usually passed to the + outgoing request of the resolver. By default, the value from + :ref:`outgoing.request_timeout ` setting is used.""" + + resolver_map: dict[str, str] = _initial_resolver_map() + """The resolver_map is a key / value dictionary where the key is the name of + the resolver and the value is the fully qualifying name (fqn) of resolver's + function (the callable). The resolvers from the python module + :py:obj:`searx.favicons.resolver` are available by default.""" + + def get_resolver(self, name: str) -> Callable | None: + """Returns the callable object (function) of the resolver with the + ``name``. If no resolver is registered for the ``name``, ``None`` is + returned. 
+ fqn = self.resolver_map.get(name) + if fqn is None: + return None + mod_name, _, func_name = fqn.rpartition('.') + mod = importlib.import_module(mod_name) + func = getattr(mod, func_name, None) # default None, so a missing function reaches the ValueError below + if func is None: + raise ValueError(f"resolver {fqn} is not implemented") + return func + + favicon_path: str = get_setting("ui.static_path") + "/themes/{theme}/img/empty_favicon.svg" # type: ignore + favicon_mime_type: str = "image/svg+xml" + + def favicon(self, **replacements): + """Returns pathname and mimetype of the default favicon.""" + return ( + pathlib.Path(self.favicon_path.format(**replacements)), + self.favicon_mime_type, + ) + + def favicon_data_url(self, **replacements): + """Returns data image URL of the default favicon.""" + + cache_key = ", ".join(f"{x}:{replacements[x]}" for x in sorted(list(replacements.keys()), key=str)) + data_url = DEFAULT_FAVICON_URL.get(cache_key) + if data_url is not None: + return data_url + + fav, mimetype = self.favicon(**replacements) + # hint: encoding utf-8 limits favicons to be a SVG image + with fav.open("r", encoding="utf-8") as f: + data_url = f.read() + + data_url = urllib.parse.quote(data_url) + data_url = f"data:{mimetype};utf8,{data_url}" + DEFAULT_FAVICON_URL[cache_key] = data_url + return data_url + + +def favicon_proxy(): + """REST API of SearXNG's favicon proxy service + + :: + + /favicon_proxy?authority=<...>&h=<...> + + ``authority``: + Domain name :rfc:`3986` / see :py:obj:`favicon_url` + + ``h``: + HMAC :rfc:`2104`, build up from the :ref:`server.secret_key ` setting. 
+ + """ + authority = flask.request.args.get('authority') + + # malformed request or RFC 3986 authority + if not authority or "/" in authority: + return '', 400 + + # malformed request / does not have authorisation + if not is_hmac_of( + CFG.secret_key, + authority.encode(), + flask.request.args.get('h', ''), + ): + return '', 400 + + resolver = flask.request.preferences.get_value('favicon_resolver') # type: ignore + # if resolver is empty or not valid, just return HTTP 400. + if not resolver or resolver not in CFG.resolver_map.keys(): + return "", 400 + + data, mime = search_favicon(resolver, authority) + + if data is not None and mime is not None: + resp = flask.Response(data, mimetype=mime) # type: ignore + resp.headers['Cache-Control'] = f"max-age={CFG.max_age}" + return resp + + # return default favicon from static path + theme = flask.request.preferences.get_value("theme") # type: ignore + fav, mimetype = CFG.favicon(theme=theme) + return flask.send_from_directory(fav.parent, fav.name, mimetype=mimetype) + + +def search_favicon(resolver: str, authority: str) -> tuple[None | bytes, None | str]: + """Sends the request to the favicon resolver and returns a tuple for the + favicon. The tuple consists of ``(data, mime)``, if the resolver has not + determined a favicon, both values are ``None``. + + ``data``: + Binary data of the favicon. + + ``mime``: + Mime type of the favicon. 
+ + """ + + data, mime = (None, None) + + func = CFG.get_resolver(resolver) + if func is None: + return data, mime + + # to avoid superfluous requests to the resolver, first look in the cache + data_mime = cache.CACHE(resolver, authority) + if data_mime is not None: + return data_mime + + try: + data, mime = func(authority, timeout=CFG.resolver_timeout) + if data is None or mime is None: + data, mime = (None, None) + + except (HTTPError, SearxEngineResponseException): + pass + + cache.CACHE.set(resolver, authority, mime, data) + return data, mime + + +def favicon_url(authority: str) -> str: + """Function to generate the image URL used for favicons in SearXNG's result + lists. The ``authority`` argument (aka netloc / :rfc:`3986`) is usually a + (sub-) domain name. This function is used in the HTML (jinja) templates. + + .. code:: html + +
+ +
+ + The returned URL is a route to :py:obj:`favicon_proxy` REST API. + + If the favicon is already in the cache, the returned URL is a `data URL`_ + (something like ``data:image/png;base64,...``). By generating a data url from + the :py:obj:`.cache.FaviconCache`, additional HTTP roundtripps via the + :py:obj:`favicon_proxy` are saved. However, it must also be borne in mind + that data urls are not cached in the client (web browser). + + .. _data URL: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs + + """ + + resolver = flask.request.preferences.get_value('favicon_resolver') # type: ignore + # if resolver is empty or not valid, just return nothing. + if not resolver or resolver not in CFG.resolver_map.keys(): + return "" + + data_mime = cache.CACHE(resolver, authority) + + if data_mime == (None, None): + # we have already checked, the resolver does not have a favicon + theme = flask.request.preferences.get_value("theme") # type: ignore + return CFG.favicon_data_url(theme=theme) + + if data_mime is not None: + data, mime = data_mime + return f"data:{mime};base64,{str(base64.b64encode(data), 'utf-8')}" # type: ignore + + h = new_hmac(CFG.secret_key, authority.encode()) + proxy_url = flask.url_for('favicon_proxy') + query = urllib.parse.urlencode({"authority": authority, "h": h}) + return f"{proxy_url}?{query}" diff --git a/searx/favicons/resolvers.py b/searx/favicons/resolvers.py new file mode 100644 index 000000000..bde5ae2b8 --- /dev/null +++ b/searx/favicons/resolvers.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementations of the favicon *resolvers* that are available in the favicon +proxy by default. A *resolver* is a function that obtains the favicon from an +external source. The *resolver* function receives two arguments (``domain, +timeout``) and returns a tuple ``(data, mime)``. 
+ +""" + +from __future__ import annotations + +__all__ = ["DEFAULT_RESOLVER_MAP", "allesedv", "duckduckgo", "google", "yandex"] + +from typing import Callable +from searx import network +from searx import logger + +DEFAULT_RESOLVER_MAP: dict[str, Callable] +logger = logger.getChild('favicons.resolvers') + + +def _req_args(**kwargs): + # add the request arguments from the searx.network + d = {"raise_for_httperror": False} + d.update(kwargs) + return d + + +def allesedv(domain: str, timeout: int) -> tuple[None | bytes, None | str]: + """Favicon Resolver from allesedv.com / https://favicon.allesedv.com/""" + data, mime = (None, None) + url = f"https://f1.allesedv.com/32/{domain}" + logger.debug("fetch favicon from: %s", url) + + # will just return a 200 regardless of the favicon existing or not + # sometimes will be correct size, sometimes not + response = network.get(url, **_req_args(timeout=timeout)) + if response and response.status_code == 200: + mime = response.headers['Content-Type'] + if mime != 'image/gif': + data = response.content + return data, mime + + +def duckduckgo(domain: str, timeout: int) -> tuple[None | bytes, None | str]: + """Favicon Resolver from duckduckgo.com / https://blog.jim-nielsen.com/2021/displaying-favicons-for-any-domain/""" + data, mime = (None, None) + url = f"https://icons.duckduckgo.com/ip2/{domain}.ico" + logger.debug("fetch favicon from: %s", url) + + # will return a 404 if the favicon does not exist and a 200 if it does, + response = network.get(url, **_req_args(timeout=timeout)) + if response and response.status_code == 200: + # api will respond with a 32x32 png image + mime = response.headers['Content-Type'] + data = response.content + return data, mime + + +def google(domain: str, timeout: int) -> tuple[None | bytes, None | str]: + """Favicon Resolver from google.com""" + data, mime = (None, None) + + # URL https://www.google.com/s2/favicons?sz=32&domain={domain}" will be + # redirected (HTTP 301 Moved Permanently) to 
t1.gstatic.com/faviconV2: + url = ( + f"https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL" + f"&url=https://{domain}&size=32" + ) + logger.debug("fetch favicon from: %s", url) + + # will return a 404 if the favicon does not exist and a 200 if it does, + response = network.get(url, **_req_args(timeout=timeout)) + if response and response.status_code == 200: + # api will respond with a 32x32 png image + mime = response.headers['Content-Type'] + data = response.content + return data, mime + + +def yandex(domain: str, timeout: int) -> tuple[None | bytes, None | str]: + """Favicon Resolver from yandex.com""" + data, mime = (None, None) + url = f"https://favicon.yandex.net/favicon/{domain}" + logger.debug("fetch favicon from: %s", url) + + # api will respond with a 16x16 png image, if it doesn't exist, it will be a + # 1x1 png image (70 bytes) + response = network.get(url, **_req_args(timeout=timeout)) + if response and response.status_code == 200 and len(response.content) > 70: + mime = response.headers['Content-Type'] + data = response.content + return data, mime + + +DEFAULT_RESOLVER_MAP = { + "allesedv": allesedv, + "duckduckgo": duckduckgo, + "google": google, + "yandex": yandex, +} diff --git a/searx/preferences.py b/searx/preferences.py index 92758efa6..18c3f08d8 100644 --- a/searx/preferences.py +++ b/searx/preferences.py @@ -13,7 +13,7 @@ from collections import OrderedDict import flask import babel -from searx import settings, autocomplete, favicon_resolver +from searx import settings, autocomplete, favicons from searx.enginelib import Engine from searx.plugins import Plugin from searx.locales import LOCALE_NAMES @@ -409,7 +409,7 @@ class Preferences: 'favicon_resolver': EnumStringSetting( settings['search']['favicon_resolver'], locked=is_locked('favicon_resolver'), - choices=list(favicon_resolver.backends.keys()) + [''] + choices=list(favicons.proxy.CFG.resolver_map.keys()) + [''] ), 'image_proxy': BooleanSetting( 
settings['server']['image_proxy'], diff --git a/searx/static/themes/simple/img/empty_favicon.svg b/searx/static/themes/simple/img/empty_favicon.svg index f4e3e334d..b65e09399 100644 --- a/searx/static/themes/simple/img/empty_favicon.svg +++ b/searx/static/themes/simple/img/empty_favicon.svg @@ -1,5 +1,4 @@ - - \ No newline at end of file + diff --git a/searx/templates/simple/macros.html b/searx/templates/simple/macros.html index 418f85227..858ee2d7e 100644 --- a/searx/templates/simple/macros.html +++ b/searx/templates/simple/macros.html @@ -23,12 +23,7 @@ {{- result_open_link(result.url, "url_wrapper") -}} {% if not rtl %} {%- if favicon_resolver != "" %} -
- {{ result.parsed_url.netloc }} -
+
{%- endif -%} {%- endif -%} {%- for part in get_pretty_url(result.parsed_url) -%} @@ -36,12 +31,7 @@ {%- endfor %} {% if rtl %} {%- if favicon_resolver != "" %} -
- {{ result.parsed_url.netloc }} -
+
{%- endif -%} {%- endif -%} {{- result_close_link() -}} diff --git a/searx/templates/simple/preferences/favicon.html b/searx/templates/simple/preferences/favicon.html index 207bf2a24..3159ddfc7 100644 --- a/searx/templates/simple/preferences/favicon.html +++ b/searx/templates/simple/preferences/favicon.html @@ -3,7 +3,7 @@
{{- '' -}}