[mod] Revision of the favicon solution

All favicons implementations have been documented and moved to the Python
package:

    searx.favicons

There is a configuration (based on Pydantic) for the favicons and all its
components:

    searx.favicons.config

A solution for caching favicons has been implemented:

    searx.favicons.cache

If the favicon is already in the cache, the returned URL is a data URL [1]
(something like `data:image/png;base64,...`).  By generating a data URL from
the FaviconCache, additional HTTP round trips via the favicon_proxy are saved:

    favicons.proxy.favicon_url

The favicon proxy service now sets an HTTP header "Cache-Control: max-age=...":

    favicons.proxy.favicon_proxy

The resolvers now also provide the mime type (data, mime):

    searx.favicons.resolvers

[1] https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2024-08-19 17:47:54 +02:00 committed by Markus Heiser
parent c49a2707c1
commit 7ab577a1fb
21 changed files with 1039 additions and 181 deletions

View file

@ -43,7 +43,8 @@
- ``wikipedia``
``favicon_resolver``:
Favicon resolver, leave blank to turn off the feature by default.
:ref:`Favicon resolver <favicons>`, leave blank to turn off the feature by
default.
- ``allesedv``
- ``duckduckgo``

View file

@ -127,6 +127,7 @@ extensions = [
"sphinx_tabs.tabs", # https://github.com/djungelorm/sphinx-tabs
'myst_parser', # https://www.sphinx-doc.org/en/master/usage/markdown.html
'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page
'sphinxcontrib.autodoc_pydantic', # https://github.com/mansenfranzen/autodoc_pydantic
]
autodoc_default_options = {

View file

@ -0,0 +1,48 @@
.. _favicons:
========
Favicons
========
.. contents::
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.favicons
:members:
.. _favicons.config:
Favicons Config
===============
.. automodule:: searx.favicons.config
:members:
.. _favicons.proxy:
Favicons Proxy
==============
.. automodule:: searx.favicons.proxy
:members:
.. _favicons.resolver:
Favicons Resolver
=================
.. automodule:: searx.favicons.resolvers
:members:
.. _favicons.cache:
Favicons Cache
==============
.. automodule:: searx.favicons.cache
:members:

View file

@ -21,3 +21,4 @@ wlc==1.15
coloredlogs==15.0.1
docutils>=0.21.2
parameterized==0.9.0
autodoc_pydantic==2.2.0

View file

@ -16,3 +16,6 @@ redis==5.0.8
markdown-it-py==3.0.0
fasttext-predict==0.9.2.2
pytomlpp==1.0.13; python_version < '3.11'
pydantic==2.8.2
eval_type_backport; python_version < '3.9'
typer-slim==0.12.5

18
searx/compat.py Normal file
View file

@ -0,0 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Compatibility with older versions"""
# pylint: disable=unused-import
__all__ = [
"tomllib",
]
import sys
# TOML (lib) compatibility
# ------------------------
if sys.version_info < (3, 11):
    # older interpreters: expose the ``tomli`` package under the name of the
    # stdlib module, so that callers can always import ``tomllib`` from here
    import tomli as tomllib
else:
    # Python 3.11 and newer ship the TOML parser in the standard library
    import tomllib

View file

@ -1,105 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements functions needed for the favicon resolver.
"""
# pylint: disable=use-dict-literal
from httpx import HTTPError
from searx import settings
from searx.network import get as http_get, post as http_post
from searx.exceptions import SearxEngineResponseException
def update_kwargs(**kwargs):
    """Return the keyword arguments completed with the resolver defaults.

    ``**kwargs`` is always a fresh dictionary private to this call, therefore
    the completed dictionary has to be *returned*; mutating it in place (as the
    previous implementation did) never reaches the caller.

    - ``timeout``: taken from ``settings['outgoing']['request_timeout']`` when
      the caller did not pass one
    - ``raise_for_httperror``: always forced to ``False``, the resolvers
      evaluate the HTTP status / headers on their own
    """
    if 'timeout' not in kwargs:
        kwargs['timeout'] = settings['outgoing']['request_timeout']
    kwargs['raise_for_httperror'] = False
    return kwargs


def get(*args, **kwargs):
    """HTTP GET via :py:obj:`searx.network.get` using the resolver defaults
    from :py:obj:`update_kwargs`."""
    return http_get(*args, **update_kwargs(**kwargs))


def post(*args, **kwargs):
    """HTTP POST via :py:obj:`searx.network.post` using the resolver defaults
    from :py:obj:`update_kwargs`."""
    return http_post(*args, **update_kwargs(**kwargs))
def allesedv(domain):
    """Favicon Resolver from allesedv.com"""
    # The service answers with a 200 regardless of the favicon existing or
    # not, the size of the image is sometimes correct and sometimes not.
    response = get('https://f1.allesedv.com/32/{domain}'.format(domain=domain))

    # an image/gif is returned if the favicon does not exist
    favicon_missing = response.headers['Content-Type'] == 'image/gif'
    return [] if favicon_missing else response.content
def duckduckgo(domain):
    """Favicon Resolver from duckduckgo.com"""
    # HTTP 404 if the favicon does not exist / HTTP 200 (with a 32x32 png
    # image) if it does
    response = get('https://icons.duckduckgo.com/ip2/{domain}.ico'.format(domain=domain))
    if response.status_code != 200:
        return []
    return response.content
def google(domain):
    """Favicon Resolver from google.com"""
    # HTTP 404 if the favicon does not exist / HTTP 200 (with a 32x32 png
    # image) if it does
    response = get('https://www.google.com/s2/favicons?sz=32&domain={domain}'.format(domain=domain))
    if response.status_code != 200:
        return []
    return response.content
def yandex(domain):
    """Favicon Resolver from yandex.com"""
    # The API always answers with HTTP 200: either with a 16x16 png image or,
    # if there is no favicon, with a 1x1 png image (70 bytes).
    response = get('https://favicon.yandex.net/favicon/{domain}'.format(domain=domain))
    found = response.status_code == 200 and len(response.content) > 70
    return response.content if found else []
# Registry of the favicon resolvers: maps the backend name to the resolver
# function, it is used by search_favicon() to look up the backend.
backends = {
'allesedv': allesedv,
'duckduckgo': duckduckgo,
'google': google,
'yandex': yandex,
}
def search_favicon(backend_name, domain):
    """Ask the backend registered by ``backend_name`` for the favicon of
    ``domain``.  An empty list is returned when the backend is unknown or the
    request to the backend fails."""
    resolve = backends.get(backend_name)
    if resolve is not None:
        try:
            return resolve(domain)
        except (HTTPError, SearxEngineResponseException):
            # a failing backend is handled like a missing favicon
            pass
    return []

View file

@ -0,0 +1,37 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for providing the favicons in SearXNG"""
from __future__ import annotations
__all__ = ["init", "favicon_url", "favicon_proxy"]
import pathlib
from searx import logger
from searx import get_setting
from .proxy import favicon_url, favicon_proxy
logger = logger.getChild('favicons')
def is_active():
    """``True`` if a favicon resolver is configured in the setting
    ``search.favicon_resolver``."""
    resolver_name = get_setting("search.favicon_resolver", False)
    return bool(resolver_name)
def init():
    """Load the favicon configuration and initialize the cache and the proxy.

    The configuration is read from ``/etc/searxng/favicons.toml``; if this file
    does not exist the defaults from :py:obj:`config.DEFAULT_CFG_TOML` are
    used (an error is logged when favicons are active).
    """
    # pylint: disable=import-outside-toplevel
    from . import config, cache, proxy

    system_cfg = pathlib.Path("/etc/searxng/favicons.toml")
    if system_cfg.exists():
        cfg_file = system_cfg
    else:
        if is_active():
            logger.error(f"missing favicon config: {system_cfg}")
        cfg_file = config.DEFAULT_CFG_TOML

    logger.debug(f"load favicon config: {cfg_file}")
    cfg = config.FaviconConfig.from_toml_file(cfg_file, use_cache=True)

    cache.init(cfg.cache)
    proxy.init(cfg.proxy)

View file

@ -0,0 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Command line implementation"""
import typer
from . import cache
from . import init
# The favicon configuration, cache and proxy have to be initialized before the
# commands are executed (the cache commands work on the global cache object).
init()
app = typer.Typer()
# register the commands of the cache module as sub-command group "cache"
app.add_typer(cache.app, name="cache", help="commands related to the cache")
# run the command line application
app()

476
searx/favicons/cache.py Normal file
View file

@ -0,0 +1,476 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for caching favicons.
:py:obj:`FaviconCacheConfig`:
Configuration of the favicon cache
:py:obj:`FaviconCache`:
Abstract base class for the implementation of a favicon cache.
:py:obj:`FaviconCacheSQLite`:
Favicon cache that manages the favicon BLOBs in a SQLite DB.
:py:obj:`FaviconCacheNull`:
Fallback solution if the configured cache cannot be used for system reasons.
----
"""
from __future__ import annotations
from typing import Literal
import abc
import dataclasses
import hashlib
import logging
import pathlib
import sqlite3
import tempfile
import time
import typer
from pydantic import BaseModel
from searx import sqlitedb
from searx import logger
from searx.utils import humanize_bytes, humanize_number
CACHE: "FaviconCache"
FALLBACK_ICON = b"FALLBACK_ICON"
logger = logger.getChild('favicons.cache')
app = typer.Typer()
@app.command()
def state():
    """show state of the cache"""
    report = CACHE.state().report()
    print(report)
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    root_log = logging.getLogger()
    if not debug:
        # Silence the handlers of the root logger and print only the plain
        # messages of the favicon cache logger.
        root_log.handlers = []
        stream = logging.StreamHandler()
        stream.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(stream)
        logger.setLevel(logging.DEBUG)
    else:
        root_log.setLevel(logging.DEBUG)

    # state of the cache before and after the maintenance
    before = CACHE.state()
    CACHE.maintenance(force=force)
    after = CACHE.state()

    delta = before - after
    print("The cache has been reduced by:")
    print(delta.report("\n- {descr}: {val}").lstrip("\n"))
def init(cfg: "FaviconCacheConfig"):
    """Initialization of a global ``CACHE``"""
    global CACHE  # pylint: disable=global-statement

    if cfg.db_type not in ("sqlite", "mem"):
        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")

    if cfg.db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
        return

    # db_type is "sqlite": fall back to the null cache if the SQLite library
    # of the runtime is too old
    if sqlite3.sqlite_version_info <= (3, 35):
        logger.critical(
            "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
            sqlite3.sqlite_version,
        )
        CACHE = FaviconCacheNull(cfg)
        return

    CACHE = FaviconCacheSQLite(cfg)
class FaviconCacheConfig(BaseModel):
    """Configuration of the favicon cache."""

    db_type: Literal["sqlite", "mem"] = "sqlite"
    """Type of the database:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: pathlib.Path = pathlib.Path(tempfile.gettempdir(), "faviconcache.db")
    """URL of the SQLite DB, the path to the database file."""

    HOLD_TIME: int = 60 * 60 * 24 * 30  # 30 days
    """Hold time (default in sec.), after which a BLOB is removed from the cache."""

    LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50  # 50 MB
    """Maximum of bytes (default) stored in the cache of all blobs.  Note: The
    limit is only reached at each maintenance interval after which the oldest
    BLOBs are deleted; the limit is exceeded during the maintenance period.  If
    the maintenance period is *too long* or maintenance is switched off
    completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 1024 * 20  # 20 KB
    """The maximum BLOB size in bytes that a favicon may have so that it can be
    saved in the cache.  If the favicon is larger, it is not saved in the cache
    and must be requested by the client via the proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60
    """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
    ``auto``."""

    MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
    """Type of maintenance mode

    ``auto``:
      Maintenance is carried out automatically as part of the maintenance
      intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and must be carried out by an external process
      if required.
    """
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    A value of ``None`` means *not available*; such values are shown as ``--``
    in the :py:obj:`report <FaviconCacheStats.report>`.
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # Tuples of (field name, description, cast function).  The attribute has no
    # type annotation and is therefore a plain class attribute, not a field of
    # the dataclass.
    field_descr = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other) -> FaviconCacheStats:
        """Field-wise difference ``self - other``.

        Fields that are ``None`` on one of both sides are left out (they stay
        ``None`` in the result).

        :raises TypeError: if ``other`` is not an instance of this class
        """
        if not isinstance(other, self.__class__):
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                kwargs[field] = self_val
        return self.__class__(**kwargs)

    def report(self, fmt: str = "{descr}: {val}\n"):
        """Return a text report, one entry per field.  Each entry is built by
        ``fmt`` from the description (``descr``) and the humanized value
        (``val``) of the field."""
        s = []
        for field, descr, cast in self.field_descr:
            val = getattr(self, field)
            val = "--" if val is None else cast(val)
            s.append(fmt.format(descr=descr, val=val))
        return "".join(s)
class FaviconCache(abc.ABC):
    """Abstract base class for the implementation of a favicon cache."""

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """The favicon cache is instantiated from its configuration."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Look up the favicon of ``authority`` that has been registered for the
        ``resolver``.  ``None`` is returned if there is no entry in the cache,
        otherwise the tuple ``(data, mime)`` of the entry is returned."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register data and mime-type in the cache.  If data is ``None``, the
        :py:obj:`FALLBACK_ICON` is registered in the cache."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Information on the state of the cache, returned as a
        :py:obj:`FaviconCacheStats` (key/values)."""

    @abc.abstractmethod
    def maintenance(self, force=False):
        """Performs maintenance on the cache"""
class FaviconCacheNull(FaviconCache):
    """A dummy favicon cache that caches nothing / a fallback solution.  The
    NullCache is used when more efficient caches such as the
    :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
    library is only available in an old version and does not meet the
    requirements."""

    def __init__(self, cfg: FaviconCacheConfig):
        # the configuration is not needed, nothing is cached
        pass

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        # there is never an entry in the cache
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        # nothing is stored
        return False

    def state(self):
        return FaviconCacheStats(favicons=0)

    def maintenance(self, force=False):
        # nothing to maintain
        pass
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
sha256 TEXT,
bytes_c INTEGER,
mime TEXT NOT NULL,
data BLOB NOT NULL,
PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
sha256 TEXT,
resolver TEXT,
authority TEXT,
PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        " FROM blobs b"
        " LEFT JOIN blob_map bm"
        " ON b.sha256 = bm.sha256"
        " WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        " JOIN blob_map bm "
        " ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )
    """Iterate over (sha256, bytes_c) of the BLOBs, one row per blob_map entry,
    oldest entry first."""

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        " ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        " ON CONFLICT DO UPDATE "
        " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is build up from the configuration."""
        # db_url is a pathlib.Path, a Path never compares equal to a str; the
        # comparison has to be done on the string representation.
        if str(cfg.db_url) == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Returns ``None`` if there is no entry for ``(resolver, authority)``,
        ``(None, None)`` if the :py:obj:`FALLBACK_ICON` is registered and
        otherwise the ``(data, mime)`` of the BLOB."""
        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``(mime, data)`` for ``(resolver, authority)``.  If ``data``
        is ``None`` the :py:obj:`FALLBACK_ICON` is registered.  Returns
        ``False`` if nothing has been stored: ``mime`` is missing or the BLOB is
        larger than :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`."""

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            # the arguments are passed to the logger (lazy formatting)
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        with self.connect() as conn:
            if sha256 != FALLBACK_ICON:
                conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
            conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""
        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force=False):
        """Drop the entries that are older than
        :py:obj:`FaviconCacheConfig.HOLD_TIME` and, if the cache is still larger
        than :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`, the oldest BLOBs.
        Without ``force`` nothing is done until the next maintenance time is
        reached."""

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).
        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # do maintenance tasks
        with self.connect() as conn:

            # drop items not in HOLD time
            res = conn.execute(
                "DELETE FROM blob_map"
                " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                (self.cfg.HOLD_TIME,),
            )
            logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
            res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
            logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

            # drop old items to be in LIMIT_TOTAL_BYTES
            total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
            if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
                c = 0
                sha_list = []
                seen = set()
                for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                    # A BLOB that is shared by several blob_map entries shows
                    # up once per entry, its bytes are freed only once.
                    if sha256 in seen:
                        continue
                    seen.add(sha256)
                    sha_list.append(sha256)
                    c += bytes_c
                    if c > x:
                        break

                if sha_list:
                    # One parameterized statement per hash: no SQL is built
                    # from values and there is no limit on the number of hashes.
                    params = [(sha256,) for sha256 in sha_list]
                    conn.executemany("DELETE FROM blobs WHERE sha256 = ?", params)
                    conn.executemany("DELETE FROM blob_map WHERE sha256 = ?", params)
                    logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c)

    def _query_val(self, sql, default=None):
        """Returns the first column of the first row of the ``sql`` query or
        ``default`` if there is no row or the value is NULL."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
class FaviconCacheMEM(FaviconCache):
    """Favicon cache in process' memory.  Its just a POC that stores the
    favicons in the memory of the process.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg):
        self.cfg = cfg
        # sha256 (hex digest) --> data
        self._data = {}
        # "<resolver>:<authority>" --> (sha256, mime)
        self._sha_mime = {}

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
        entry = self._sha_mime.get(f"{resolver}:{authority}")
        if entry is None:
            return None

        digest, mime = entry
        data = self._data.get(digest)
        # the fallback marker stands for "no favicon available"
        return (None if data == FALLBACK_ICON else data), mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        if data is None:
            # register the fallback marker (without a mime-type)
            data, mime = FALLBACK_ICON, None

        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
        return True

    def state(self):
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force=False):
        # there is no maintenance for the in-memory cache
        pass

62
searx/favicons/config.py Normal file
View file

@ -0,0 +1,62 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations
import pathlib
from pydantic import BaseModel
from searx.compat import tomllib
from .cache import FaviconCacheConfig
from .proxy import FaviconProxyConfig
CONFIG_SCHEMA: int = 1
"""Version of the configuration schema."""
TOML_CACHE: dict[str, "FaviconConfig"] = {}
"""Cache config objects by TOML's filename."""
DEFAULT_CFG_TOML = pathlib.Path(__file__).parent / "favicons.toml"
class FaviconConfig(BaseModel):
    """The class aggregates configurations of the favicon tools"""

    cfg_schema: int
    """Config's schema version.  The specification of the version of the schema
    is mandatory, currently only version :py:obj:`CONFIG_SCHEMA` is supported.
    By specifying a version, it is possible to ensure downward compatibility in
    the event of future changes to the configuration schema"""

    cache: FaviconCacheConfig = FaviconCacheConfig()
    """Setup of the :py:obj:`.cache.FaviconCacheConfig`."""

    proxy: FaviconProxyConfig = FaviconProxyConfig()
    """Setup of the :py:obj:`.proxy.FaviconProxyConfig`."""

    @classmethod
    def from_toml_file(cls, cfg_file: pathlib.Path, use_cache: bool) -> "FaviconConfig":
        """Create a config object from a TOML file, the ``use_cache`` argument
        specifies whether a cache should be used.

        The configuration is taken from the ``[favicons]`` table of the TOML
        file (or from the top level if there is no such table).

        :raises ValueError: if ``cfg_schema`` of the file is not
          :py:obj:`CONFIG_SCHEMA`
        """
        # The same key (the resolved filename) is used to look up and to store
        # the object in the cache.
        cache_key = str(cfg_file.resolve())

        if use_cache:
            cached = TOML_CACHE.get(cache_key)
            if cached is not None:
                return cached

        with cfg_file.open("rb") as f:
            cfg = tomllib.load(f)

        cfg = cfg.get("favicons", cfg)
        schema = cfg.get("cfg_schema")
        if schema != CONFIG_SCHEMA:
            raise ValueError(
                f"config schema version {CONFIG_SCHEMA} is needed, version {schema} is given in {cfg_file}"
            )

        cfg = cls(**cfg)
        if use_cache:
            # register the object, so that the next call is served from the cache
            TOML_CACHE[cache_key] = cfg
        return cfg

View file

@ -0,0 +1,25 @@
[favicons]
cfg_schema = 1 # config's schema version no.
[favicons.proxy]
# max_age = 5184000 # 60 days / default: 7 days (604800 sec)
# [favicons.proxy.resolver_map]
#
# The available favicon resolvers are registered here.
#
# "duckduckgo" = "searx.favicons.resolvers.duckduckgo"
# "allesedv" = "searx.favicons.resolvers.allesedv"
# "google" = "searx.favicons.resolvers.google"
# "yandex" = "searx.favicons.resolvers.yandex"
[favicons.cache]
# db_url = "/var/cache/searxng/faviconcache.db" # default: "/tmp/faviconcache.db"
# HOLD_TIME = 5184000 # 60 days / default: 30 days
# LIMIT_TOTAL_BYTES = 2147483648 # 2 GB / default: 50 MB
# BLOB_MAX_BYTES = 40960 # 40 KB / default 20 KB
# MAINTENANCE_MODE = "off" # default: "auto"
# MAINTENANCE_PERIOD = 600 # 10min / default: 1h

237
searx/favicons/proxy.py Normal file
View file

@ -0,0 +1,237 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for a favicon proxy"""
from __future__ import annotations
from typing import Callable
import importlib
import base64
import pathlib
import urllib.parse
import flask
from httpx import HTTPError
from pydantic import BaseModel
from searx import get_setting
from searx.webutils import new_hmac, is_hmac_of
from searx.exceptions import SearxEngineResponseException
from .resolvers import DEFAULT_RESOLVER_MAP
from . import cache
DEFAULT_FAVICON_URL = {}
CFG: FaviconProxyConfig = None # type: ignore
def init(cfg: FaviconProxyConfig):
    """Register ``cfg`` as the global configuration (``CFG``) of the favicon
    proxy."""
    global CFG  # pylint: disable=global-statement
    CFG = cfg
def _initial_resolver_map():
    """Build the default of the resolver map from the ``search.favicon_resolver``
    setting: if the resolver named there is one of the
    :py:obj:`DEFAULT_RESOLVER_MAP`, the map consists of this single resolver,
    otherwise the map is empty."""
    name: str = get_setting("search.favicon_resolver", None)  # type: ignore
    func = DEFAULT_RESOLVER_MAP.get(name) if name else None
    if not func:
        return {}
    return {name: f"searx.favicons.resolvers.{func.__name__}"}
class FaviconProxyConfig(BaseModel):
    """Configuration of the favicon proxy."""

    max_age: int = 60 * 60 * 24 * 7  # seven days
    """HTTP header Cache-Control_ ``max-age``

    .. _Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
    """

    secret_key: str = get_setting("server.secret_key")  # type: ignore
    """By default, the value from :ref:`server.secret_key <settings server>`
    setting is used."""

    resolver_timeout: int = get_setting("outgoing.request_timeout")  # type: ignore
    """Timeout which the resolvers should not exceed, is usually passed to the
    outgoing request of the resolver.  By default, the value from
    :ref:`outgoing.request_timeout <settings outgoing>` setting is used."""

    resolver_map: dict[str, str] = _initial_resolver_map()
    """The resolver_map is a key / value dictionary where the key is the name of
    the resolver and the value is the fully qualifying name (fqn) of resolver's
    function (the callable).  The resolvers from the python module
    :py:obj:`searx.favicons.resolvers` are available by default."""

    def get_resolver(self, name: str) -> Callable | None:
        """Returns the callable object (function) of the resolver with the
        ``name``.  If no resolver is registered for the ``name``, ``None`` is
        returned.

        :raises ValueError: if the function registered for ``name`` does not
          exist in its module
        """
        fqn = self.resolver_map.get(name)
        if fqn is None:
            return None
        mod_name, _, func_name = fqn.rpartition('.')
        mod = importlib.import_module(mod_name)
        # Without a default, getattr raises an AttributeError and the check
        # for a missing implementation is never reached.
        func = getattr(mod, func_name, None)
        if func is None:
            raise ValueError(f"resolver {fqn} is not implemented")
        return func

    favicon_path: str = get_setting("ui.static_path") + "/themes/{theme}/img/empty_favicon.svg"  # type: ignore
    favicon_mime_type: str = "image/svg+xml"

    def favicon(self, **replacements):
        """Returns pathname and mimetype of the default favicon.  The
        ``replacements`` are applied to the placeholders (e.g. ``{theme}``) of
        ``favicon_path``."""
        return (
            pathlib.Path(self.favicon_path.format(**replacements)),
            self.favicon_mime_type,
        )

    def favicon_data_url(self, **replacements):
        """Returns data image URL of the default favicon.  The URLs are cached
        (by the ``replacements``) in :py:obj:`DEFAULT_FAVICON_URL`."""

        cache_key = ", ".join(f"{x}:{replacements[x]}" for x in sorted(replacements))
        data_url = DEFAULT_FAVICON_URL.get(cache_key)
        if data_url is not None:
            return data_url

        # use the configuration of this instance, not the global CFG object
        fav, mimetype = self.favicon(**replacements)
        # hint: encoding utf-8 limits favicons to be a SVG image
        with fav.open("r", encoding="utf-8") as f:
            data_url = f.read()

        data_url = urllib.parse.quote(data_url)
        data_url = f"data:{mimetype};utf8,{data_url}"
        DEFAULT_FAVICON_URL[cache_key] = data_url
        return data_url
def favicon_proxy():
    """REST API of SearXNG's favicon proxy service

    ::

        /favicon_proxy?authority=<...>&h=<...>

    ``authority``:
      Domain name :rfc:`3986` / see :py:obj:`favicon_url`

    ``h``:
      HMAC :rfc:`2104`, build up from the :ref:`server.secret_key <settings
      server>` setting.

    Requests that are malformed, not authorized by the HMAC or for which no
    valid resolver is selected in the preferences are answered by HTTP 400.
    """
    authority = flask.request.args.get('authority')

    # malformed request or RFC 3986 authority
    if not authority or "/" in authority:
        return '', 400

    # malformed request / does not have authorisation
    authorized = is_hmac_of(
        CFG.secret_key,
        authority.encode(),
        flask.request.args.get('h', ''),
    )
    if not authorized:
        return '', 400

    # if resolver is empty or not valid, just return HTTP 400.
    resolver = flask.request.preferences.get_value('favicon_resolver')  # type: ignore
    if not resolver or resolver not in CFG.resolver_map:
        return "", 400

    data, mime = search_favicon(resolver, authority)
    if data is None or mime is None:
        # return default favicon from static path
        theme = flask.request.preferences.get_value("theme")  # type: ignore
        fav, mimetype = CFG.favicon(theme=theme)
        return flask.send_from_directory(fav.parent, fav.name, mimetype=mimetype)

    resp = flask.Response(data, mimetype=mime)  # type: ignore
    resp.headers['Cache-Control'] = f"max-age={CFG.max_age}"
    return resp
def search_favicon(resolver: str, authority: str) -> tuple[None | bytes, None | str]:
    """Sends the request to the favicon resolver and returns a tuple for the
    favicon.  The tuple consists of ``(data, mime)``, if the resolver has not
    determined a favicon, both values are ``None``.

    ``data``:
      Binary data of the favicon.

    ``mime``:
      Mime type of the favicon.

    The answer of the resolver (also a missing favicon) is registered in the
    cache.
    """
    resolve = CFG.get_resolver(resolver)
    if resolve is None:
        return None, None

    # to avoid superfluous requests to the resolver, first look in the cache
    cached = cache.CACHE(resolver, authority)
    if cached is not None:
        return cached

    data, mime = None, None
    try:
        data, mime = resolve(authority, timeout=CFG.resolver_timeout)
    except (HTTPError, SearxEngineResponseException):
        # a failing request is handled like a missing favicon
        pass

    # an incomplete answer of the resolver counts as "no favicon"
    if data is None or mime is None:
        data, mime = None, None

    cache.CACHE.set(resolver, authority, mime, data)
    return data, mime
def favicon_url(authority: str) -> str:
    """Function to generate the image URL used for favicons in SearXNG's result
    lists.  The ``authority`` argument (aka netloc / :rfc:`3986`) is usually a
    (sub-) domain name.  This function is used in the HTML (jinja) templates.

    .. code:: html

       <div class="favicon">
          <img src="{{ favicon_url(result.parsed_url.netloc) }}">
       </div>

    The returned URL is a route to :py:obj:`favicon_proxy` REST API.

    If the favicon is already in the cache, the returned URL is a `data URL`_
    (something like ``data:image/png;base64,...``).  By generating a data URL
    from the :py:obj:`.cache.FaviconCache`, additional HTTP round trips via the
    :py:obj:`favicon_proxy` are saved.  However, it must also be borne in mind
    that data URLs are not cached in the client (web browser).

    .. _data URL: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs

    """
    resolver = flask.request.preferences.get_value('favicon_resolver')  # type: ignore

    # if resolver is empty or not valid, just return nothing.
    if not resolver or resolver not in CFG.resolver_map:
        return ""

    cached = cache.CACHE(resolver, authority)

    if cached is None:
        # nothing known about this authority: route to the favicon proxy
        h = new_hmac(CFG.secret_key, authority.encode())
        proxy_url = flask.url_for('favicon_proxy')
        query = urllib.parse.urlencode({"authority": authority, "h": h})
        return f"{proxy_url}?{query}"

    if cached == (None, None):
        # we have already checked, the resolver does not have a favicon
        theme = flask.request.preferences.get_value("theme")  # type: ignore
        return CFG.favicon_data_url(theme=theme)

    data, mime = cached
    return f"data:{mime};base64,{str(base64.b64encode(data), 'utf-8')}"  # type: ignore

100
searx/favicons/resolvers.py Normal file
View file

@ -0,0 +1,100 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations of the favicon *resolvers* that are available in the favicon
proxy by default. A *resolver* is a function that obtains the favicon from an
external source. The *resolver* function receives two arguments (``domain,
timeout``) and returns a tuple ``(data, mime)``.
"""
from __future__ import annotations
__all__ = ["DEFAULT_RESOLVER_MAP", "allesedv", "duckduckgo", "google", "yandex"]
from typing import Callable
from searx import network
from searx import logger
DEFAULT_RESOLVER_MAP: dict[str, Callable]
logger = logger.getChild('favicons.resolvers')
def _req_args(**kwargs):
    """Returns the arguments for a request via searx.network: the keyword
    arguments of the caller on top of the default ``raise_for_httperror=False``
    (the caller's values take precedence)."""
    return {"raise_for_httperror": False, **kwargs}
def allesedv(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
    """Favicon Resolver from allesedv.com / https://favicon.allesedv.com/

    Returns the tuple ``(data, mime)`` of the favicon or ``(None, None)`` if
    the service has no favicon for the ``domain``.
    """
    url = f"https://f1.allesedv.com/32/{domain}"
    logger.debug("fetch favicon from: %s", url)

    # will just return a 200 regardless of the favicon existing or not
    # sometimes will be correct size, sometimes not
    response = network.get(url, **_req_args(timeout=timeout))
    if not response or response.status_code != 200:
        return None, None

    mime = response.headers['Content-Type']
    if mime == 'image/gif':
        # An image/gif is the placeholder for a missing favicon: like the other
        # resolvers, return no mime type when there is no data.
        return None, None
    return response.content, mime
def duckduckgo(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
    """Favicon Resolver from duckduckgo.com / https://blog.jim-nielsen.com/2021/displaying-favicons-for-any-domain/"""
    url = f"https://icons.duckduckgo.com/ip2/{domain}.ico"
    logger.debug("fetch favicon from: %s", url)

    # will return a 404 if the favicon does not exist and a 200 if it does,
    response = network.get(url, **_req_args(timeout=timeout))
    if not response or response.status_code != 200:
        return None, None

    # api will respond with a 32x32 png image
    mime = response.headers['Content-Type']
    return response.content, mime
def google(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
    """Favicon Resolver from google.com

    :param domain: domain the favicon is requested for
    :param timeout: timeout that is passed on to the network request
    :return: tuple ``(data, mime)``, or ``(None, None)`` if no usable favicon
        has been obtained
    """
    # URL https://www.google.com/s2/favicons?sz=32&domain={domain} will be
    # redirected (HTTP 301 Moved Permanently) to t1.gstatic.com/faviconV2:
    url = (
        "https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL"
        f"&url=https://{domain}&size=32"
    )
    logger.debug("fetch favicon from: %s", url)

    # will return a 404 if the favicon does not exist and a 200 if it does
    response = network.get(url, **_req_args(timeout=timeout))
    if not response or response.status_code != 200:
        return None, None

    # NOTE(review): the API is said to respond with a 32x32 png image; the mime
    # type is taken from the response header anyway.  Use .get(): a response
    # without a Content-Type header must not raise a KeyError, it is handled
    # like "no favicon".
    mime = response.headers.get('Content-Type')
    if not mime:
        return None, None
    return response.content, mime
def yandex(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
    """Favicon Resolver from yandex.com

    :param domain: domain the favicon is requested for
    :param timeout: timeout that is passed on to the network request
    :return: tuple ``(data, mime)``, or ``(None, None)`` if no usable favicon
        has been obtained
    """
    url = f"https://favicon.yandex.net/favicon/{domain}"
    logger.debug("fetch favicon from: %s", url)

    # api will respond with a 16x16 png image, if it doesn't exist, it will be a
    # 1x1 png image (70 bytes)
    response = network.get(url, **_req_args(timeout=timeout))
    if not response or response.status_code != 200:
        return None, None

    # everything up to 70 bytes is treated as the "not found" placeholder
    if len(response.content) <= 70:
        return None, None

    # use .get(): a response without a Content-Type header must not raise a
    # KeyError, it is handled like "no favicon"
    mime = response.headers.get('Content-Type')
    if not mime:
        return None, None
    return response.content, mime
# Maps the name of each resolver implemented in this module to its function;
# the key is the function's own name.
DEFAULT_RESOLVER_MAP = {func.__name__: func for func in (allesedv, duckduckgo, google, yandex)}

View file

@ -13,7 +13,7 @@ from collections import OrderedDict
import flask
import babel
from searx import settings, autocomplete, favicon_resolver
from searx import settings, autocomplete, favicons
from searx.enginelib import Engine
from searx.plugins import Plugin
from searx.locales import LOCALE_NAMES
@ -409,7 +409,7 @@ class Preferences:
'favicon_resolver': EnumStringSetting(
settings['search']['favicon_resolver'],
locked=is_locked('favicon_resolver'),
choices=list(favicon_resolver.backends.keys()) + ['']
choices=list(favicons.proxy.CFG.resolver_map.keys()) + ['']
),
'image_proxy': BooleanSetting(
settings['server']['image_proxy'],

View file

@ -1,5 +1,4 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
<path fill="#fff" d="M0 0h24v24H0z"/>
<path fill="#58f" d="M11 20.85a.92.92 0 0 1-1.1.93A10 10 0 0 1 2.06 13c-.06-.55.4-1 .95-1h3a1 1 0 0 1 1 1 3 3 0 0 0 3 3 1 1 0 0 1 1 1v3.85Zm6-1.92c0 .77.83 1.23 1.42.74a10 10 0 0 0 2.03-2.32c.39-.61-.09-1.35-.81-1.35H18a1 1 0 0 0-1 1v1.93ZM12 2a10 10 0 0 1 6.65 2.53c.61.55.17 1.47-.65 1.47h-.15A2.85 2.85 0 0 0 15 8.85c0 .33-.18.62-.47.77l-.08.04a1 1 0 0 1-.9 0l-.08-.04a.85.85 0 0 1-.47-.77A2.85 2.85 0 0 0 10.15 6H10a1 1 0 0 1-1-1V3.2c0-.44.28-.84.7-.94C10.45 2.1 11.22 2 12 2Z"/>
<path fill="#58f" d="M3.42 10c-.63 0-1.1-.58-.9-1.18.6-1.8 1.7-3.36 3.12-4.53C6.2 3.82 7 4.26 7 5a3 3 0 0 0 3 3h.15c.47 0 .85.38.85.85 0 1.09.61 2.07 1.58 2.56l.08.04a3 3 0 0 0 2.68 0l.08-.04A2.85 2.85 0 0 0 17 8.85c0-.47.38-.85.85-.85h2.66c.4 0 .77.23.9.6a9.98 9.98 0 0 1 .52 4.6.94.94 0 0 1-.95.8H18a3 3 0 0 0-3 3v3.8c0 .44-.28.84-.7.94l-.2.04a.92.92 0 0 1-1.1-.93V17a3 3 0 0 0-3-3 1 1 0 0 1-1-1 3 3 0 0 0-3-3H3.42Z"/>
</svg>
</svg>

Before

Width:  |  Height:  |  Size: 1 KiB

After

Width:  |  Height:  |  Size: 989 B

View file

@ -23,12 +23,7 @@
{{- result_open_link(result.url, "url_wrapper") -}}
{% if not rtl %}
{%- if favicon_resolver != "" %}
<div class="favicon">
<img
alt="{{ result.parsed_url.netloc }}"
src="{{ favicon_proxify(result.parsed_url.netloc) }}"
>
</div>
<div class="favicon"><img loading="lazy" src="{{ favicon_url(result.parsed_url.netloc) }}"></div>
{%- endif -%}
{%- endif -%}
{%- for part in get_pretty_url(result.parsed_url) -%}
@ -36,12 +31,7 @@
{%- endfor %}
{% if rtl %}
{%- if favicon_resolver != "" %}
<div class="favicon">
<img
alt="{{ result.parsed_url.netloc }}"
src="{{ favicon_proxify(result.parsed_url.netloc) }}"
>
</div>
<div class="favicon"><img loading="lazy" src="{{ favicon_url(result.parsed_url.netloc) }}"></div>
{%- endif -%}
{%- endif -%}
{{- result_close_link() -}}

View file

@ -3,7 +3,7 @@
<div class="value">{{- '' -}}
<select name="favicon_resolver" aria-labelledby="pref_favicon_resolver">{{- '' -}}
<option value=""> - </option>
{%- for backend in favicon_backends -%}
{%- for backend in favicon_resolver_names -%}
<option value="{{ backend }}"
{%- if backend == favicon_resolver %} selected="selected" {%- endif -%}>
{{- backend -}}

View file

@ -123,7 +123,8 @@ from searx.locales import (
# renaming names from searx imports ...
from searx.autocomplete import search_autocomplete, backends as autocomplete_backends
from searx.favicon_resolver import search_favicon, backends as favicon_backends
from searx import favicons
from searx.redisdb import initialize as redis_initialize
from searx.sxng_locales import sxng_locales
from searx.search import SearchWithPlugins, initialize as search_initialize
@ -298,24 +299,6 @@ def morty_proxify(url: str):
return '{0}?{1}'.format(settings['result_proxy']['url'], urlencode(url_params))
def favicon_proxify(url: str):
# url is a FQDN (e.g. example.com, en.wikipedia.org)
resolver = request.preferences.get_value('favicon_resolver')
# if resolver is empty, just return nothing
if not resolver:
return ""
# check resolver is valid
if resolver not in favicon_backends:
return ""
h = new_hmac(settings['server']['secret_key'], url.encode())
return '{0}?{1}'.format(url_for('favicon_proxy'), urlencode(dict(q=url.encode(), h=h)))
def image_proxify(url: str):
if url.startswith('//'):
@ -377,7 +360,6 @@ def get_client_settings():
return {
'autocomplete_provider': req_pref.get_value('autocomplete'),
'autocomplete_min': get_setting('search.autocomplete_min'),
'favicon_resolver': req_pref.get_value('favicon_resolver'),
'http_method': req_pref.get_value('method'),
'infinite_scroll': req_pref.get_value('infinite_scroll'),
'translations': get_translations(),
@ -452,7 +434,7 @@ def render(template_name: str, **kwargs):
# helpers to create links to other pages
kwargs['url_for'] = custom_url_for # override url_for function in templates
kwargs['image_proxify'] = image_proxify
kwargs['favicon_proxify'] = favicon_proxify
kwargs['favicon_url'] = favicons.favicon_url
kwargs['proxify'] = morty_proxify if settings['result_proxy']['url'] is not None else None
kwargs['proxify_results'] = settings['result_proxy']['proxify_results']
kwargs['cache_url'] = settings['ui']['cache_url']
@ -895,42 +877,6 @@ def autocompleter():
return Response(suggestions, mimetype=mimetype)
@app.route('/favicon', methods=['GET'])
def favicon_proxy():
"""Return proxied favicon results"""
url = request.args.get('q')
# malformed request
if not url:
return '', 400
# malformed request / does not have authorisation
if not is_hmac_of(settings['server']['secret_key'], url.encode(), request.args.get('h', '')):
return '', 400
resolver = request.preferences.get_value('favicon_resolver')
# check if the favicon resolver is valid
if not resolver or resolver not in favicon_backends:
return '', 400
# parse query
raw_text_query = RawTextQuery(url, [])
resp = search_favicon(resolver, raw_text_query)
# return 404 if the favicon is not found
if not resp:
theme = request.preferences.get_value("theme")
# return favicon from /static/themes/simple/img/empty_favicon.svg
# we can't rely on an onerror event in the img tag to display a default favicon as this violates the CSP.
# using redirect to save network bandwidth (user will have this location cached).
return redirect(url_for('static', filename='themes/' + theme + '/img/empty_favicon.svg'))
# will always return a PNG image
return Response(resp, mimetype='image/png')
@app.route('/preferences', methods=['GET', 'POST'])
def preferences():
"""Render preferences page && save user preferences"""
@ -1078,7 +1024,7 @@ def preferences():
],
disabled_engines = disabled_engines,
autocomplete_backends = autocomplete_backends,
favicon_backends = favicon_backends,
favicon_resolver_names = favicons.proxy.CFG.resolver_map.keys(),
shortcuts = {y: x for x, y in engine_shortcuts.items()},
themes = themes,
plugins = plugins,
@ -1092,6 +1038,9 @@ def preferences():
)
app.add_url_rule('/favicon_proxy', methods=['GET'], endpoint="favicon_proxy", view_func=favicons.favicon_proxy)
@app.route('/image_proxy', methods=['GET'])
def image_proxy():
# pylint: disable=too-many-return-statements, too-many-branches
@ -1403,6 +1352,7 @@ if not werkzeug_reloader or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_
plugin_initialize(app)
search_initialize(enable_checker=True, check_network=True, enable_metrics=settings['general']['enable_metrics'])
limiter.initialize(app, settings)
favicons.init()
def run():

View file

@ -61,6 +61,7 @@ setup(
'data/*.json',
'data/*.txt',
'data/*.ftz',
'favicons/*.toml',
'infopage/*/*',
'static/themes/simple/css/*',
'static/themes/simple/css/*/*',

View file

@ -2,6 +2,7 @@
# pylint: disable=missing-module-docstring, invalid-name
from tests import SearxTestCase
from searx import favicons
from searx.locales import locales_initialize
from searx.preferences import (
EnumStringSetting,
@ -14,6 +15,7 @@ from searx.preferences import (
from searx.plugins import Plugin
locales_initialize()
favicons.init()
class PluginStub(Plugin): # pylint: disable=missing-class-docstring, too-few-public-methods