[fix] engine zlibrary: handle seized domain

The domains of zlibrary instances are known to be seized from time to time.
This leads to problems when, for example, the automated tasks try to update the
engine traits (aka fetch_traits). The search function should also generate a
suitable error message (currently either SSL errors or empty result lists are
returned). [1]

[1] https://github.com/searxng/searxng/issues/3610
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2024-06-25 14:15:24 +02:00 committed by Markus Heiser
parent b8fa4d6195
commit 39ffec87b7

View file

@ -43,6 +43,7 @@ from flask_babel import gettext
from searx.utils import extract_text, eval_xpath, eval_xpath_list from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS from searx.data import ENGINE_TRAITS
from searx.exceptions import SearxException
if TYPE_CHECKING: if TYPE_CHECKING:
import httpx import httpx
@ -108,13 +109,21 @@ def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
zlib_year_to=zlib_year_to, zlib_year_to=zlib_year_to,
zlib_ext=zlib_ext, zlib_ext=zlib_ext,
) )
params["verify"] = False
return params return params
def domain_is_seized(dom):
    """Return ``True`` if the page title of *dom* announces a seized domain.

    The check looks at the first ``<title>`` element found by the XPath
    ``//title`` and tests, case-insensitively, whether its text contains the
    word ``seized``.

    :param dom: parsed HTML document; any object offering an ``xpath()``
        method that returns a list of elements with a ``text`` attribute.
    :return: ``True`` when the first title contains "seized", ``False`` when
        there is no title element, the title is empty, or the word is absent.
    """
    titles = dom.xpath('//title')
    if not titles:
        return False
    # An empty ``<title></title>`` element has ``text`` set to ``None``;
    # fall back to an empty string so ``.lower()`` cannot raise.
    title_text = titles[0].text or ""
    return "seized" in title_text.lower()
def response(resp: httpx.Response) -> List[Dict[str, Any]]: def response(resp: httpx.Response) -> List[Dict[str, Any]]:
results: List[Dict[str, Any]] = [] results: List[Dict[str, Any]] = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
if domain_is_seized(dom):
raise SearxException(f"zlibrary domain is seized: {base_url}")
for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'): for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
results.append(_parse_result(item)) results.append(_parse_result(item))
@ -168,22 +177,30 @@ def _parse_result(item) -> Dict[str, Any]:
def fetch_traits(engine_traits: EngineTraits) -> None: def fetch_traits(engine_traits: EngineTraits) -> None:
"""Fetch languages and other search arguments from zlibrary's search form.""" """Fetch languages and other search arguments from zlibrary's search form."""
# pylint: disable=import-outside-toplevel # pylint: disable=import-outside-toplevel, too-many-branches
import babel import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag from searx.locales import language_tag
resp = get(base_url, verify=False)
if not resp.ok: # type: ignore
raise RuntimeError("Response from zlibrary's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
if domain_is_seized(dom):
print(f"ERROR: zlibrary domain is seized: {base_url}")
# don't change anything, re-use the existing values
engine_traits.all_locale = ENGINE_TRAITS["z-library"]["all_locale"]
engine_traits.custom = ENGINE_TRAITS["z-library"]["custom"]
engine_traits.languages = ENGINE_TRAITS["z-library"]["languages"]
return
engine_traits.all_locale = "" engine_traits.all_locale = ""
engine_traits.custom["ext"] = [] engine_traits.custom["ext"] = []
engine_traits.custom["year_from"] = [] engine_traits.custom["year_from"] = []
engine_traits.custom["year_to"] = [] engine_traits.custom["year_to"] = []
resp = get(base_url)
if not resp.ok: # type: ignore
raise RuntimeError("Response from zlibrary's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"): for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"):
engine_traits.custom["year_from"].append(year.get("value")) engine_traits.custom["year_from"].append(year.get("value"))