[feat] engine: re-enables z-library (zlibrary-global.se)

- re-enables z-library, as the new domain zlibrary-global.se is now reachable
  from the open web.  The domain was announced here:

    https://www.reddit.com/r/zlibrary/comments/13whe08/mod_note_zlibraryglobalse_domain_is_officially/

  It is an official domain; logging in to the "personal" subdomain is only
  required to download files, but the search itself works without logging in.

- changes the zlibrary result template to paper.html, filling in the appropriate fields
- implements language filtering for zlibrary
- implements zlibrary custom filters (engine traits); see the configuration sketch below
- refactors and documents the zlibrary engine
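
  With the custom filters, an instance maintainer can define additional,
  pre-filtered z-library engines in settings.yml.  A minimal sketch, taken
  from the example in the engine's docstring (the engine name and shortcut
  are illustrative):

    # entry under the `engines:` list of settings.yml
    - name: z-library 2010s epub
      engine: zlibrary
      shortcut: zlib2010s
      zlib_year_from: '2010'   # results not older than 2010
      zlib_year_to: '2020'     # results not newer than 2020
      zlib_ext: 'EPUB'         # EPUB files only
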
Author: Paolo Basso, 2023-06-25 18:32:15 +02:00 (committed by Markus Heiser)
parent cb92767f19
commit cada89ee36
5 changed files with 808 additions and 78 deletions


@@ -0,0 +1,13 @@
.. _zlibrary engine:

=========
Z-Library
=========

.. contents:: Contents
   :depth: 2
   :local:
   :backlinks: entry

.. automodule:: searx.engines.zlibrary
  :members:


@@ -4256,5 +4256,602 @@
"zh_Hant": "zh_cht"
},
"regions": {}
},
"z-library": {
"all_locale": "",
"custom": {
"ext": [
"",
"TXT",
"PDF",
"FB2",
"EPUB",
"LIT",
"MOBI",
"RTF",
"DJV",
"DJVU",
"AZW",
"AZW3"
],
"year_from": [
"",
"2023",
"2022",
"2021",
"2020",
"2019",
"2018",
"2017",
"2016",
"2015",
"2014",
"2013",
"2012",
"2011",
"2010",
"2009",
"2008",
"2007",
"2006",
"2005",
"2004",
"2003",
"2002",
"2001",
"2000",
"1999",
"1998",
"1997",
"1996",
"1995",
"1994",
"1993",
"1992",
"1991",
"1990",
"1989",
"1988",
"1987",
"1986",
"1985",
"1984",
"1983",
"1982",
"1981",
"1980",
"1979",
"1978",
"1977",
"1976",
"1975",
"1974",
"1973",
"1972",
"1971",
"1970",
"1969",
"1968",
"1967",
"1966",
"1965",
"1964",
"1963",
"1962",
"1961",
"1960",
"1959",
"1958",
"1957",
"1956",
"1955",
"1954",
"1953",
"1952",
"1951",
"1950",
"1949",
"1948",
"1947",
"1946",
"1945",
"1944",
"1943",
"1942",
"1941",
"1940",
"1939",
"1938",
"1937",
"1936",
"1935",
"1934",
"1933",
"1932",
"1931",
"1930",
"1929",
"1928",
"1927",
"1926",
"1925",
"1924",
"1923",
"1922",
"1921",
"1920",
"1919",
"1918",
"1917",
"1916",
"1915",
"1914",
"1913",
"1912",
"1911",
"1910",
"1909",
"1908",
"1907",
"1906",
"1905",
"1904",
"1903",
"1902",
"1901",
"1900",
"1899",
"1898",
"1897",
"1896",
"1895",
"1894",
"1893",
"1892",
"1891",
"1890",
"1889",
"1888",
"1887",
"1886",
"1885",
"1884",
"1883",
"1882",
"1881",
"1880",
"1879",
"1878",
"1877",
"1876",
"1875",
"1874",
"1873",
"1872",
"1871",
"1870",
"1869",
"1868",
"1867",
"1866",
"1865",
"1864",
"1863",
"1862",
"1861",
"1860",
"1859",
"1858",
"1857",
"1856",
"1855",
"1854",
"1853",
"1852",
"1851",
"1850",
"1849",
"1848",
"1847",
"1846",
"1845",
"1844",
"1843",
"1842",
"1841",
"1840",
"1839",
"1838",
"1837",
"1836",
"1835",
"1834",
"1833",
"1832",
"1831",
"1830",
"1829",
"1828",
"1827",
"1826",
"1825",
"1824",
"1823",
"1822",
"1821",
"1820",
"1819",
"1818",
"1817",
"1816",
"1815",
"1814",
"1813",
"1812",
"1811",
"1810",
"1809",
"1808",
"1807",
"1806",
"1805",
"1804",
"1803",
"1802",
"1801",
"1800"
],
"year_to": [
"",
"2023",
"2022",
"2021",
"2020",
"2019",
"2018",
"2017",
"2016",
"2015",
"2014",
"2013",
"2012",
"2011",
"2010",
"2009",
"2008",
"2007",
"2006",
"2005",
"2004",
"2003",
"2002",
"2001",
"2000",
"1999",
"1998",
"1997",
"1996",
"1995",
"1994",
"1993",
"1992",
"1991",
"1990",
"1989",
"1988",
"1987",
"1986",
"1985",
"1984",
"1983",
"1982",
"1981",
"1980",
"1979",
"1978",
"1977",
"1976",
"1975",
"1974",
"1973",
"1972",
"1971",
"1970",
"1969",
"1968",
"1967",
"1966",
"1965",
"1964",
"1963",
"1962",
"1961",
"1960",
"1959",
"1958",
"1957",
"1956",
"1955",
"1954",
"1953",
"1952",
"1951",
"1950",
"1949",
"1948",
"1947",
"1946",
"1945",
"1944",
"1943",
"1942",
"1941",
"1940",
"1939",
"1938",
"1937",
"1936",
"1935",
"1934",
"1933",
"1932",
"1931",
"1930",
"1929",
"1928",
"1927",
"1926",
"1925",
"1924",
"1923",
"1922",
"1921",
"1920",
"1919",
"1918",
"1917",
"1916",
"1915",
"1914",
"1913",
"1912",
"1911",
"1910",
"1909",
"1908",
"1907",
"1906",
"1905",
"1904",
"1903",
"1902",
"1901",
"1900",
"1899",
"1898",
"1897",
"1896",
"1895",
"1894",
"1893",
"1892",
"1891",
"1890",
"1889",
"1888",
"1887",
"1886",
"1885",
"1884",
"1883",
"1882",
"1881",
"1880",
"1879",
"1878",
"1877",
"1876",
"1875",
"1874",
"1873",
"1872",
"1871",
"1870",
"1869",
"1868",
"1867",
"1866",
"1865",
"1864",
"1863",
"1862",
"1861",
"1860",
"1859",
"1858",
"1857",
"1856",
"1855",
"1854",
"1853",
"1852",
"1851",
"1850",
"1849",
"1848",
"1847",
"1846",
"1845",
"1844",
"1843",
"1842",
"1841",
"1840",
"1839",
"1838",
"1837",
"1836",
"1835",
"1834",
"1833",
"1832",
"1831",
"1830",
"1829",
"1828",
"1827",
"1826",
"1825",
"1824",
"1823",
"1822",
"1821",
"1820",
"1819",
"1818",
"1817",
"1816",
"1815",
"1814",
"1813",
"1812",
"1811",
"1810",
"1809",
"1808",
"1807",
"1806",
"1805",
"1804",
"1803",
"1802",
"1801",
"1800"
]
},
"data_type": "traits_v1",
"languages": {
"af": "afrikaans",
"ak": "akan",
"am": "amharic",
"ar": "arabic",
"as": "assamese",
"az": "azerbaijani",
"be": "belarusian",
"bg": "bulgarian",
"bm": "bambara",
"bo": "tibetan",
"br": "breton",
"bs": "bosnian",
"ca": "catalan",
"ce": "chechen",
"cs": "czech",
"cv": "chuvash",
"cy": "welsh",
"da": "danish",
"de": "german",
"dz": "dzongkha",
"ee": "ewe",
"el": "greek",
"en": "english",
"eo": "esperanto",
"es": "spanish",
"et": "estonian",
"eu": "basque",
"fa": "persian",
"fi": "finnish",
"fo": "faroese",
"fr": "french",
"ga": "irish",
"gl": "galician",
"gu": "gujarati",
"gv": "manx",
"ha": "hausa",
"he": "hebrew",
"hi": "hindi",
"hr": "croatian",
"hu": "hungarian",
"hy": "armenian",
"ia": "interlingua",
"id": "indonesian",
"ig": "igbo",
"is": "icelandic",
"it": "italian",
"ja": "japanese",
"jv": "javanese",
"ka": "georgian",
"ki": "kikuyu",
"kk": "kazakh",
"kl": "kalaallisut",
"kn": "kannada",
"ko": "korean",
"ks": "kashmiri",
"ku": "kurdish",
"kw": "cornish",
"ky": "kyrgyz",
"lb": "luxembourgish",
"lg": "ganda",
"ln": "lingala",
"lo": "lao",
"lt": "lithuanian",
"lu": "luba-katanga",
"lv": "latvian",
"mg": "malagasy",
"mk": "macedonian",
"ml": "malayalam",
"mn": "mongolian",
"mr": "marathi",
"mt": "maltese",
"my": "burmese",
"ne": "nepali",
"nl": "dutch",
"no": "norwegian",
"oc": "occitan",
"om": "oromo",
"or": "odia",
"pa": "punjabi",
"pl": "polish",
"ps": "pashto",
"pt": "portuguese",
"qu": "quechua",
"rm": "romansh",
"rn": "rundi",
"ro": "romanian",
"ru": "russian",
"rw": "kinyarwanda",
"sa": "sanskrit",
"sc": "sardinian",
"sd": "sindhi",
"sg": "sango",
"si": "sinhala",
"sk": "slovak",
"sl": "slovenian",
"sn": "shona",
"so": "somali",
"sq": "albanian",
"sr": "serbian",
"su": "sundanese",
"sv": "swedish",
"sw": "swahili",
"ta": "tamil",
"te": "telugu",
"tg": "tajik",
"th": "thai",
"ti": "tigrinya",
"tk": "turkmen",
"tr": "turkish",
"tt": "tatar",
"uk": "ukrainian",
"ur": "urdu",
"uz": "uzbek",
"vi": "vietnamese",
"wo": "wolof",
"xh": "xhosa",
"yi": "yiddish",
"yo": "yoruba",
"zh": "chinese",
"zu": "zulu"
},
"regions": {}
}
}
}


@@ -1,94 +1,221 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library
project for file-sharing access to scholarly journal articles, academic texts
and general-interest books. It began as a mirror of Library Genesis, from which
most of its books originate.

.. _Z-Library: https://zlibrary-global.se/

Configuration
=============

The engine has the following additional settings:

- :py:obj:`zlib_year_from`
- :py:obj:`zlib_year_to`
- :py:obj:`zlib_ext`

With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Z-Library.  For example an engine to search
only for EPUB from 2010 to 2020.

.. code:: yaml

  - name: z-library 2010s epub
    engine: zlibrary
    shortcut: zlib2010s
    zlib_year_from: '2010'
    zlib_year_to: '2020'
    zlib_ext: 'EPUB'

Implementations
===============

"""
from __future__ import annotations
from typing import TYPE_CHECKING
from typing import List, Dict, Any, Optional
from datetime import datetime
from urllib.parse import quote
from lxml import html
from flask_babel import gettext

from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS

if TYPE_CHECKING:
    import httpx
    import logging

    logger: logging.Logger

# about
about: Dict[str, Any] = {
    "website": "https://zlibrary-global.se",
    "wikidata_id": "Q104863992",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

categories: List[str] = ["files"]
paging: bool = True
base_url: str = "https://zlibrary-global.se"

zlib_year_from: str = ""
"""Filter z-library's results by year from. E.g '2010'.
"""

zlib_year_to: str = ""
"""Filter z-library's results by year to. E.g. '2010'.
"""

zlib_ext: str = ""
"""Filter z-library's results by a file ending. Common filters for example are
``PDF`` and ``EPUB``.
"""


def init(engine_settings=None) -> None:  # pylint: disable=unused-argument
    """Check of engine's settings."""
    traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])

    if zlib_ext and zlib_ext not in traits.custom["ext"]:
        raise ValueError(f"invalid setting ext: {zlib_ext}")
    if zlib_year_from and zlib_year_from not in traits.custom["year_from"]:
        raise ValueError(f"invalid setting year_from: {zlib_year_from}")
    if zlib_year_to and zlib_year_to not in traits.custom["year_to"]:
        raise ValueError(f"invalid setting year_to: {zlib_year_to}")


def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
    lang: str = traits.get_language(params["language"], traits.all_locale)  # type: ignore
    search_url: str = (
        base_url
        + "/s/{search_query}/?page={pageno}"
        + "&yearFrom={zlib_year_from}"
        + "&yearTo={zlib_year_to}"
        + "&languages[]={lang}"
        + "&extensions[]={zlib_ext}"
    )
    params["url"] = search_url.format(
        search_query=quote(query),
        pageno=params["pageno"],
        lang=lang,
        zlib_year_from=zlib_year_from,
        zlib_year_to=zlib_year_to,
        zlib_ext=zlib_ext,
    )
    return params


def response(resp: httpx.Response) -> List[Dict[str, Any]]:
    results: List[Dict[str, Any]] = []
    dom = html.fromstring(resp.text)

    for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
        results.append(_parse_result(item))

    return results


def _text(item, selector: str) -> str | None:
    return extract_text(eval_xpath(item, selector))


i18n_language = gettext("Language")
i18n_book_rating = gettext("Book rating")
i18n_file_quality = gettext("File quality")


def _parse_result(item) -> Dict[str, Any]:

    author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]')

    result = {
        "template": "paper.html",
        "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0],
        "title": _text(item, './/*[@itemprop="name"]'),
        "authors": [extract_text(author) for author in author_elements],
        "publisher": _text(item, './/a[@title="Publisher"]'),
        "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'),
        "img_src": _text(item, './/img[contains(@class, "cover")]/@data-src'),
    }

    year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]')
    if year:
        result["publishedDate"] = datetime.strptime(year, '%Y')

    content = []
    language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]')
    if language:
        content.append(f"{i18n_language}: {language.capitalize()}")
    book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]')
    if book_rating and float(book_rating):
        content.append(f"{i18n_book_rating}: {book_rating}")
    file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]')
    if file_quality and float(file_quality):
        content.append(f"{i18n_file_quality}: {file_quality}")
    result["content"] = " | ".join(content)

    return result


def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fetch languages and other search arguments from zlibrary's search form."""
    # pylint: disable=import-outside-toplevel
    import babel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag

    engine_traits.all_locale = ""
    engine_traits.custom["ext"] = []
    engine_traits.custom["year_from"] = []
    engine_traits.custom["year_to"] = []

    resp = get(base_url)
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from zlibrary's search page is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"):
        engine_traits.custom["year_from"].append(year.get("value"))

    for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"):
        engine_traits.custom["year_to"].append(year.get("value"))

    for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"):
        value: Optional[str] = ext.get("value")
        if value is None:
            value = ""
        engine_traits.custom["ext"].append(value)

    # Handle languages
    # Z-library uses English names for languages, so we need to map them to their respective locales
    language_name_locale_map: Dict[str, babel.Locale] = {}
    for locale in babel.core.localedata.locale_identifiers():  # type: ignore
        # Create a Locale object for the current locale
        loc = babel.Locale.parse(locale)
        language_name_locale_map[loc.english_name.lower()] = loc  # type: ignore

    for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):
        eng_lang = x.get("value")
        if eng_lang is None:
            continue
        try:
            locale = language_name_locale_map[eng_lang.lower()]
        except KeyError:
            # silently ignore unknown languages
            # print("ERROR: %s is unknown by babel" % (eng_lang))
            continue
        sxng_lang = language_tag(locale)
        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = eng_lang


@@ -909,19 +909,11 @@ engines:
      require_api_key: false
      results: HTML

  - name: z-library
    engine: zlibrary
    shortcut: zlib
    categories: files
    timeout: 7.0

  - name: library of congress
    engine: loc


@@ -41,6 +41,7 @@ sxng_locales = (
('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'),
('et', 'Eesti', '', 'Estonian', '\U0001f310'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa', 'فارسی', '', 'Persian', '\U0001f310'),
('fi', 'Suomi', '', 'Finnish', '\U0001f310'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fr', 'Français', '', 'French', '\U0001f310'),