mirror of
https://github.com/searxng/searxng.git
synced 2024-11-23 03:11:00 +00:00
[fix] engine: Anna's Archive - grep results from '.js-scroll-hidden' elements
The rendering of the WEB page is very strange; except the first position all other positions of Anna's result page are enclosed in SGML comments. These comments are *uncommented* by some JS code, see query of the class '.js-scroll-hidden' in Anna's HTML template [1]. [1] https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
e2df6b77a3
commit
87e7926ae9
1 changed files with 22 additions and 20 deletions
|
@ -97,14 +97,6 @@ aa_ext: str = ''
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# xpath queries
|
|
||||||
xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
|
|
||||||
xpath_url: str = ".//@href"
|
|
||||||
xpath_title: str = ".//h3/text()[1]"
|
|
||||||
xpath_authors: str = './/div[contains(@class, "italic")]'
|
|
||||||
xpath_publisher: str = './/div[contains(@class, "text-sm")]'
|
|
||||||
xpath_file_info: str = './/div[contains(@class, "text-xs")]'
|
|
||||||
|
|
||||||
|
|
||||||
def init(engine_settings=None): # pylint: disable=unused-argument
|
def init(engine_settings=None): # pylint: disable=unused-argument
|
||||||
"""Check of engine's settings."""
|
"""Check of engine's settings."""
|
||||||
|
@ -131,24 +123,34 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
|
||||||
results: List[Dict[str, Optional[str]]] = []
|
results: List[Dict[str, Optional[str]]] = []
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
|
||||||
for item in dom.xpath(xpath_results):
|
for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
|
||||||
result: Dict[str, Optional[str]] = {}
|
results.append(_get_result(item))
|
||||||
|
|
||||||
result["url"] = base_url + item.xpath(xpath_url)[0]
|
# The rendering of the WEB page is very strange; except the first position
|
||||||
|
# all other positions of Anna's result page are enclosed in SGML comments.
|
||||||
|
# These comments are *uncommented* by some JS code, see query of class
|
||||||
|
# '.js-scroll-hidden' in Anna's HTML template:
|
||||||
|
# https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
|
||||||
|
|
||||||
result["title"] = extract_text(eval_xpath(item, xpath_title))
|
for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
|
||||||
|
item = html.fromstring(item.xpath('./comment()')[0].text)
|
||||||
result["content"] = "{publisher}. {authors}. {file_info}".format(
|
results.append(_get_result(item))
|
||||||
authors=extract_text(eval_xpath(item, xpath_authors)),
|
|
||||||
publisher=extract_text(eval_xpath(item, xpath_publisher)),
|
|
||||||
file_info=extract_text(eval_xpath(item, xpath_file_info)),
|
|
||||||
)
|
|
||||||
|
|
||||||
results.append(result)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def _get_result(item):
    """Build one ``paper.html`` result dict from a single result element.

    :param item: element node of one search hit; its own ``href`` attribute
        is read for the link, and its descendants (``h3``, ``div`` and
        ``img`` nodes) are read for the remaining fields.
    :returns: dict with the keys ``template``, ``url``, ``title``,
        ``publisher``, ``authors`` (a one-element list), ``content`` and
        ``img_src``.  ``img_src`` is ``None`` when the hit has no ``<img>``.
    :raises IndexError: if ``item`` has no ``href`` attribute.
    """
    # A hit without a cover image must not abort parsing of the whole result
    # page, so the image lookup is evaluated up front and guarded instead of
    # being indexed blindly.
    img_srcs = item.xpath('.//img/@src')

    return {
        'template': 'paper.html',
        # the href on the result element is relative, prefix the engine's base URL
        'url': base_url + item.xpath('./@href')[0],
        'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
        'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
        'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
        'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
        'img_src': img_srcs[0] if img_srcs else None,
    }
|
||||||
|
|
||||||
|
|
||||||
def fetch_traits(engine_traits: EngineTraits):
|
def fetch_traits(engine_traits: EngineTraits):
|
||||||
"""Fetch languages and other search arguments from Anna's search form."""
|
"""Fetch languages and other search arguments from Anna's search form."""
|
||||||
# pylint: disable=import-outside-toplevel
|
# pylint: disable=import-outside-toplevel
|
||||||
|
|
Loading…
Reference in a new issue