From 35dd0694027baef2c2eb18d27bd0f5dcbcc999ad Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Thu, 11 Feb 2021 12:32:58 +0100
Subject: [PATCH] [fix] fix seznam engine no paging support

---
Reviewer note: the line structure of this patch was restored from a
whitespace-collapsed copy, so the indentation of the settings.yml hunk is
reconstructed and should be checked against the target tree. request() now
skips hidden inputs without a name and sends '' for inputs without a value.
The commit and blob hashes still refer to the original 64-line file.

 searx/engines/seznam.py | 80 +++++++++++++++++++++++++++++++++++++++++
 searx/settings.yml      | 18 +-----------------
 2 files changed, 81 insertions(+), 17 deletions(-)
 create mode 100644 searx/engines/seznam.py

diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py
new file mode 100644
index 000000000..1df92a845
--- /dev/null
+++ b/searx/engines/seznam.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Seznam: results are parsed from the HTML pages of https://search.seznam.cz/
+"""
+
+from urllib.parse import urlencode, urlparse
+from lxml import html
+from searx.poolrequests import get
+from searx.exceptions import SearxEngineAccessDeniedException
+from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
+
+# about
+about = {
+    "website": "https://www.seznam.cz/",
+    "wikidata_id": "Q3490485",
+    "official_api_documentation": "https://api.sklik.cz/",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+base_url = 'https://search.seznam.cz/'
+
+
+def request(query, params):
+    """Build the search URL, replaying the hidden inputs of the index page.
+
+    The page at ``base_url`` is fetched first. Every named hidden ``<input>``
+    found in it is added to the query string next to ``q``, and the cookies of
+    that first response are stored in ``params['cookies']``.
+    """
+    response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
+    dom = html.fromstring(response_index.text)
+
+    url_params = {'q': query}
+    for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
+        name = e.get('name')
+        if not name:
+            # an input without a name is not part of a submitted form
+            continue
+        # a missing value attribute means an empty string, not "None"
+        url_params[name] = e.get('value', '')
+
+    params['url'] = base_url + '?' + urlencode(url_params)
+    params['cookies'] = response_index.cookies
+    return params
+
+
+def response(resp):
+    """Parse a result page into result and suggestion dicts.
+
+    Raises SearxEngineAccessDeniedException when the path of the response URL
+    starts with ``/verify``.
+    """
+    resp_url = urlparse(resp.url)
+    if resp_url.path.startswith('/verify'):
+        raise SearxEngineAccessDeniedException()
+
+    results = []
+
+    dom = html.fromstring(resp.content.decode())
+    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
+        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
+        if dot_data is None:
+            # no data-dot-data attribute: parsed as a regular result
+            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
+            results.append({
+                'url': title_element.get('href'),
+                'title': extract_text(title_element),
+                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
+            })
+        elif dot_data == '{"reporter_name":"hint/related/relates"}':
+            # related searches: every list item becomes a suggestion
+            suggestions_element = eval_xpath_getindex(result_element,
+                                                      './div/div[@data-dot="main-box"]', 0, default=None)
+            if suggestions_element is not None:
+                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
+                    results.append({'suggestion': extract_text(suggestion)})
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 20b6c18c1..81a5190e9 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1170,24 +1170,8 @@ engines:
 
   - name : seznam
     shortcut: szn
-    engine: xpath
-    paging : True
-    search_url : https://search.seznam.cz/?q={query}&count=10&from={pageno}
-    results_xpath: //div[@class="Page-content"]//div[contains(@class, "Result ")]
-    url_xpath : ./h3/a/@href
-    title_xpath : ./h3
-    content_xpath : .//p[@class="Result-description"]
-    suggestion_xpath: //div[@class="Related-container"]//div[@class="RelatedItem"]/div/span/a
-    first_page_num : 0
-    page_size : 10
+    engine: seznam
     disabled : True
-    about:
-      website: https://www.seznam.cz/
-      wikidata_id: Q3490485
-      official_api_documentation: https://api.sklik.cz/
-      use_official_api: false
-      require_api_key: false
-      results: HTML
 
   - name : mojeek
     shortcut: mjk