From 84abab0808669bb237843e88e725946e74f44642 Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Tue, 23 Jul 2024 21:20:21 +0200
Subject: [PATCH] [feat] engine: implementation of geizhals.de

---
Note: the blob hashes on the "index" lines of the two new files predate the
review changes below; they are informational only.

 docs/dev/engines/online/geizhals.rst |   8 ++
 searx/engines/geizhals.py            | 126 +++++++++++++++++++++++++++
 searx/settings.yml                   |   5 +
 3 files changed, 139 insertions(+)
 create mode 100644 docs/dev/engines/online/geizhals.rst
 create mode 100644 searx/engines/geizhals.py

diff --git a/docs/dev/engines/online/geizhals.rst b/docs/dev/engines/online/geizhals.rst
new file mode 100644
index 000000000..766eb5f59
--- /dev/null
+++ b/docs/dev/engines/online/geizhals.rst
@@ -0,0 +1,8 @@
+.. _geizhals engine:
+
+========
+Geizhals
+========
+
+.. automodule:: searx.engines.geizhals
+   :members:
diff --git a/searx/engines/geizhals.py b/searx/engines/geizhals.py
new file mode 100644
index 000000000..eae110b20
--- /dev/null
+++ b/searx/engines/geizhals.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Geizhals is a German website to compare the price of a product on the
+most common German shopping sites and find the lowest price.
+
+The sorting of the search results can be influenced by adding one of the
+following directives to the search term:
+
+``sort:asc`` or ``sort:price``
+  To sort by price in ascending order.
+
+``sort:desc``
+  To sort by price in descending order.
+
+Without a directive the engine setting ``sort_order`` is used (``relevance`` by
+default).
+
+"""
+
+import re
+
+from urllib.parse import urlencode, urljoin
+from lxml import html
+
+from searx.utils import eval_xpath, eval_xpath_list, extract_text
+
+about = {
+    'website': 'https://geizhals.de',
+    'wikidata_id': 'Q15977657',
+    'use_official_api': False,
+    'official_api_documentation': None,
+    'require_api_key': False,
+    'results': 'HTML',
+    'language': 'de',
+}
+paging = True
+categories = ['shopping']
+
+base_url = "https://geizhals.de"
+# Default sort order; expected to be a key of ``sort_order_map``.
+sort_order = 'relevance'
+
+# Matches a ``sort:NAME`` directive inside the search term.
+SORT_RE = re.compile(r"sort:(\w+)")
+# Maps a sort order name to the value of the ``sort`` URL argument (``None``: omit it).
+sort_order_map = {
+    'relevance': None,
+    'price': 'p',
+    'asc': 'p',
+    'desc': '-p',
+}
+
+
+def request(query, params):
+    """Build the Geizhals search URL and store it in ``params['url']``.
+
+    A ``sort:NAME`` directive in ``query`` selects the sort order and is
+    removed from the search term; an unknown name keeps the default.
+    """
+    sort = sort_order_map.get(sort_order)
+
+    sort_directive = SORT_RE.search(query)
+    if sort_directive:
+        sort = sort_order_map.get(sort_directive.group(1), sort)
+        # drop the directive together with the whitespace it leaves behind
+        query = ' '.join(SORT_RE.sub("", query).split())
+        # NOTE(review): ``logger`` is not defined in this module, presumably
+        # it is injected by the searx engine loader; confirm.
+        logger.debug(query)
+
+    args = {
+        'fs': query,
+        'pg': params['pageno'],
+        'toggle_all': 1,  # load item specs
+    }
+    if sort:
+        # urlencode() would turn a ``None`` value into the literal "sort=None"
+        args['sort'] = sort
+    params['url'] = f"{base_url}/?{urlencode(args)}"
+
+    return params
+
+
+def response(resp):
+    """Parse the Geizhals result page into a list of product results.
+
+    Entries without a link are skipped; the ``price`` field is only set when
+    a price could be extracted.
+    """
+    results = []
+
+    dom = html.fromstring(resp.text)
+    for result in eval_xpath_list(dom, "//article[contains(@class, 'listview__item')]"):
+        href = extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__name-link')]/@href"))
+        if not href:
+            # without a link the entry cannot be presented as a result
+            continue
+
+        content = [
+            f"{extract_text(eval_xpath(spec, './dt'))}: {extract_text(eval_xpath(spec, './dd'))}"
+            for spec in eval_xpath_list(result, ".//div[contains(@class, 'specs-grid__item')]")
+        ]
+
+        metadata = [
+            extract_text(eval_xpath(result, ".//div[contains(@class, 'stars-rating-label')]")),
+            extract_text(eval_xpath(result, ".//div[contains(@class, 'listview__offercount')]")),
+        ]
+
+        item = {
+            'template': 'products.html',
+            # urljoin() copes with relative as well as root-relative links
+            'url': urljoin(base_url + "/", href),
+            'title': extract_text(eval_xpath(result, ".//h3[contains(@class, 'listview__name')]")),
+            'content': ' | '.join(content),
+            'thumbnail': extract_text(eval_xpath(result, ".//img[contains(@class, 'listview__image')]/@src")),
+            'metadata': ', '.join(entry for entry in metadata if entry),
+        }
+
+        # the price is the second space separated token of the link text
+        price_text = extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__price-link')]"))
+        price_parts = (price_text or "").split(" ")
+        if len(price_parts) > 1:
+            item['price'] = "Bestes Angebot: " + price_parts[1] + "€"
+
+        results.append(item)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index a26d60026..93abe9566 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -797,6 +797,11 @@ engines:
     timeout: 8.0
     disabled: true
 
+  - name: geizhals
+    engine: geizhals
+    shortcut: geiz
+    disabled: true
+
   - name: genius
     engine: genius
     shortcut: gen