From a88b4d7036002639ee09d01efb0279c87b8e23c3 Mon Sep 17 00:00:00 2001
From: Aadniz <8147434+Aadniz@users.noreply.github.com>
Date: Fri, 7 Mar 2025 23:28:28 +0100
Subject: [PATCH] [fix] presearch engine: domain sometimes included in beginning of titles

---
 searx/engines/presearch.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index 8a2614bb5..870f2383b 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -64,7 +64,7 @@ Implementations
 
 """
 
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse
 from searx import locales
 from searx.network import get
 from searx.utils import gen_useragent, html_to_text
@@ -155,13 +155,34 @@ def _strip_leading_strings(text):
     return text.strip()
 
 
+def _fix_title(title, url):
+    """
+    Return ``title`` converted with html_to_text and without a leading domain.
+    Presearch sometimes glues the netloc of ``url`` onto the front of the title,
+    e.g. "translate.google.co.inGoogle Translate" becomes "Google Translate".
+    """
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc
+    title = html_to_text(title)
+    # Strip the domain only when it is fused directly to further text; a title that is
+    # just the domain, or continues with "/" or " ", is left untouched
+    if (
+        title.startswith(domain)
+        and len(title) > len(domain)
+        and not title.startswith(domain + "/")
+        and not title.startswith(domain + " ")
+    ):
+        title = title.removeprefix(domain)
+    return title
+
+
 def parse_search_query(json_results):
     results = []
 
     for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
         result = {
             'url': item['link'],
-            'title': html_to_text(item['title']),
+            'title': _fix_title(item['title'], item['link']),
             'thumbnail': item['image'],
             'content': '',
             'metadata': item.get('source'),
@@ -171,7 +192,7 @@ def parse_search_query(json_results):
     for item in json_results.get('standardResults', []):
         result = {
             'url': item['link'],
-            'title': html_to_text(item['title']),
+            'title': _fix_title(item['title'], item['link']),
             'content': html_to_text(item['description']),
         }
         results.append(result)