mirror of
https://github.com/searxng/searxng.git
synced 2025-03-12 15:41:15 +00:00
[fix] presearch engine: domain sometimes included in beginning of titles
This commit is contained in:
parent
73d50f5748
commit
a88b4d7036
1 changed file with 24 additions and 3 deletions
|
@ -64,7 +64,7 @@ Implementations
|
|||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from urllib.parse import urlencode, urlparse
|
||||
from searx import locales
|
||||
from searx.network import get
|
||||
from searx.utils import gen_useragent, html_to_text
|
||||
|
@ -155,13 +155,34 @@ def _strip_leading_strings(text):
|
|||
return text.strip()
|
||||
|
||||
|
||||
def _fix_title(title, url):
    """Clean up a result title returned by Presearch.

    Presearch titles may contain HTML markup and may have the result's domain
    glued to the front of the title without any separator.  Both issues are
    removed here, e.g. ``translate.google.co.in<em>Google</em> Translate``
    becomes ``Google Translate``.

    The network location of *url* is stripped from the start of the title
    only when it is directly followed by a character other than ``/`` or a
    space.  A title that is exactly the domain, or where the domain is
    followed by ``/`` or a space, is returned without stripping.

    :param title: raw title string as delivered by Presearch (may hold HTML).
    :param url: URL of the result; its network location is the prefix that
        gets removed from the title.
    :return: the title converted by ``html_to_text``, without the glued-on
        domain prefix.
    """
    netloc = urlparse(url).netloc
    plain_title = html_to_text(title)

    # The title does not begin with the domain: nothing to strip.
    if not plain_title.startswith(netloc):
        return plain_title

    remainder = plain_title[len(netloc) :]

    # Leave the title alone when it consists of the domain only, or when the
    # domain is followed by "/" or " ":
    #   translate.google.co.inGoogle Translate -> Google Translate
    if remainder == "" or remainder[0] in ("/", " "):
        return plain_title

    return remainder
|
||||
|
||||
|
||||
def parse_search_query(json_results):
|
||||
results = []
|
||||
|
||||
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
|
||||
result = {
|
||||
'url': item['link'],
|
||||
'title': html_to_text(item['title']),
|
||||
'title': _fix_title(item['title'], item['link']),
|
||||
'thumbnail': item['image'],
|
||||
'content': '',
|
||||
'metadata': item.get('source'),
|
||||
|
@ -171,7 +192,7 @@ def parse_search_query(json_results):
|
|||
for item in json_results.get('standardResults', []):
|
||||
result = {
|
||||
'url': item['link'],
|
||||
'title': html_to_text(item['title']),
|
||||
'title': _fix_title(item['title'], item['link']),
|
||||
'content': html_to_text(item['description']),
|
||||
}
|
||||
results.append(result)
|
||||
|
|
Loading…
Reference in a new issue