mirror of
https://github.com/searxng/searxng.git
synced 2025-03-12 23:42:40 +00:00
[fix] presearch engine: domain sometimes included in beginning of titles
This commit is contained in:
parent
73d50f5748
commit
a88b4d7036
1 changed file with 24 additions and 3 deletions
|
@ -64,7 +64,7 @@ Implementations
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode, urlparse
|
||||||
from searx import locales
|
from searx import locales
|
||||||
from searx.network import get
|
from searx.network import get
|
||||||
from searx.utils import gen_useragent, html_to_text
|
from searx.utils import gen_useragent, html_to_text
|
||||||
|
@ -155,13 +155,34 @@ def _strip_leading_strings(text):
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_title(title, url):
    """Return the result title as plain text, without a glued-on domain.

    Presearch sometimes returns titles where the result's domain is prepended
    directly to the real title with no separator, and the title may also
    carry HTML markup.  Both are cleaned up here, e.g.::

        "translate.google.co.in<em>Google</em> Translate" -> "Google Translate"

    :param title: raw title string as delivered by Presearch.
    :param url: URL of the same result; its network location (``netloc``) is
        the prefix that gets looked for in the title.
    :return: the title after ``html_to_text`` conversion, with the leading
        domain removed when it is directly followed by the title text.
    """
    host = urlparse(url).netloc
    text = html_to_text(title)

    # Nothing to strip: the title does not begin with the domain, or the
    # title is nothing but the domain.
    if not text.startswith(host) or len(text) <= len(host):
        return text

    # A domain followed by "/" or " " is treated as an intentional part of
    # the title (e.g. a path or a properly spaced heading) and is kept.
    if text.startswith((host + "/", host + " ")):
        return text

    # translate.google.co.inGoogle Translate -> Google Translate
    return text[len(host):]
|
||||||
|
|
||||||
|
|
||||||
def parse_search_query(json_results):
|
def parse_search_query(json_results):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
|
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
|
||||||
result = {
|
result = {
|
||||||
'url': item['link'],
|
'url': item['link'],
|
||||||
'title': html_to_text(item['title']),
|
'title': _fix_title(item['title'], item['link']),
|
||||||
'thumbnail': item['image'],
|
'thumbnail': item['image'],
|
||||||
'content': '',
|
'content': '',
|
||||||
'metadata': item.get('source'),
|
'metadata': item.get('source'),
|
||||||
|
@ -171,7 +192,7 @@ def parse_search_query(json_results):
|
||||||
for item in json_results.get('standardResults', []):
|
for item in json_results.get('standardResults', []):
|
||||||
result = {
|
result = {
|
||||||
'url': item['link'],
|
'url': item['link'],
|
||||||
'title': html_to_text(item['title']),
|
'title': _fix_title(item['title'], item['link']),
|
||||||
'content': html_to_text(item['description']),
|
'content': html_to_text(item['description']),
|
||||||
}
|
}
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
Loading…
Reference in a new issue