mirror of
https://github.com/searxng/searxng.git
synced 2024-12-01 15:11:03 +00:00
[feat] brave: fix empty content and parse published dates
This commit is contained in:
parent
90072eb6ff
commit
efbee96b1d
1 changed file with 21 additions and 1 deletion
|
@ -103,11 +103,13 @@ from urllib.parse import (
|
||||||
parse_qs,
|
parse_qs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from dateutil import parser
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
from searx import locales
|
from searx import locales
|
||||||
from searx.utils import (
|
from searx.utils import (
|
||||||
extract_text,
|
extract_text,
|
||||||
|
eval_xpath,
|
||||||
eval_xpath_list,
|
eval_xpath_list,
|
||||||
eval_xpath_getindex,
|
eval_xpath_getindex,
|
||||||
js_variable_to_python,
|
js_variable_to_python,
|
||||||
|
@ -207,6 +209,16 @@ def request(query, params):
|
||||||
logger.debug("cookies %s", params['cookies'])
|
logger.debug("cookies %s", params['cookies'])
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_published_date(published_date_raw):
|
||||||
|
if published_date_raw is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return parser.parse(published_date_raw)
|
||||||
|
except parser.ParserError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
|
|
||||||
if brave_category == 'search':
|
if brave_category == 'search':
|
||||||
|
@ -252,13 +264,15 @@ def _parse_search(resp):
|
||||||
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
|
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content_tag = eval_xpath_getindex(result, './/div[@class="snippet-description"]', 0, default='')
|
content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
|
||||||
|
pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
|
||||||
img_src = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
|
img_src = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
|
||||||
|
|
||||||
item = {
|
item = {
|
||||||
'url': url,
|
'url': url,
|
||||||
'title': extract_text(title_tag),
|
'title': extract_text(title_tag),
|
||||||
'content': extract_text(content_tag),
|
'content': extract_text(content_tag),
|
||||||
|
'publishedDate': _extract_published_date(pub_date_raw),
|
||||||
'img_src': img_src,
|
'img_src': img_src,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -275,6 +289,10 @@ def _parse_search(resp):
|
||||||
item['iframe_src'] = iframe_src
|
item['iframe_src'] = iframe_src
|
||||||
item['template'] = 'videos.html'
|
item['template'] = 'videos.html'
|
||||||
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
||||||
|
pub_date_raw = extract_text(
|
||||||
|
eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
|
||||||
|
)
|
||||||
|
item['publishedDate'] = _extract_published_date(pub_date_raw)
|
||||||
else:
|
else:
|
||||||
item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
||||||
|
|
||||||
|
@ -300,6 +318,7 @@ def _parse_news(json_resp):
|
||||||
'url': result['url'],
|
'url': result['url'],
|
||||||
'title': result['title'],
|
'title': result['title'],
|
||||||
'content': result['description'],
|
'content': result['description'],
|
||||||
|
'publishedDate': _extract_published_date(result['age']),
|
||||||
}
|
}
|
||||||
if result['thumbnail'] is not None:
|
if result['thumbnail'] is not None:
|
||||||
item['img_src'] = result['thumbnail']['src']
|
item['img_src'] = result['thumbnail']['src']
|
||||||
|
@ -339,6 +358,7 @@ def _parse_videos(json_resp):
|
||||||
'template': 'videos.html',
|
'template': 'videos.html',
|
||||||
'length': result['video']['duration'],
|
'length': result['video']['duration'],
|
||||||
'duration': result['video']['duration'],
|
'duration': result['video']['duration'],
|
||||||
|
'publishedDate': _extract_published_date(result['age']),
|
||||||
}
|
}
|
||||||
|
|
||||||
if result['thumbnail'] is not None:
|
if result['thumbnail'] is not None:
|
||||||
|
|
Loading…
Reference in a new issue