From dcf1d408a53a0dbf61e4bd545537508b42153158 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 20 Sep 2022 18:04:21 +0200 Subject: [PATCH] [fix] google-news: origin result does not have a content area The google news are in a rework, the content area of a news item has been removed. Closes: https://github.com/searxng/searxng/issues/1790 Signed-off-by: Markus Heiser --- searx/engines/google_news.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 8f5a4b104..87867d65a 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -150,24 +150,12 @@ def response(resp): # the first

<h3> tag in the <article>
contains the title of the link title = extract_text(eval_xpath(result, './article/h3[1]')) - # the first
<div> tag in the <article>
contains the content of the link - content = extract_text(eval_xpath(result, './article/div[1]')) + # The pub_date is mostly a string like 'yesterday', not a real + # timezone date or time. Therefore we can't use publishedDate. + pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time')) + pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a')) - # the second
<div> tag contains origin publisher and the publishing date - - pub_date = extract_text(eval_xpath(result, './article/div[2]//time')) - pub_origin = extract_text(eval_xpath(result, './article/div[2]//a')) - - pub_info = [] - if pub_origin: - pub_info.append(pub_origin) - if pub_date: - # The pub_date is mostly a string like 'yesertday', not a real - # timezone date or time. Therefore we can't use publishedDate. - pub_info.append(pub_date) - pub_info = ', '.join(pub_info) - if pub_info: - content = pub_info + ': ' + content + content = ' / '.join([x for x in [pub_origin, pub_date] if x]) # The image URL is located in a preceding sibling tag, e.g.: # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"