From 1ce09df9aa4d08d2125dca8f83906c5954048d0a Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 Nov 2021 01:14:17 +0100 Subject: [PATCH] [fix] google video engine - rework of the HTML parser The Google video response has changed slightly, so a rework of the parser was needed. Signed-off-by: Markus Heiser --- searx/engines/google_videos.py | 53 ++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 86dc1867d..abf046f4c 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -31,11 +31,8 @@ from searx.engines.google import ( get_lang_info, time_range_dict, filter_mapping, - results_xpath, g_section_with_header, title_xpath, - href_xpath, - content_xpath, suggestion_xpath, detect_google_sorry, ) @@ -73,11 +70,27 @@ def _re(regexpr): RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) return RE_CACHE[regexpr] + +def scrap_out_thumbs_src(dom): + ret_val = {} + thumb_name = 'dimg_' + for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): + _script = script.text + # "dimg_35":"https://i.ytimg.c....", + _dimurl = _re("s='([^']*)").findall( _script) + for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script): + v = v.replace(r'\u003d','=') + v = v.replace(r'\u0026','&') + ret_val[k] = v + logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) + return ret_val + + def scrap_out_thumbs(dom): """Scrap out thumbnail data from