From df5f8d0e8ef2ef49f2ca424305ce3436a6145189 Mon Sep 17 00:00:00 2001 From: Emilien Devos Date: Tue, 20 Sep 2022 20:35:55 +0200 Subject: [PATCH 1/2] use the internal API for google images --- searx/engines/google_images.py | 185 +++++++-------------------------- 1 file changed, 35 insertions(+), 150 deletions(-) diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index e1f676dd6..ab93f5580 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,6 +1,12 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google images engine. +"""This is the implementation of the google images engine using the google internal API used by the Google Go Android app. +This internal API offers results in +- JSON (_fmt:json) +- Protobuf (_fmt:pb) +- Protobuf compressed? (_fmt:pc) +- HTML (_fmt:html) +- Protobuf encoded in JSON (_fmt:jspb). .. admonition:: Content-Security-Policy (CSP) @@ -13,16 +19,8 @@ https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ -import re -from urllib.parse import urlencode, unquote -from lxml import html - -from searx.utils import ( - eval_xpath, - eval_xpath_list, - eval_xpath_getindex, - extract_text, -) +from urllib.parse import urlencode +from json import loads from searx.engines.google import ( get_lang_info, @@ -42,12 +40,12 @@ about = { "official_api_documentation": 'https://developers.google.com/custom-search', "use_official_api": False, "require_api_key": False, - "results": 'HTML', + "results": 'JSON', } # engine dependent config categories = ['images', 'web'] -paging = False +paging = True use_locale_domain = True time_range_support = True safesearch = True @@ -56,74 +54,8 @@ send_accept_language_header = True filter_mapping = {0: 'images', 1: 'active', 2: 'active'} -def scrap_out_thumbs(dom): - """Scrap out thumbnail data from - # - # - # The second script contains the URLs of the images. - - # The AF_initDataCallback(..) 
is called with very large dictionary, that - # looks like JSON but it is not JSON since it contains JS variables and - # constants like 'null' (we can't use a JSON parser for). - # - # The alternative is to parse the entire