[fix] engines: Google-Web & Google-Video (random arc_id)

Both engines have been reported to fail with ``TooManyRequests``; additionally,
the Google-Videos thumbnails needed a review.

Based on the research from @unixfox [1], this patch generates a new random
``arc_id`` every hour.

[1] https://github.com/searxng/searxng/issues/4435#issuecomment-2703279522
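
For illustration, a minimal standalone sketch of the generated value (the
character set, the 23-character length and the ``arc_id:srp_<random>_1<start>``
layout are taken from the patch below; the helper name is invented for this
example)::

    import random
    import string

    _ARCID_CHARS = string.ascii_letters + string.digits + "_-"

    def make_async_param(start: int) -> str:
        # 23 random characters; the patch itself regenerates them only once per hour
        arcid = ''.join(random.choices(_ARCID_CHARS, k=23))
        return f"arc_id:srp_{arcid}_1{start:02},use_ac:true,_fmt:prog"

    # e.g. 'arc_id:srp_<23 random chars>_110,use_ac:true,_fmt:prog'
    print(make_async_param(10))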

Closes:

- https://github.com/searxng/searxng/issues/4435
- https://github.com/searxng/searxng/issues/4431

Related:

- https://github.com/searxng/searxng/discussions/4434
- https://github.com/searxng/searxng/discussions/4429

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

--- a/searx/engines/google.py
+++ b/searx/engines/google.py

@@ -10,10 +10,14 @@ engines:
 - :ref:`google autocomplete`
 
 """
+from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
 import re
+import random
+import string
+import time
 from urllib.parse import urlencode
 from lxml import html
 import babel
@@ -64,11 +68,31 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
 # from the links not the links itself.
 suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 
-# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
-#                                    # celebrities like '!google natasha allegri'
-#                                    # or '!google chris evans'
-UI_ASYNC = 'use_ac:true,_fmt:prog'
-"""Format of the response from UI's async request."""
+_arcid_range = string.ascii_letters + string.digits + "_-"
+_arcid_random: tuple[str, int] | None = None
+
+
+def ui_async(start: int) -> str:
+    """Format of the response from UI's async request.
+
+    - ``arc_id:<...>,use_ac:true,_fmt:prog``
+
+    The arc_id is randomly generated every hour.
+    """
+    global _arcid_random  # pylint: disable=global-statement
+
+    use_ac = "use_ac:true"
+    # _fmt:html returns a HTTP 500 when user search for celebrities like
+    # '!google natasha allegri' or '!google chris evans'
+    _fmt = "_fmt:prog"
+
+    # create a new random arc_id every hour
+    if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
+        _arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
+    arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"
+
+    return ",".join([arc_id, use_ac, _fmt])
 
 
 def get_google_info(params, eng_traits):
@@ -258,8 +282,10 @@ def detect_google_sorry(resp):
 def request(query, params):
     """Google search request"""
     # pylint: disable=line-too-long
-    offset = (params['pageno'] - 1) * 10
+    start = (params['pageno'] - 1) * 10
+    str_async = ui_async(start)
     google_info = get_google_info(params, traits)
+    logger.debug("ARC_ID: %s", str_async)
 
     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
     query_url = (
@@ -272,7 +298,7 @@ def request(query, params):
             'q': query,
             **google_info['params'],
             'filter': '0',
-            'start': offset,
+            'start': start,
             # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
             # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
             # 'cs' : 1,
@@ -284,7 +310,7 @@ def request(query, params):
             # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
             # formally known as use_mobile_ui
             'asearch': 'arc',
-            'async': UI_ASYNC,
+            'async': str_async,
         }
     )
 )
@@ -303,15 +329,20 @@ def request(query, params):
 # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
 # ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
 RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
+RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$')
 
 
-def _parse_data_images(dom):
+def parse_data_images(text: str):
     data_image_map = {}
-    for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
+
+    for img_id, data_image in RE_DATA_IMAGE.findall(text):
         end_pos = data_image.rfind('=')
         if end_pos > 0:
             data_image = data_image[: end_pos + 1]
         data_image_map[img_id] = data_image
+    last = RE_DATA_IMAGE_end.search(text)
+    if last:
+        data_image_map[last.group(1)] = last.group(2)
     logger.debug('data:image objects --> %s', list(data_image_map.keys()))
     return data_image_map
@@ -320,12 +351,12 @@ def response(resp) -> EngineResults:
     """Get response from google's search request"""
     # pylint: disable=too-many-branches, too-many-statements
 
     detect_google_sorry(resp)
+    data_image_map = parse_data_images(resp.text)
 
     results = EngineResults()
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
-    data_image_map = _parse_data_images(dom)
 
     # results --> answer
     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
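
A note on the ``RE_DATA_IMAGE_end`` pattern introduced above: it is identical
to ``RE_DATA_IMAGE`` except that it anchors at the end of the string, so it
also catches a final ``data:image`` object that is not followed by a
terminating ``;``.  A minimal reproduction (the sample string is invented,
both patterns are copied from the patch)::

    import re

    RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
    RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$')

    text = '[3,"dimg_1"]a87;data:image/jpeg;base64,AAA=26;[3,"dimg_2"]a87;data:image/jpeg;base64,BBB='

    print(RE_DATA_IMAGE.findall(text))
    # [('dimg_1', 'data:image/jpeg;base64,AAA=26')]
    print(RE_DATA_IMAGE_end.search(text).groups())
    # ('dimg_2', 'data:image/jpeg;base64,BBB=')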

--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py

@@ -12,6 +12,7 @@
 https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 
 """
+from __future__ import annotations
 
 from typing import TYPE_CHECKING
@@ -32,6 +33,8 @@ from searx.engines.google import (
     filter_mapping,
     suggestion_xpath,
     detect_google_sorry,
+    ui_async,
+    parse_data_images,
 )
 from searx.enginelib.traits import EngineTraits
 from searx.utils import get_embeded_stream_url
@@ -67,6 +70,7 @@ def request(query, params):
     """Google-Video search request"""
 
     google_info = get_google_info(params, traits)
+    start = (params['pageno'] - 1) * 10
 
     query_url = (
         'https://'
@@ -80,7 +84,7 @@ def request(query, params):
             'start': 10 * params['pageno'],
             **google_info['params'],
             'asearch': 'arc',
-            'async': 'use_ac:true,_fmt:html',
+            'async': ui_async(start),
         }
     )
 )
@@ -101,6 +105,7 @@ def response(resp):
     results = []
 
     detect_google_sorry(resp)
+    data_image_map = parse_data_images(resp.text)
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
@@ -109,8 +114,13 @@ def response(resp):
     for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
 
         thumbnail = eval_xpath_getindex(result, './/img/@src', 0, None)
-        if thumbnail is None:
-            continue
+        if thumbnail:
+            if thumbnail.startswith('data:image'):
+                img_id = eval_xpath_getindex(result, './/img/@id', 0, None)
+                if img_id:
+                    thumbnail = data_image_map.get(img_id)
+        else:
+            thumbnail = None
 
         title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
         url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
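
The thumbnail change above in isolation: when an ``<img>`` element's ``src``
is itself a ``data:image`` URI, the patched loop replaces it with the complete
object from ``data_image_map``, keyed by the element's ``dimg_*`` id; a result
without any ``src`` keeps ``thumbnail = None``.  A hedged sketch of just that
lookup (the HTML fragment and the map entry are invented)::

    from lxml import html

    # data:image objects as parse_data_images() would return them
    data_image_map = {'dimg_abc': 'data:image/jpeg;base64,FULLIMAGE='}

    dom = html.fromstring(
        '<div class="g x"><img id="dimg_abc" src="data:image/gif;base64,STUB"></div>'
    )
    img = dom.xpath('.//img')[0]

    thumbnail = img.get('src')
    if thumbnail and thumbnail.startswith('data:image'):
        # swap the inline stub for the full data: URI delivered elsewhere in the response
        thumbnail = data_image_map.get(img.get('id'))

    print(thumbnail)  # data:image/jpeg;base64,FULLIMAGE=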