mirror of https://github.com/searxng/searxng.git (synced 2024-12-01 15:11:03 +00:00)
[fix] duckduckgo extra: crashes and returns no results
parent c4b874e9b0
commit f0f0b2d4c9
2 changed files with 67 additions and 51 deletions
searx/engines/duckduckgo.py

@@ -1,12 +1,14 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """
-DuckDuckGo Lite
-~~~~~~~~~~~~~~~
+DuckDuckGo WEB
+~~~~~~~~~~~~~~
 """
 
+from __future__ import annotations
+
 from typing import TYPE_CHECKING
 import re
-from urllib.parse import urlencode
+from urllib.parse import urlencode, quote_plus
 import json
 import babel
 import lxml.html
@@ -18,12 +20,12 @@ from searx import (
 )
 from searx.utils import (
     eval_xpath,
+    extr,
     extract_text,
 )
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx import redisdb
 from searx.enginelib.traits import EngineTraits
-from searx.utils import extr
 from searx.exceptions import SearxEngineCaptchaException
 
 if TYPE_CHECKING:
@@ -60,42 +62,30 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 __CACHE = []
 
 
-def _cache_key(data: dict):
-    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}")
+def _cache_key(query: str, region: str):
+    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
 
 
-def cache_vqd(data: dict, value):
+def cache_vqd(query: str, region: str, value: str):
     """Caches a ``vqd`` value from a query."""
     c = redisdb.client()
     if c:
-        logger.debug("cache vqd value: %s", value)
-        c.set(_cache_key(data), value, ex=600)
+        logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
+        c.set(_cache_key(query, region), value, ex=600)
 
     else:
-        logger.debug("MEM cache vqd value: %s", value)
+        logger.debug("MEM cache vqd value: %s (%s)", value, region)
         if len(__CACHE) > 100:  # cache vqd from last 100 queries
             __CACHE.pop(0)
-        __CACHE.append((_cache_key(data), value))
+        __CACHE.append((_cache_key(query, region), value))
 
 
-def get_vqd(data):
-    """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST).
+def get_vqd(query: str, region: str, force_request: bool = False):
+    """Returns the ``vqd`` that fits to the *query*.
 
-    DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
-    (such as extremely long search terms that are often sent by bots), no ``vqd``
-    value can be determined.
+    :param query: The query term
+    :param region: DDG's region code
+    :param force_request: force a request to get a vqd value from DDG
 
-    If SearXNG cannot determine a ``vqd`` value, then no request should go out
-    to DDG:
-
-        A request with a wrong ``vqd`` value leads to DDG temporarily putting
-        SearXNG's IP on a block list.
-
-        Requests from IPs in this block list run into timeouts.
-
-        Not sure, but it seems the block list is a sliding window: to get my IP rid
-        from the bot list I had to cool down my IP for 1h (send no requests from
-        that IP to DDG).
-
     TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
     by all request to DDG:
@@ -106,23 +96,46 @@ def get_vqd(data):
     - DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...``
     - DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
 
+    DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
+    (such as extremely long search terms that are often sent by bots), no ``vqd``
+    value can be determined.
+
+    If SearXNG cannot determine a ``vqd`` value, then no request should go out
+    to DDG.
+
+    .. attention::
+
+       A request with a wrong ``vqd`` value leads to DDG temporarily putting
+       SearXNG's IP on a block list.
+
+       Requests from IPs in this block list run into timeouts. Not sure, but it
+       seems the block list is a sliding window: to get my IP rid from the bot list
+       I had to cool down my IP for 1h (send no requests from that IP to DDG).
+
     """
-    key = _cache_key(data)
-    value = None
+    key = _cache_key(query, region)
+
     c = redisdb.client()
     if c:
         value = c.get(key)
         if value or value == b'':
-            value = value.decode('utf-8')
+            value = value.decode('utf-8')  # type: ignore
             logger.debug("re-use CACHED vqd value: %s", value)
             return value
 
-    else:
-        for k, value in __CACHE:
-            if k == key:
-                logger.debug("MEM re-use CACHED vqd value: %s", value)
-                return value
+    for k, value in __CACHE:
+        if k == key:
+            logger.debug("MEM re-use CACHED vqd value: %s", value)
+            return value
+
+    if force_request:
+        resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
+        if resp.status_code == 200:  # type: ignore
+            value = extr(resp.text, 'vqd="', '"')  # type: ignore
+            if value:
+                logger.debug("vqd value from DDG request: %s", value)
+                cache_vqd(query, region, value)
+            return value
 
     return None
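The two hunks above rework the vqd handling in the web engine: the cache key is now derived from the (query, region) pair instead of the raw POST data dict, cached values expire after 600 seconds, and get_vqd() gained a force_request flag that fetches a fresh token from the DDG start page when nothing is cached. Below is a minimal, self-contained sketch of that flow; it stands in for the Redis/Valkey client and for searx.network.get() and extr() with a plain dict, urllib and re, so the names (_VQD_CACHE, the key prefix, the User-Agent) are illustrative only, not searxng code.

    from __future__ import annotations

    import hashlib
    import re
    import time
    import urllib.parse
    import urllib.request

    # stand-in for the engine's Redis/Valkey client and in-memory __CACHE:
    # key -> (vqd value, expiry timestamp)
    _VQD_CACHE: dict[str, tuple[str, float]] = {}


    def _cache_key(query: str, region: str) -> str:
        # hash query and region together, as the patched _cache_key() does
        return 'ddg_web_vqd_' + hashlib.sha256(f"{query}//{region}".encode()).hexdigest()


    def cache_vqd(query: str, region: str, value: str, ttl: int = 600) -> None:
        # 600 s matches the ex=600 used with the Redis client in the patch
        _VQD_CACHE[_cache_key(query, region)] = (value, time.time() + ttl)


    def get_vqd(query: str, region: str, force_request: bool = False) -> str | None:
        key = _cache_key(query, region)
        cached = _VQD_CACHE.get(key)
        if cached and cached[1] > time.time():
            return cached[0]

        if force_request:
            # ask the DDG start page for a token, as the patched get_vqd() does
            url = 'https://duckduckgo.com/?q=' + urllib.parse.quote_plus(query)
            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(req, timeout=10) as resp:
                html = resp.read().decode('utf-8', errors='replace')
            match = re.search(r'vqd="([^"]*)"', html)
            if match:
                cache_vqd(query, region, match.group(1))
                return match.group(1)

        return None

Note that the real lookup also treats an empty cached value as a hit (the ``if value or value == b'':`` check), which the toy dict above does not bother to model.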
@@ -251,7 +264,7 @@ def request(query, params):
             for x in query.split()
         ]
     )
-    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
+    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
     if eng_region == "wt-wt":
         # https://html.duckduckgo.com/html sets an empty value for "all".
         eng_region = ""
@@ -310,10 +323,7 @@ def request(query, params):
     params['data']['v'] = form_data.get('v', 'l')
     params['headers']['Referer'] = url
 
-    # from here on no more params['data'] shuld be set, since this dict is
-    # needed to get a vqd value from the cache ..
-
-    vqd = get_vqd(params['data'])
+    vqd = get_vqd(query, eng_region, force_request=False)
 
     # Certain conditions must be met in order to call up one of the
     # following pages ...
@@ -362,7 +372,7 @@ def response(resp):
         form = form[0]
         form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
 
-        cache_vqd(resp.search_params["data"], form_vqd)
+        cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
 
     # just select "web-result" and ignore results of class "result--ad result--ad--small"
     for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
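For the web engine itself the new round trip is: request() only reuses a cached token (force_request=False, previous hunk), while response() re-caches whatever vqd the returned HTML form carries, keyed on the posted 'q' and 'kl' fields. This also retires the old constraint that no more params['data'] keys could be set after the vqd lookup (the removed "shuld be set" comment), since the cache key no longer depends on the whole data dict. A rough, self-contained sketch of that pairing; the helper names build_form_data() and recache_vqd_from_html() are made up for illustration, and the toy dict stands in for the hashed Redis/in-memory cache:

    from __future__ import annotations

    import lxml.html

    # toy in-memory cache keyed on (query, region); the real code hashes the pair
    _VQD: dict[tuple[str, str], str] = {}


    def build_form_data(query: str, region: str) -> dict:
        # request() side after the patch: only reuse a cached vqd here,
        # never fetch one from DDG (force_request=False)
        data = {'q': query, 'kl': region}
        vqd = _VQD.get((query, region))
        if vqd:
            data['vqd'] = vqd
        return data


    def recache_vqd_from_html(html: str, posted_data: dict) -> None:
        # response() side after the patch: read the vqd out of the returned
        # form and cache it under the posted 'q'/'kl' pair for the next request
        doc = lxml.html.fromstring(html)
        form_vqd = doc.xpath('//input[@name="vqd"]/@value')
        if form_vqd:
            _VQD[(posted_data['q'], posted_data['kl'])] = form_vqd[0]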
@@ -379,7 +389,7 @@ def response(resp):
         results.append(item)
 
     zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
-    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
+    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()  # type: ignore
 
     if zero_click and (
         "Your IP address is" not in zero_click
@@ -432,7 +442,7 @@ def fetch_traits(engine_traits: EngineTraits):
     if not resp.ok:  # type: ignore
         print("ERROR: response from DuckDuckGo is not OK.")
 
-    js_code = extr(resp.text, 'regions:', ',snippetLengths')
+    js_code = extr(resp.text, 'regions:', ',snippetLengths')  # type: ignore
 
     regions = json.loads(js_code)
     for eng_tag, name in regions.items():
@@ -466,7 +476,7 @@ def fetch_traits(engine_traits: EngineTraits):
 
     engine_traits.custom['lang_region'] = {}
 
-    js_code = extr(resp.text, 'languages:', ',regions')
+    js_code = extr(resp.text, 'languages:', ',regions')  # type: ignore
 
     languages = js_variable_to_python(js_code)
     for eng_lang, name in languages.items():
searx/engines/duckduckgo_extra.py

@@ -4,16 +4,15 @@ DuckDuckGo Extra (images, videos, news)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
+from __future__ import annotations
+
 from datetime import datetime
 from typing import TYPE_CHECKING
 from urllib.parse import urlencode
 from searx.utils import get_embeded_stream_url
 
 from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import
-from searx.engines.duckduckgo import (
-    get_ddg_lang,
-    get_vqd,
-)
+from searx.engines.duckduckgo import get_ddg_lang, get_vqd
 from searx.enginelib.traits import EngineTraits
 
 if TYPE_CHECKING:
@@ -48,15 +47,16 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
 
 
 def request(query, params):
+    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 
     # request needs a vqd argument
-    vqd = get_vqd(query)
+    vqd = get_vqd(query, eng_region, force_request=True)
+
     if not vqd:
         # some search terms do not have results and therefore no vqd value
         params['url'] = None
         return params
 
-    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
     eng_lang = get_ddg_lang(traits, params['searxng_locale'])
 
     args = {
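This hunk is the crash fix named in the commit title: the old code handed the bare query string to get_vqd(), whose previous signature was get_vqd(data) and which indexed its argument like the POST dict (data['q'], data['kl']), so the images/videos/news engines presumably crashed there instead of returning results. Now the region is resolved first and passed along, and force_request=True lets get_vqd() fetch a missing token from DDG once. A simplified sketch of the new control flow; get_vqd() below is a stub standing in for the real helper sketched further up, and the params keys are illustrative only:

    from __future__ import annotations


    def get_vqd(query: str, region: str, force_request: bool = False) -> str | None:
        # stub standing in for searx.engines.duckduckgo.get_vqd()
        return 'example-vqd' if force_request else None


    def request(query: str, params: dict) -> dict:
        # simplified sketch of the patched duckduckgo_extra request() flow
        eng_region = params.get('region') or 'wt-wt'          # resolved *before* the vqd lookup now
        vqd = get_vqd(query, eng_region, force_request=True)  # may hit duckduckgo.com once
        if not vqd:
            params['url'] = None                              # no vqd, so no request goes out
            return params
        params['vqd'] = vqd                                   # later used to build the *.js URL
        return params


    print(request('searxng', {'region': 'us-en'}))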
@@ -86,6 +86,12 @@ def request(query, params):
 
     params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
 
+    # sending these two headers prevents rate limiting for the query
+    params['headers'] = {
+        'Referer': 'https://duckduckgo.com/',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+
     return params
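The second half of the extra-engine fix is the header block: per the added comment, sending Referer and X-Requested-With along with the *.js request keeps DDG from rate limiting the query. Roughly what the outgoing request parameters end up looking like; the query-string arguments below are example values, not the engine's real argument set:

    from urllib.parse import urlencode

    # example values only, not the engine's full argument set
    args = {'q': 'searxng', 'o': 'json', 'vqd': 'example-vqd'}

    params = {
        'url': f"https://duckduckgo.com/i.js?{urlencode(args)}",  # 'i' = images endpoint
        'headers': {
            # the two headers this patch adds to avoid being rate limited
            'Referer': 'https://duckduckgo.com/',
            'X-Requested-With': 'XMLHttpRequest',
        },
    }

    print(params['url'])
    print(params['headers'])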