[mod] engine ask.com - parse JS result to JSON

Parse the result list from ask.com, which the page embeds as JSON in the
JavaScript variable window.MESON.initialState::

    <script nonce="..">
        window.MESON = window.MESON || {};
        window.MESON.initialState = {"siteConfig": ...
          ...}};
        window.MESON.loadedLang = "en";
    </script>

The result list is in field::

    json_resp['search']['webResults']['results']

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2024-02-16 18:17:03 +01:00 committed by Markus Heiser
parent 3df53d6e50
commit 76845ea42c

View file

@ -3,8 +3,9 @@
"""Ask.com""" """Ask.com"""
from urllib.parse import urlencode from urllib.parse import urlencode
import re import dateutil
from lxml import html from lxml import html
from searx import utils
# Metadata # Metadata
about = { about = {
@ -37,20 +38,37 @@ def request(query, params):
def response(resp):
    """Extract the web results from the state object embedded in an ask.com page.

    The page assigns its initial state to ``window.MESON.initialState`` inside
    a ``<script>`` element.  The object literal is cut out of the script text,
    converted with ``utils.js_variable_to_python`` and the entries found under
    ``['search']['webResults']['results']`` are mapped to result dicts.

    :param resp: HTTP response; only ``resp.text`` (the HTML page) is read.
    :returns: list of dicts with the keys ``url``, ``title``, ``content``,
        ``publishedDate`` (a ``datetime`` or ``None``) and ``metadata``.  The
        list is empty when the page carries no state object or no web results.
    """
    # A bare ``import dateutil`` does not load the ``parser`` submodule, so
    # import it explicitly where it is used.
    import dateutil.parser  # pylint: disable=import-outside-toplevel

    start_tag = 'window.MESON.initialState = {'
    end_tag = '}};'

    dom = html.fromstring(resp.text)

    # Do not assume the state lives in the first <script>: take the first one
    # whose text actually contains the assignment.
    script = next(
        (node.text for node in dom.xpath('//script') if node.text and start_tag in node.text),
        None,
    )
    if script is None:
        return []

    # Start at the opening brace of the object literal ...
    start = script.index(start_tag) + len(start_tag) - 1
    end = script.find(end_tag, start)
    if end == -1:
        return []
    # ... and keep both closing braces while dropping the trailing ';'.
    json_resp = utils.js_variable_to_python(script[start : end + len(end_tag) - 1])

    # Walk down to the result list, tolerating missing or null levels.
    items = json_resp
    for key in ('search', 'webResults', 'results'):
        items = items.get(key) if isinstance(items, dict) else None

    results = []
    for item in items or []:
        url = item.get('url')
        if not url:
            # An entry without a target URL cannot be shown as a result.
            continue

        published = None
        pubdate_original = item.get('pubdate_original')
        if pubdate_original:
            try:
                published = dateutil.parser.parse(pubdate_original)
            except (ValueError, OverflowError):
                # One malformed date must not discard the whole result list.
                published = None

        metadata = [item[field] for field in ('category_l1', 'catsy') if item.get(field)]

        results.append(
            {
                "url": url,
                "title": item.get('title', ''),
                "content": item.get('abstract', ''),
                "publishedDate": published,
                # "img_src" is deliberately not set: item['image_url'] is not a
                # thumbnail, the images are too large.
                "metadata": ' | '.join(metadata),
            }
        )

    return results