[doc] add documentation about the XPath engine

- pylint searx/engines/xpath.py
- fix indentation of some long lines
- add logging
- add doc-strings

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2021-05-23 10:56:29 +02:00
parent 2398e9a1fe
commit 8cd544b2a6
4 changed files with 110 additions and 28 deletions

docs/dev/engine_overview.rst

@@ -43,7 +43,7 @@ argument type information
categories              list        pages, in which the engine is working
paging                  boolean     support multible pages
time_range_support      boolean     support search time range
engine_type             str         ``online`` by default, other possibles values are
                                    ``offline``, ``online_dictionnary``, ``online_currency``
======================= =========== ========================================================
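For orientation, these arguments are plain module-level variables of an engine module (they can also be set from the engine's entry in settings.yml). A minimal sketch of a hypothetical engine declaring them, with values chosen purely for illustration:

    # hypothetical engine module: the arguments from the table above are
    # ordinary module-level attributes read by searx when the engine is loaded
    categories = ['general']      # pages (tabs) in which the engine is working
    paging = True                 # the engine supports multiple result pages
    time_range_support = False    # no time range filtering
    engine_type = 'online'        # other values: 'offline', 'online_dictionnary',
                                  # 'online_currency'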
@@ -100,6 +100,8 @@ example code
    paging = True

.. _engine request:

making a request
================
@@ -198,6 +200,8 @@ example code
    return params

.. _engine results:

returned results
================
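The "making a request" and "returned results" sections that these new anchors point to describe the two functions every engine module implements. A minimal sketch of that interface, using a hypothetical endpoint and not taken from this commit:

    from urllib.parse import urlencode

    # hypothetical search endpoint, used only for this sketch
    base_url = 'https://example.org/search?{query}'

    def request(query, params):
        # fill in the URL to fetch; searx performs the HTTP request itself
        params['url'] = base_url.format(query=urlencode({'q': query}))
        return params

    def response(resp):
        # parse resp.text and return a list of result dicts
        return [{
            'url': 'https://example.org/item',
            'title': 'example title',
            'content': 'example content',
        }]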

docs/dev/index.rst

@@ -9,6 +9,7 @@ Developer documentation
   quickstart
   contribution_guide
   engine_overview
   xpath_engine
   search_api
   plugins
   translation

docs/dev/xpath_engine.rst

@@ -0,0 +1,9 @@
.. _xpath_engine:

================
The XPath engine
================

.. automodule:: searx.engines.xpath
  :members:

searx/engines/xpath.py

@@ -1,51 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-function-docstring
"""The XPath engine is a *generic* engine with which it is possible to configure
engines in the settings.

Here is a simple example of an XPath engine configured in the
:ref:`settings engine` section, further read :ref:`engines-dev`.

.. code:: yaml

  - name : bitbucket
    engine : xpath
    paging : True
    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
    content_xpath : //article[@class="repo-summary"]/p

"""

from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
from searx import logger

logger = logger.getChild('XPath engine')

search_url = None
"""
Search URL of the engine, replacements are:

``{query}``:
  Search terms from user.

``{pageno}``:
  Page number if engine supports paging :py:obj:`paging`
"""

soft_max_redirects = 0
'''Maximum redirects, soft limit. Record an error but don't stop the engine'''

results_xpath = ''
'''XPath selector for the list of result items'''

url_xpath = None
'''XPath selector of result's ``url``.'''

content_xpath = None
'''XPath selector of result's ``content``.'''

title_xpath = None
'''XPath selector of result's ``title``.'''

thumbnail_xpath = False
'''XPath selector of result's ``img_src``.'''

suggestion_xpath = ''
'''XPath selector of result's ``suggestion``.'''

cached_xpath = ''
cached_url = ''

paging = False
'''Engine supports paging [True or False].'''

page_size = 1
'''Number of results on each page. Only needed if the site requires not a page
number, but an offset.'''

first_page_num = 1
'''Number of the first page (usually 0 or 1).'''


def request(query, params):
    '''Build request parameters (see :ref:`engine request`).
    '''
    query = urlencode({'q': query})[2:]

    fargs = {'query': query}
    if paging and search_url.find('{pageno}') >= 0:
        fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['url'] = search_url.format(**fargs)
    params['query'] = query
    params['soft_max_redirects'] = soft_max_redirects
    logger.debug("query_url --> %s", params['url'])

    return params


def response(resp):
    '''Scrape *results* from the response (see :ref:`engine results`).
    '''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
@@ -59,13 +114,16 @@ def response(resp):

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (
                    cached_url
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
                )

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
@@ -75,8 +133,12 @@ def response(resp):
                map(extract_text, eval_xpath_list(dom, content_xpath)),
                map(extract_text, eval_xpath_list(dom, cached_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached, 'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url) for
@@ -84,10 +146,16 @@ def response(resp):
                map(extract_text, eval_xpath_list(dom, title_xpath)),
                map(extract_text, eval_xpath_list(dom, content_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
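To make the pagination arithmetic in request() concrete, here is a small standalone example using the bitbucket search_url from the module docstring; the page and query values are illustrative and not part of the commit:

    from urllib.parse import urlencode

    search_url = 'https://bitbucket.org/repo/all/{pageno}?name={query}'
    page_size = 1        # the site counts pages, not result offsets
    first_page_num = 1   # its first page is number 1

    query = urlencode({'q': 'searx'})[2:]             # 'q=searx' -> 'searx'
    pageno = (3 - 1) * page_size + first_page_num     # searx page 3 -> site page 3

    print(search_url.format(query=query, pageno=pageno))
    # https://bitbucket.org/repo/all/3?name=searx

A site that expects an offset instead of a page number would set, for example, page_size = 20 and first_page_num = 0, so searx page 3 maps to offset 40 in the same formula.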