mirror of
https://github.com/searxng/searxng.git
synced 2025-01-04 15:38:42 +00:00
[enh] xpath engine absolute xpath support
This commit is contained in:
parent
c09d69bd2c
commit
5d764f95cf
1 changed file with 12 additions and 7 deletions
|
@ -5,10 +5,10 @@ from cgi import escape
|
||||||
from lxml.etree import _ElementStringResult
|
from lxml.etree import _ElementStringResult
|
||||||
|
|
||||||
search_url = None
|
search_url = None
|
||||||
results_xpath = None
|
|
||||||
url_xpath = None
|
url_xpath = None
|
||||||
content_xpath = None
|
content_xpath = None
|
||||||
title_xpath = None
|
title_xpath = None
|
||||||
|
results_xpath = ''
|
||||||
|
|
||||||
def extract_url(xpath_results):
|
def extract_url(xpath_results):
|
||||||
url = ''
|
url = ''
|
||||||
|
@ -26,7 +26,7 @@ def extract_url(xpath_results):
|
||||||
else:
|
else:
|
||||||
url = xpath_results[0].attrib.get('href')
|
url = xpath_results[0].attrib.get('href')
|
||||||
else:
|
else:
|
||||||
raise Exception('Cannot handle xpath url resultset')
|
url = xpath_results.attrib.get('href')
|
||||||
if not url.startswith('http://') or not url.startswith('https://'):
|
if not url.startswith('http://') or not url.startswith('https://'):
|
||||||
url = 'http://'+url
|
url = 'http://'+url
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
|
@ -45,10 +45,15 @@ def response(resp):
|
||||||
results = []
|
results = []
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
query = resp.search_params['query']
|
query = resp.search_params['query']
|
||||||
|
if results_xpath:
|
||||||
for result in dom.xpath(results_xpath):
|
for result in dom.xpath(results_xpath):
|
||||||
url = extract_url(result.xpath(url_xpath))
|
url = extract_url(result.xpath(url_xpath))
|
||||||
title = ' '.join(result.xpath(title_xpath))
|
title = ' '.join(result.xpath(title_xpath))
|
||||||
content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
|
content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
|
else:
|
||||||
|
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
|
||||||
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
Loading…
Reference in a new issue