searxng/searx/engines/wolframalpha_noapi.py

87 lines
2.2 KiB
Python
Raw Normal View History

# WolframAlpha (Maths)
#
# @website http://www.wolframalpha.com/
# @provide-api yes (http://api.wolframalpha.com/v2/)
#
# @using-api no
# @results HTML
# @stable no
# @parse answer
2016-01-02 07:49:32 +00:00
from re import search, sub
from json import loads
2015-12-30 02:59:51 +00:00
from urllib import urlencode
2016-01-02 07:49:32 +00:00
from lxml import html
import HTMLParser
2015-12-30 02:59:51 +00:00
# search-url: site root plus the query page template; the ``{query}``
# placeholder receives the url-encoded request parameters
url = 'http://www.wolframalpha.com/'
search_url = url + 'input/?{query}'
2016-01-02 07:49:32 +00:00
# xpath variables
# every <script> element of the page (the answer is searched inside them)
scripts_xpath = '//script'
# the page <title> element
title_xpath = '//title'
# <p class="pfail"> element; its presence is treated as a failed query
failure_xpath = '//p[attribute::class="pfail"]'
2015-12-30 02:59:51 +00:00
# do search-request
def request(query, params):
    """Store the WolframAlpha page url for ``query`` in ``params['url']``.

    The query is url-encoded as the ``i`` parameter and substituted into
    ``search_url``.  ``params`` is modified in place and returned.
    """
    encoded_query = urlencode({'i': query})
    params['url'] = search_url.format(query=encoded_query)
    return params
# locate the javascript statement that carries the answer
def _find_pod_line(scripts):
    """Return the text following the first matching ``pod_XXXX.push(`` call.

    The answer is inside a js function found in one of the page's script
    elements.  It can be located in different 'pods', although by default it
    should be in pod_0200, so that one is tried first and pod_0100 second.

    Returns ``None`` when no script yields a non-empty capture.
    """
    # raw strings: the backslashes belong to the regex, not to Python
    possible_locations = (r'pod_0200\.push\((.*)',
                          r'pod_0100\.push\((.*)')

    for pattern in possible_locations:
        for script in scripts:
            match = search(pattern, script.text_content())
            if match is None:
                continue
            line = match.group(1)
            if line:
                return line
            # first match for this pattern captured nothing: give up on this
            # pattern and fall through to the next one
            break
    return None


# turn the captured javascript argument into a plaintext answer
def _extract_answer(line):
    """Decode the json object embedded in ``line`` and clean its answer text.

    The object is taken from the first ``{`` to the last ``}`` of ``line``;
    its ``stringified`` entry is unescaped (unicode escapes, html entities)
    and stripped of backslashes.

    Raises ``ValueError`` if the object cannot be parsed even after the
    escaped retry, and ``KeyError`` if it has no ``stringified`` entry.
    """
    # extract answer from json
    payload = line[line.find('{'):line.rfind('}') + 1]
    try:
        data = loads(payload)
    except ValueError:
        # second attempt with the payload run through unicode-escape first
        data = loads(payload.encode('unicode-escape'))
    answer = data['stringified']

    # clean plaintext answer
    parser = HTMLParser.HTMLParser()
    answer = parser.unescape(answer.decode('unicode-escape'))
    return sub(r'\\', '', answer)


# get response from search-request
def response(resp):
    """Parse a WolframAlpha result page into a list of result dicts.

    ``resp.text`` is parsed as html.  If the page contains the failure
    marker (``failure_xpath``) an empty list is returned.  Otherwise the
    list holds, in this order:

    * ``{'answer': ...}`` when an answer could be located in the scripts,
    * ``{'url': ..., 'title': ...}`` linking back to the result page, when
      the page has a non-empty ``<title>``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # failed result
    if dom.xpath(failure_xpath):
        return results

    # get line that matches the pattern
    line = _find_pod_line(dom.xpath(scripts_xpath))
    if line:
        results.append({'answer': _extract_answer(line)})

    # user input is in first part of title
    titles = dom.xpath(title_xpath)
    if not titles or titles[0].text is None:
        # no usable title: keep whatever was found instead of raising
        return results
    title = titles[0].text.encode('utf-8')
    # the last 16 bytes are dropped to recover the user input
    # NOTE(review): presumably the ' - Wolfram|Alpha' suffix -- confirm
    result_url = request(title[:-16], {})['url']

    # append result
    results.append({'url': result_url,
                    'title': title.decode('utf-8')})

    return results