[mod][fix] xpath engine simplified, yahoo engine never returns truncated urls

This commit is contained in:
Dalf 2014-01-05 14:06:52 +01:00
parent a2928e8d83
commit 3dc3fc7770
2 changed files with 63 additions and 27 deletions

View file

@ -5,7 +5,7 @@ number_of_results = 1
[bing] [bing]
engine = bing engine = bing
locale = en-us locale = en-US
[cc] [cc]
engine=currency_convert engine=currency_convert
@ -64,17 +64,17 @@ categories = social media
[urbandictionary] [urbandictionary]
engine = xpath engine = xpath
search_url = http://www.urbandictionary.com/define.php?term={query} search_url = http://www.urbandictionary.com/define.php?term={query}
url_xpath = //div[@id="entries"]//div[@class="word"]//a url_xpath = //div[@id="entries"]//div[@class="word"]/a/@href
title_xpath = //div[@id="entries"]//div[@class="word"]//span//text() title_xpath = //div[@id="entries"]//div[@class="word"]/span
content_xpath = //div[@id="entries"]//div[@class="text"]//div[@class="definition"]//text() content_xpath = //div[@id="entries"]//div[@class="text"]/div[@class="definition"]
[yahoo] [yahoo]
engine = xpath engine = xpath
search_url = http://search.yahoo.com/search?p={query} search_url = http://search.yahoo.com/search?p={query}
results_xpath = //div[@class="res"] results_xpath = //div[@class="res"]
url_xpath = .//span[@class="url"]//text() url_xpath = .//h3/a/@href
content_xpath = .//div[@class="abstr"]//text() title_xpath = .//h3/a
title_xpath = .//h3/a//text() content_xpath = .//div[@class="abstr"]
suggestion_xpath = //div[@id="satat"]//a suggestion_xpath = //div[@id="satat"]//a
[youtube] [youtube]

View file

@ -1,5 +1,5 @@
from lxml import html from lxml import html
from urllib import urlencode from urllib import urlencode, unquote
from urlparse import urlparse, urljoin from urlparse import urlparse, urljoin
from cgi import escape from cgi import escape
from lxml.etree import _ElementStringResult from lxml.etree import _ElementStringResult
@ -11,32 +11,64 @@ title_xpath = None
suggestion_xpath = '' suggestion_xpath = ''
results_xpath = '' results_xpath = ''
def extract_url(xpath_results): '''
url = '' if xpath_results is a list, extract the text from each result and concatenate the results
parsed_search_url = urlparse(search_url) if xpath_results is an xml element, extract all the text nodes from it ( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
if type(xpath_results) == list: if type(xpath_results) == list:
# it's a list of results : concatenate everything using recursive calls
if not len(xpath_results): if not len(xpath_results):
raise Exception('Empty url resultset') raise Exception('Empty url resultset')
if type(xpath_results[0]) == _ElementStringResult: result = ''
url = ''.join(xpath_results) for e in xpath_results:
result = result + extract_text(e)
return result
elif type(xpath_results) == _ElementStringResult:
# it's a string
return ''.join(xpath_results)
else:
# it's an element
return xpath_results.text_content()
def extract_url(xpath_results):
url = extract_text(xpath_results)
if url.startswith('//'): if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
url = parsed_search_url.scheme+url url = parsed_search_url.scheme+url
elif url.startswith('/'): elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url) url = urljoin(search_url, url)
#TODO
else: # normalize url
url = xpath_results[0].attrib.get('href') url = normalize_url(url)
else:
url = xpath_results.attrib.get('href') return url
if not url.startswith('http://') and not url.startswith('https://'):
url = 'http://'+url
def normalize_url(url):
parsed_url = urlparse(url) parsed_url = urlparse(url)
# add a / at the end of the url if there is no path
if not parsed_url.netloc: if not parsed_url.netloc:
raise Exception('Cannot parse url') raise Exception('Cannot parse url')
if not parsed_url.path: if not parsed_url.path:
url += '/' url += '/'
# FIXME : hack for yahoo
if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
return unquote(p[mark+3:]).decode('utf-8')
return url return url
def request(query, params): def request(query, params):
query = urlencode({'q': query})[2:] query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query) params['url'] = search_url.format(query=query)
@ -50,15 +82,19 @@ def response(resp):
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath)) url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath)) title = extract_text(result.xpath(title_xpath)[0 ])
content = escape(' '.join(result.xpath(content_xpath))) content = extract_text(result.xpath(content_xpath)[0])
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
else: else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): for url, title, content in zip(
map(extract_url, dom.xpath(url_xpath)), \
map(extract_text, dom.xpath(title_xpath)), \
map(extract_text, dom.xpath(content_xpath)), \
):
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath: if not suggestion_xpath:
return results return results
for suggestion in dom.xpath(suggestion_xpath): for suggestion in dom.xpath(suggestion_xpath):
results.append({'suggestion': escape(''.join(suggestion.xpath('.//text()')))}) results.append({'suggestion': extract_text(suggestion)})
return results return results