update bing engines and fix bing_news

This commit is contained in:
Thomas Pointhuber 2014-09-01 14:38:59 +02:00
parent 55dfb305a0
commit 4b1e0423a0
2 changed files with 92 additions and 23 deletions

View file

@@ -1,48 +1,81 @@
## Bing (Web)
#
# @website https://www.bing.com
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo publishedDate
from urllib import urlencode from urllib import urlencode
from cgi import escape from cgi import escape
from lxml import html from lxml import html
# engine dependent config
categories = ['general']  # searx result category served by this engine
paging = True             # engine understands the `pageno` request parameter
language_support = True   # engine understands the `language` request parameter

# search-url (scraped HTML portal; the Azure API is avoided because of its query limit)
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'
# do search-request
def request(query, params):
    """Build the Bing web-search request.

    :param query: plain query string typed by the user
    :param params: searx request params dict; reads 'pageno' and
                   'language', fills in 'url' and 'cookies'
    :returns: the same ``params`` dict, mutated in place
    """
    # Bing numbers results starting at 1, ten results per page
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        # 'all' has no Bing market code; fall back to en-US
        language = 'en-US'
    else:
        # searx uses 'en_US', Bing expects 'en-US'
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    # SRCHHPGUSR cookie: no new window, show all results (NRSLT=-1),
    # UI language taken from the requested market's language part
    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params
# get response from search-request
def response(resp):
    """Extract results from a Bing web-search results page.

    Bing serves (at least) two different result layouts; the older
    ``sa_cc`` containers are tried first, and the newer ``b_algo``
    containers are parsed only when the first pass finds nothing.

    :param resp: HTTP response object with the page in ``resp.content``
    :returns: list of dicts with 'url', 'title' and 'content'
    """
    dom = html.fromstring(resp.content)

    # parse results (older layout)
    results = _extract_results(dom, '//div[@class="sa_cc"]', './/h3/a')

    # parse results again if nothing is found yet (newer layout)
    if not results:
        results = _extract_results(dom, '//li[@class="b_algo"]', './/h2/a')

    # return results
    return results


# collect url/title/content from every result container matched by
# result_xpath, taking the link element from link_xpath inside it
def _extract_results(dom, result_xpath, link_xpath):
    results = []

    for result in dom.xpath(result_xpath):
        link = result.xpath(link_xpath)[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//p//text()')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    return results

View file

@@ -1,50 +1,86 @@
## Bing (News)
#
# @website https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
from urllib import urlencode from urllib import urlencode
from cgi import escape from cgi import escape
from lxml import html from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
# engine dependent config
# engine dependent config
categories = ['news']     # searx result category served by this engine
paging = True             # engine understands the `pageno` request parameter
language_support = True   # engine understands the `language` request parameter

# search-url (scraped HTML portal; the Azure API is avoided because of its query limit)
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'
# do search-request
def request(query, params):
    """Build the Bing News search request.

    :param query: plain query string typed by the user
    :param params: searx request params dict; reads 'pageno' and
                   'language', fills in 'url' and 'cookies'
    :returns: the same ``params`` dict, mutated in place
    """
    # Bing numbers results starting at 1, ten results per page
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        # 'all' has no Bing market code; fall back to en-US
        language = 'en-US'
    else:
        # searx uses 'en_US', Bing expects 'en-US'
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    # SRCHHPGUSR cookie: no new window, show all results (NRSLT=-1),
    # UI language taken from the requested market's language part
    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params
# relative-time formats used by Bing News; compiled once, reused for
# every result instead of being recompiled per branch per result
_AGO_MINUTES = re.compile(r'^[0-9]+ minute(s|) ago$')
_AGO_HOURS = re.compile(r'^[0-9]+ hour(s|) ago$')
_AGO_HOURS_MINUTES = re.compile(r'^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$')


def _parse_published_date(text):
    """Turn Bing's date string into a datetime.

    Relative forms ("5 minutes ago", "2 hours ago",
    "1 hour, 30 minutes ago") are computed from now; anything else is
    handed to dateutil.  NOTE(review): an empty or unknown string makes
    ``parser.parse`` raise — unchanged from the pre-refactor behaviour.
    """
    numbers = re.findall(r'\d+', text)
    if _AGO_MINUTES.match(text):
        return datetime.now() - timedelta(minutes=int(numbers[0]))
    if _AGO_HOURS.match(text):
        return datetime.now() - timedelta(hours=int(numbers[0]))
    if _AGO_HOURS_MINUTES.match(text):
        return datetime.now() \
            - timedelta(hours=int(numbers[0])) \
            - timedelta(minutes=int(numbers[1]))
    return parser.parse(text)


# get response from search-request
def response(resp):
    """Extract news results from a Bing News results page.

    :param resp: HTTP response object with the page in ``resp.content``
    :returns: list of dicts with 'url', 'title', 'publishedDate'
              and 'content'
    """
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(
            result.xpath('.//div[@class="sn_txt"]/div'
                         '//span[@class="sn_snip"]//text()')))

        # parse publishedDate
        publishedDate = _parse_published_date(escape(' '.join(
            result.xpath('.//div[@class="sn_txt"]/div'
                         '//span[@class="sn_ST"]'
                         '//span[@class="sn_tm"]//text()'))))

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results