From 5dc3eb3399366b3e30caac1cbd271018da4d7d45 Mon Sep 17 00:00:00 2001
From: Dalf
Date: Sun, 14 Sep 2014 14:40:55 +0200
Subject: [PATCH] [fix] rewrite the google engine since Google Web Search API is about to expire

---
Review notes (revised version of the patch):
 * response(): the title lookup now sits inside the try block, so an entry
   without <h3> is skipped instead of raising IndexError for the whole page
 * response(): "except Exception" instead of a bare except, so that
   KeyboardInterrupt / SystemExit are no longer swallowed
 * response(): results.extend() instead of rebuilding the list each time
 * PEP 8 spacing and a few explanatory comments on the added lines
 * line breaks, indentation and column alignment were rebuilt from a
   flattened copy; the index hashes below predate this revision

 searx/engines/google.py | 113 +++++++++++++++++++++++++++++++---------
 1 file changed, 89 insertions(+), 24 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index 80c7cc746..9dbe8b8f0 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,15 +1,17 @@
 ## Google (Web)
 #
 # @website     https://www.google.com
-# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+# @provide-api yes (https://developers.google.com/custom-search/)
 #
-# @using-api   yes
-# @results     JSON
-# @stable      yes (but deprecated)
-# @parse       url, title, content
+# @using-api   no
+# @results     HTML
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
-from json import loads
+from urlparse import unquote, urlparse, parse_qsl
+from lxml import html
+from searx.engines.xpath import extract_text, extract_url
 
 # engine dependent config
 categories = ['general']
@@ -17,21 +19,49 @@ paging = True
 language_support = True
 
 # search-url
-url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
+google_hostname = 'www.google.com'
+search_path = '/search'
+redirect_path = '/url'
+images_path = '/images'
+search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1'
 
+# specific xpath variables
+results_xpath = '//li[@class="g"]'
+url_xpath = './/h3/a/@href'
+title_xpath = './/h3'
+content_xpath = './/span[@class="st"]'
+suggestion_xpath = '//p[@class="_Bmc"]'
+
+images_xpath = './/div/a'
+image_url_xpath = './@href'
+image_img_src_xpath = './img/@src'
+
+# remove google-specific tracking-url:
+# a link to redirect_path on google_hostname (or without host) carries the
+# real target in its 'q' parameter (KeyError if absent), other urls are kept
+def parse_url(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc in [google_hostname, ''] and parsed_url.path == redirect_path:
+        query = dict(parse_qsl(parsed_url.query))
+        return query['q']
+    else:
+        return url_string
 
 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 8
+    # 'start' is a result offset, so each page steps it by 10
+    offset = (params['pageno'] - 1) * 10
 
-    language = 'en-US'
-    if params['language'] != 'all':
-        language = params['language'].replace('_', '-')
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].replace('_', '-').lower()
 
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'q': query}),
-                                      language=language)
+                                      query=urlencode({'q': query}))
+
+    # the language is sent as a request header, not in the url
+    params['headers']['Accept-Language'] = language
 
     return params
 
@@ -40,18 +70,53 @@ def request(query, params):
 def response(resp):
     results = []
 
-    search_res = loads(resp.text)
-
-    # return empty array if there are no results
-    if not search_res.get('responseData', {}).get('results'):
-        return []
+    dom = html.fromstring(resp.text)
 
     # parse results
-    for result in search_res['responseData']['results']:
-        # append result
-        results.append({'url': result['unescapedUrl'],
-                        'title': result['titleNoFormatting'],
-                        'content': result['content']})
+    for result in dom.xpath(results_xpath):
+        try:
+            # inside the try: an entry without <h3> must not abort parsing
+            title = extract_text(result.xpath(title_xpath)[0])
+            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
+            parsed_url = urlparse(url)
+            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
+                # remove the link to google news
+                continue
+
+            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
+                # images result
+                results.extend(parse_images(result))
+            else:
+                # normal result
+                content = extract_text(result.xpath(content_xpath)[0])
+                # append result
+                results.append({'url': url,
+                                'title': title,
+                                'content': content})
+        except Exception:
+            # best effort: skip an entry that can't be parsed, keep the others
+            continue
+
+    # parse suggestion
+    for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
 
     # return results
     return results
+
+# build one image result (url, img_src) per link matched by images_xpath
+def parse_images(result):
+    results = []
+    for image in result.xpath(images_xpath):
+        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
+        img_src = extract_text(image.xpath(image_img_src_xpath)[0])
+
+        # append result
+        results.append({'url': url,
+                        'title': '',
+                        'content': '',
+                        'img_src': img_src,
+                        'template': 'images.html'})
+
+    return results