From b8fc531b60221756446d50b1055161ec6dd1c34c Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 5 Jun 2015 11:23:24 +0200 Subject: [PATCH] [enh] google engine : parse map links and more --- searx/engines/google.py | 180 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 14 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 785cd5e66..3684a9e68 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -8,6 +8,7 @@ # @stable no (HTML can change) # @parse url, title, content, suggestion +import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html @@ -78,15 +79,22 @@ country_to_hostname = { 'TW': 'www.google.com.tw' # Taiwan } +# osm +url_map = 'https://www.openstreetmap.org/'\ + + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' + # search-url search_path = '/search' -maps_path = '/maps/' -redirect_path = '/url' -images_path = '/images' search_url = ('https://{hostname}' + search_path + '?{query}&start={offset}&gbv=1') +# other URLs +map_hostname_start = 'maps.google.' 
+maps_path = '/maps' +redirect_path = '/url' +images_path = '/images' + # specific xpath variables results_xpath = '//li[@class="g"]' url_xpath = './/h3/a/@href' @@ -95,10 +103,32 @@ content_xpath = './/span[@class="st"]' content_misc_xpath = './/div[@class="f slp"]' suggestion_xpath = '//p[@class="_Bmc"]' +# map : detail location +map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' +map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span' +map_website_url_xpath = 'h3[2]/a/@href' +map_website_title_xpath = 'h3[2]' + +# map : near the location +map_near = 'table[@class="ts"]//tr' +map_near_title = './/h4' +map_near_url = './/h4/a/@href' +map_near_phone = './/span[@class="nobr"]' + +# images images_xpath = './/div/a' image_url_xpath = './@href' image_img_src_xpath = './img/@src' +# property names +# FIXME : no translation +property_address = "Address" +property_phone = "Phone number" +property_location = "Location" +property_website = "Web site" +property_gplus_website = "Google plus" + +# cookies pref_cookie = '' nid_cookie = {} @@ -122,6 +152,11 @@ def get_google_nid_cookie(google_hostname): # remove google-specific tracking-url def parse_url(url_string, google_hostname): + # sanity check + if url_string is None: + return url_string + + # normal case parsed_url = urlparse(url_string) if (parsed_url.netloc in [google_hostname, ''] and parsed_url.path == redirect_path): @@ -131,6 +166,19 @@ def parse_url(url_string, google_hostname): return url_string +# URL : get label +def url_get_label(url_string): + # sanity check + if url_string is None: + return url_string + + # normal case + parsed_url = urlparse(url_string) + if parsed_url.netloc == 'plus.google.com': + return property_gplus_website + return property_website + + # returns extract_text on the first result selected by the xpath or None def extract_text_from_dom(result, xpath): r = result.xpath(xpath) @@ -151,7 +199,7 @@ def request(query, params): if len(language_array) == 2: country = 
language_array[1] else: - country = ' ' + country = 'US' language = language_array[0] + ',' + language_array[0] + '-' + country if use_locale_domain: @@ -196,21 +244,32 @@ def response(resp): try: url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) parsed_url = urlparse(url, google_hostname) - if (parsed_url.netloc == google_hostname - and (parsed_url.path == search_path - or parsed_url.path.startswith(maps_path))): - # remove the link to google news and google maps - # FIXME : sometimes the URL is https://maps.google.*/maps - # no consequence, the result trigger an exception after which is ignored - continue + + # map result + if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path)) + or (parsed_url.netloc.startswith(map_hostname_start))): + x = result.xpath(map_near) + if len(x) > 0: + # map : near the location + results = results + parse_map_near(parsed_url, x, google_hostname) + else: + # map : detail about a location + results = results + parse_map_detail(parsed_url, result, google_hostname) + + # google news + elif (parsed_url.netloc == google_hostname + and parsed_url.path == search_path): + # skipping news results + pass # images result - if (parsed_url.netloc == google_hostname - and parsed_url.path == images_path): + elif (parsed_url.netloc == google_hostname + and parsed_url.path == images_path): # only thumbnail image provided, # so skipping image results # results = results + parse_images(result, google_hostname) pass + else: # normal result content = extract_text_from_dom(result, content_xpath) @@ -223,7 +282,7 @@ def response(resp): results.append({'url': url, 'title': title, 'content': content}) - except Exception: + except: continue # parse suggestion @@ -249,3 +308,96 @@ def parse_images(result, google_hostname): 'template': 'images.html'}) return results + + +def parse_map_near(parsed_url, x, google_hostname): + results = [] + + for result in x: + title = extract_text_from_dom(result, 
map_near_title) + url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname) + phone = extract_text_from_dom(result, map_near_phone) + if phone is not None: + phone = property_phone + ": " + phone + results.append({'url': url, + 'title': title, + 'content': phone}) + + return results + + +def parse_map_detail(parsed_url, result, google_hostname): + results = [] + + # try to parse the geoloc + m = re.search('@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path) + if m is None: + m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) + + if m is not None: + # geoloc found + lon = float(m.group(2)) + lat = float(m.group(1)) + zoom = int(m.group(3)) + + # TODO : map zoom to dlon / dlat + dlon = 0.000001 + dlat = 0.000001 + + boundingbox = [round(lat - dlat, 7), round(lat + dlat, 7), round(lon - dlon, 7), round(lon + dlon, 7)] + map_url = url_map\ + .replace('{latitude}', str(lat))\ + .replace('{longitude}', str(lon))\ + .replace('{zoom}', str(zoom+2)) + + geojson = {u'type': u'Point', + u'coordinates': [lon, lat] + } + + # attributes + attributes = [] + add_attributes(attributes, property_address, extract_text_from_dom(result, map_address_xpath)) + add_attributes(attributes, property_phone, extract_text_from_dom(result, map_phone_xpath)) + + # title / content / url + website_title = extract_text_from_dom(result, map_website_title_xpath) + content = extract_text_from_dom(result, content_xpath) + website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname) + + # add an infobox if there is a website + if website_url is not None: + results.append({'infobox': website_title, + 'id': website_url, + 'content': content, + 'attributes': attributes, + 'urls': [ + {'title': url_get_label(website_url), 'url': website_url}, + {'title': property_location, 'url': map_url} + ] + }) + + # useful because user can see the map directly in searx + results.append({'template': 'map.html', + 'title': website_title, 
'content': (content + '<br />' if content is not None else '') + + attributes_to_html(attributes), + 'longitude': lon, + 'latitude': lat, + 'boundingbox': boundingbox, + 'geojson': geojson, + 'url': website_url if website_url is not None else map_url + }) + return results + + +def add_attributes(attributes, name, value): + if value is not None and len(value) > 0: + attributes.append({'label': name, 'value': value}) + + +def attributes_to_html(attributes): + retval = '<table class="table table-striped">' + for a in attributes: + retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + a.get('value') + '</td></tr>' + retval = retval + '</table>' + return retval