From 16d15268181d536f1cf7126674619e942fc23b99 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 16:48:18 +0200 Subject: [PATCH 01/16] add comments to deviantart engine --- searx/engines/deviantart.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index 298b9a397..29386f71d 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -1,35 +1,61 @@ +## Deviantart (Images) +# +# @website https://www.deviantart.com/ +# @provide-api yes (https://www.deviantart.com/developers/) (RSS) +# +# @using-api no (TODO, rewrite to api) +# @results HTML +# @stable no (HTML can change) +# @parse url, title, thumbnail +# +# @todo rewrite to api + from urllib import urlencode from urlparse import urljoin from lxml import html +# engine dependent config categories = ['images'] +paging = True +# search-url base_url = 'https://www.deviantart.com/' search_url = base_url+'search?offset={offset}&{query}' -paging = True - +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 24 + params['url'] = search_url.format(offset=offset, query=urlencode({'q': query})) + return params +# get response from search-request def response(resp): results = [] + + # return empty array if a redirection code is returned if resp.status_code == 302: - return results + return [] + dom = html.fromstring(resp.text) + + # parse results for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'): link = result.xpath('.//a[contains(@class, "thumb")]')[0] url = urljoin(base_url, link.attrib.get('href')) title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]') # noqa title = ''.join(title_links[0].xpath('.//text()')) img_src = link.xpath('.//img')[0].attrib['src'] + + # append result results.append({'url': url, 'title': title, 'img_src': img_src, 'template': 'images.html'}) + + # return results return results From 3d61d9b9308df7e07ab5ccee6e2f323652b4be2d Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:13:44 +0200 Subject: [PATCH 02/16] little refactoring --- searx/engines/bing.py | 1 + searx/engines/bing_images.py | 1 + searx/engines/bing_news.py | 1 + searx/engines/deviantart.py | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 2da0df885..56c6b36c1 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -23,6 +23,7 @@ language_support = True base_url = 'https://www.bing.com/' search_string = 'search?{query}&first={offset}' + # do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 5f7f36bc1..8f77e15ce 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -24,6 +24,7 @@ paging = True base_url = 'https://www.bing.com/' search_string = 'images/search?{query}&count=10&first={offset}' + # do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 2db41eca8..6c99c35dc 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -24,6 +24,7 @@ language_support = True base_url = 'https://www.bing.com/' search_string = 'news/search?{query}&first={offset}' + # do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index 29386f71d..ff5e1d465 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -6,7 +6,7 @@ # @using-api no (TODO, rewrite to api) # @results HTML # @stable no (HTML can change) -# @parse url, title, thumbnail +# @parse url, title, thumbnail, img_src # # @todo rewrite to api From e6e4de8ba037f1356104289555bd8bd63fedbc9c Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:14:57 +0200 Subject: [PATCH 03/16] rewrite duckduckgo engine and add comments --- searx/engines/duckduckgo.py | 71 +++++++++++++++++++------------------ searx/settings.yml | 2 -- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 58cbc9872..eae79481d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,24 +1,48 @@ +## DuckDuckGo (Web) +# +# @website https://duckduckgo.com/ +# @provide-api yes (https://duckduckgo.com/api), but not all results from search-site +# +# @using-api no +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo rewrite to api +# @todo language support + from urllib import urlencode from lxml.html import fromstring from searx.utils import html_to_text -url = 'https://duckduckgo.com/html?{query}&s={offset}' +# engine dependent config +categories = ['general'] +paging = True locale = 'us-en' +# search-url +url = 'https://duckduckgo.com/html?{query}&s={offset}' +# specific xpath variables +result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa +url_xpath = './/a[@class="large"]/@href' +title_xpath = './/a[@class="large"]//text()' +content_xpath = './/div[@class="snippet"]//text()' + + +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 30 - q = urlencode({'q': query, - 'l': locale}) - params['url'] = url.format(query=q, offset=offset) + + params['url'] = url.format( + query=urlencode({'q': query, 'l': locale}), + offset=offset) + return params +# get response from search-request def response(resp): - result_xpath = '//div[@class="results_links results_links_deep web-result"]' # noqa - url_xpath = './/a[@class="large"]/@href' - title_xpath = './/a[@class="large"]//text()' - content_xpath = './/div[@class="snippet"]//text()' results = [] doc = fromstring(resp.text) @@ -28,38 +52,17 @@ def response(resp): res_url = r.xpath(url_xpath)[-1] except: continue + if not res_url: continue + title = html_to_text(''.join(r.xpath(title_xpath))) content = html_to_text(''.join(r.xpath(content_xpath))) + + # append result results.append({'title': title, 'content': content, 'url': res_url}) + # return results return results - - -#from json import loads -#search_url = url + 'd.js?{query}&p=1&s={offset}' -# -#paging = True -# -# -#def request(query, params): -# offset = (params['pageno'] - 1) * 30 -# q = urlencode({'q': query, -# 'l': locale}) -# params['url'] = search_url.format(query=q, offset=offset) -# return params -# -# -#def response(resp): -# results = [] -# search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] -# for r in search_res: -# if not r.get('t'): -# continue -# results.append({'title': r['t'], -# 'content': html_to_text(r['a']), -# 'url': r['u']}) -# return results diff --git a/searx/settings.yml b/searx/settings.yml index 6d398f871..5a9254070 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -37,7 +37,6 @@ engines: - name : deviantart engine : deviantart - categories : images shortcut : da timeout: 3.0 @@ -47,7 +46,6 @@ engines: - name : duckduckgo engine : duckduckgo - locale : en-us shortcut : ddg # down - website is under criminal investigation by the UK From 80b9312e42087351bb081ceab717e479e75a1ab0 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:20:16 +0200 Subject: [PATCH 04/16] add comments to dummy engine --- searx/engines/dummy.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/searx/engines/dummy.py b/searx/engines/dummy.py index 4586760a0..5a2cdf6b5 100644 --- a/searx/engines/dummy.py +++ b/searx/engines/dummy.py @@ -1,6 +1,14 @@ +## Dummy +# +# @results empty array +# @stable yes + + +# do search-request def request(query, params): return params +# get response from search-request def response(resp): return [] From c5d83059d537d8efb296ffbe743828a884ac4e10 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:28:35 +0200 Subject: [PATCH 05/16] update generalfile engine and add comments --- searx/engines/generalfile.py | 31 ++++++++++++++++++++++++++++--- searx/settings.yml | 1 - 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py index d249c00c7..11d8b6955 100644 --- a/searx/engines/generalfile.py +++ b/searx/engines/generalfile.py @@ -1,35 +1,60 @@ +## General Files (Files) +# +# @website http://www.general-files.org +# @provide-api no (nothing found) +# +# @using-api no (because nothing found) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo detect torrents? + from lxml import html +# engine dependent config +categories = ['files'] +paging = True +# search-url base_url = 'http://www.general-file.com' search_url = base_url + '/files-{letter}/{query}/{pageno}' +# specific xpath variables result_xpath = '//table[@class="block-file"]' title_xpath = './/h2/a//text()' url_xpath = './/h2/a/@href' content_xpath = './/p//text()' -paging = True - +# do search-request def request(query, params): + params['url'] = search_url.format(query=query, letter=query[0], pageno=params['pageno']) + return params +# get response from search-request def response(resp): - results = [] + dom = html.fromstring(resp.text) + + # parse results for result in dom.xpath(result_xpath): url = result.xpath(url_xpath)[0] + # skip fast download links if not url.startswith('/'): continue + + # append result results.append({'url': base_url + url, 'title': ''.join(result.xpath(title_xpath)), 'content': ''.join(result.xpath(content_xpath))}) + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index 5a9254070..c6227212e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -62,7 +62,6 @@ engines: - name : general-file engine : generalfile - categories : files shortcut : gf - name : github From 334a286c18652423447aff589ec0f25406e4d8ea Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:37:47 +0200 Subject: [PATCH 06/16] update github engine and add comments --- searx/engines/github.py | 32 ++++++++++++++++++++++++++++++-- searx/settings.yml | 1 - 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/searx/engines/github.py b/searx/engines/github.py index d36797aba..53fec029f 100644 --- a/searx/engines/github.py +++ b/searx/engines/github.py @@ -1,31 +1,59 @@ +## Github (It) +# +# @website https://github.com/ +# @provide-api yes (https://developer.github.com/v3/) +# +# @using-api yes +# @results JSON +# @stable yes (using api) +# @parse url, title, content + from urllib import urlencode from json import loads from cgi import escape +# engine dependent config categories = ['it'] +# search-url search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}' # noqa accept_header = 'application/vnd.github.preview.text-match+json' +# do search-request def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query})) + params['headers']['Accept'] = accept_header + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) + + # check if items are recieved if not 'items' in search_res: - return results + return [] + + # parse results for res in search_res['items']: title = res['name'] url = res['html_url'] + if res['description']: content = escape(res['description'][:500]) else: content = '' - results.append({'url': url, 'title': title, 'content': content}) + + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index c6227212e..1d5f36a57 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -66,7 +66,6 @@ engines: - name : github engine : github - categories : it shortcut : gh - name : google From dae88d862b8de2a7cddf461045ef381b07819fe5 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 17:56:32 +0200 Subject: [PATCH 07/16] update piratebay engine and add comments --- searx/engines/piratebay.py | 43 +++++++++++++++++++++++++++++++------- searx/settings.yml | 1 - 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index bb4886868..9533b629e 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -1,39 +1,61 @@ +## Piratebay (Videos, Music, Files) +# +# @website https://thepiratebay.se +# @provide-api no (nothing found) +# +# @using-api no +# @results HTML (using search portal) +# @stable yes (HTML can change) +# @parse url, title, content, seed, leech, magnetlink + from urlparse import urljoin from cgi import escape from urllib import quote from lxml import html from operator import itemgetter -categories = ['videos', 'music'] +# engine dependent config +categories = ['videos', 'music', 'files'] +paging = True +# search-url url = 'https://thepiratebay.se/' search_url = url + 'search/{search_term}/{pageno}/99/{search_type}' -search_types = {'videos': '200', - 'music': '100', - 'files': '0'} +# piratebay specific type-definitions +search_types = {'files': '0', + 'music': '100', + 'videos': '200'} + +# specific xpath variables magnet_xpath = './/a[@title="Download this torrent using magnet"]' content_xpath = './/font[@class="detDesc"]//text()' -paging = True - +# do search-request def request(query, params): - search_type = search_types.get(params['category'], '200') + search_type = search_types.get(params['category'], '0') + params['url'] = search_url.format(search_term=quote(query), search_type=search_type, pageno=params['pageno'] - 1) + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) + search_res = dom.xpath('//table[@id="searchResult"]//tr') + # return empty array if nothing is found if not search_res: - return results + return [] + # parse results for result in search_res[1:]: link = result.xpath('.//div[@class="detName"]//a')[0] href = urljoin(url, link.attrib.get('href')) @@ -41,17 +63,21 @@ def response(resp): content = escape(' '.join(result.xpath(content_xpath))) seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] + # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 + # convert leech to int if possible if leech.isdigit(): leech = int(leech) else: leech = 0 magnetlink = result.xpath(magnet_xpath)[0] + + # append result results.append({'url': href, 'title': title, 'content': content, @@ -60,4 +86,5 @@ def response(resp): 'magnetlink': magnetlink.attrib['href'], 'template': 'torrent.html'}) + # return results sorted by seeder return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/settings.yml b/searx/settings.yml index 1d5f36a57..4a976d36e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -82,7 +82,6 @@ engines: - name : piratebay engine : piratebay - categories : videos, music, files shortcut : tpb - name : soundcloud From 64a68a7296d356fe28e31a704c3e0d3051220cec Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 18:12:30 +0200 Subject: [PATCH 08/16] update soundcloud and add comments --- searx/engines/soundcloud.py | 37 +++++++++++++++++++++++++++++++------ searx/settings.yml | 1 - 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index 07cdbc273..aebea239f 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -1,30 +1,55 @@ +## Soundcloud (Music) +# +# @website https://soundcloud.com +# @provide-api yes (https://developers.soundcloud.com/) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content + from json import loads from urllib import urlencode +# engine dependent config categories = ['music'] - -guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28' -url = 'https://api.soundcloud.com/' -search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id='+guest_client_id # noqa - paging = True +# api-key +guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28' +# search-url +url = 'https://api.soundcloud.com/' +search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}' + + +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 20 + params['url'] = search_url.format(query=urlencode({'q': query}), - offset=offset) + offset=offset, + client_id=guest_client_id) + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) + + # parse results for result in search_res.get('collection', []): if result['kind'] in ('track', 'playlist'): title = result['title'] content = result['description'] + + # append result results.append({'url': result['permalink_url'], 'title': title, 'content': content}) + + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index 4a976d36e..00ea2c339 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -86,7 +86,6 @@ engines: - name : soundcloud engine : soundcloud - categories : music shortcut : sc - name : stackoverflow From 80f98d60413c742d603da8eae3596999942ae77a Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 18:12:42 +0200 Subject: [PATCH 09/16] add little comment --- searx/engines/duckduckgo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index eae79481d..4810174ab 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -47,6 +47,7 @@ def response(resp): doc = fromstring(resp.text) + # parse results for r in doc.xpath(result_xpath): try: res_url = r.xpath(url_xpath)[-1] From a46bbb40422564b5576b81c978fb734dbf45a9ce Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 18:49:42 +0200 Subject: [PATCH 10/16] fix stackoverflow and add comments --- searx/engines/stackoverflow.py | 48 +++++++++++++++++++++++++++------- searx/settings.yml | 1 - 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index e24b309c1..edbe74a70 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -1,30 +1,58 @@ +## Stackoverflow (It) +# +# @website https://stackoverflow.com/ +# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search) +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, content + from urlparse import urljoin from cgi import escape from urllib import urlencode from lxml import html +# engine dependent config categories = ['it'] - -url = 'http://stackoverflow.com/' -search_url = url+'search?{query}&page={pageno}' -result_xpath = './/div[@class="excerpt"]//text()' - paging = True +# search-url +url = 'http://stackoverflow.com/' +search_url = url+'search?{query}&page={pageno}' +# specific xpath variables +results_xpath = '//div[contains(@class,"question-summary")]' +link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a' +title_xpath = './/text()' +content_xpath = './/div[@class="excerpt"]//text()' + + +# do search-request def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']) + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) - for result in dom.xpath('//div[@class="question-summary search-result"]'): - link = result.xpath('.//div[@class="result-link"]//a')[0] + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] href = urljoin(url, link.attrib.get('href')) - title = escape(' '.join(link.xpath('.//text()'))) - content = escape(' '.join(result.xpath(result_xpath))) - results.append({'url': href, 'title': title, 'content': content}) + title = escape(' '.join(link.xpath(title_xpath))) + content = escape(' '.join(result.xpath(content_xpath))) + + # append result + results.append({'url': href, + 'title': title, + 'content': content}) + + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index 00ea2c339..a08a15403 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -90,7 +90,6 @@ engines: - name : stackoverflow engine : stackoverflow - categories : it shortcut : st - name : startpage From 678a80f043d2f57f059236b574cc29fab4f70fe8 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 19:57:01 +0200 Subject: [PATCH 11/16] fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements --- searx/engines/startpage.py | 74 +++++++++++++++++++++++++++----------- searx/settings.yml | 2 -- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 8d44d05ab..2adbfb3e4 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -1,47 +1,79 @@ +## Startpage (Web) +# +# @website https://startpage.com +# @provide-api no (nothing found) +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo paging + from urllib import urlencode from lxml import html from cgi import escape +import re -base_url = None -search_url = None +# engine dependent config +categories = ['general'] +# there is a mechanism to block "bot" search (probably the parameter qid), require storing of qid's between mulitble search-calls +#paging = False +language_support = True -# TODO paging -paging = False -# TODO complete list of country mapping -country_map = {'en_US': 'eng', - 'en_UK': 'uk', - 'nl_NL': 'ned'} +# search-url +base_url = 'https://startpage.com/' +search_url = base_url + 'do/search' + +# specific xpath variables +# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] +# not ads: div[@class="result"] are the direct childs of div[@id="results"] +results_xpath = '//div[@class="result"]' +link_xpath = './/h3/a' +# do search-request def request(query, params): + offset = (params['pageno'] - 1) * 10 query = urlencode({'q': query})[2:] + params['url'] = search_url params['method'] = 'POST' params['data'] = {'query': query, - 'startat': (params['pageno'] - 1) * 10} # offset - country = country_map.get(params['language'], 'eng') - params['cookies']['preferences'] = \ - 'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country) # noqa + 'startat': offset} + + # set language if specified + if params['language'] != 'all': + params['data']['with_language'] = 'lang_' + params['language'].split('_')[0] + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.content) - # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] - # not ads: div[@class="result"] are the direct childs of div[@id="results"] - for result in dom.xpath('//div[@class="result"]'): - link = result.xpath('.//h3/a')[0] + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] url = link.attrib.get('href') - if url.startswith('http://www.google.')\ - or url.startswith('https://www.google.'): - continue title = escape(link.text_content()) - content = '' + # block google-ad url's + if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): + continue + if result.xpath('./p[@class="desc"]'): content = escape(result.xpath('./p[@class="desc"]')[0].text_content()) + else: + content = '' - results.append({'url': url, 'title': title, 'content': content}) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index a08a15403..cfbca852e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -94,8 +94,6 @@ engines: - name : startpage engine : startpage - base_url : 'https://startpage.com/' - search_url : 'https://startpage.com/do/search' shortcut : sp # +30% page load time From 9460750feab250d383080342a7bb0a5fe2e2392d Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 20:14:52 +0200 Subject: [PATCH 12/16] fix twitter engine and add comments * add language-support * add comments * little refactoring --- searx/engines/twitter.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index c05c20fc2..8de78144e 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -1,30 +1,63 @@ +## Twitter (Social media) +# +# @website https://www.bing.com/news +# @provide-api yes (https://dev.twitter.com/docs/using-search) +# +# @using-api no +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo publishedDate + from urlparse import urljoin from urllib import urlencode from lxml import html from cgi import escape +# engine dependent config categories = ['social media'] +language_support = True +# search-url base_url = 'https://twitter.com/' search_url = base_url+'search?' + +# specific xpath variables +results_xpath = '//li[@data-item-type="tweet"]' +link_xpath = './/small[@class="time"]//a' title_xpath = './/span[@class="username js-action-profile-name"]//text()' content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()' +# do search-request def request(query, params): params['url'] = search_url + urlencode({'q': query}) + + # set language if specified + if params['language'] != 'all': + params['cookies']['lang'] = params['language'].split('_')[0] + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) - for tweet in dom.xpath('//li[@data-item-type="tweet"]'): - link = tweet.xpath('.//small[@class="time"]//a')[0] + + # parse results + for tweet in dom.xpath(results_xpath): + link = tweet.xpath(link_xpath)[0] url = urljoin(base_url, link.attrib.get('href')) title = ''.join(tweet.xpath(title_xpath)) content = escape(''.join(tweet.xpath(content_xpath))) + + # append result results.append({'url': url, 'title': title, 'content': content}) + + # return results return results From badf8504fd69a5f7022e9ea0c93f581d3f2ce71a Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 20:15:49 +0200 Subject: [PATCH 13/16] remove categories : social media from twitter engine in settings.yml --- searx/settings.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/searx/settings.yml b/searx/settings.yml index cfbca852e..aebae4f3c 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -104,7 +104,6 @@ engines: - name : twitter engine : twitter - categories : social media shortcut : tw # maybe in a fun category From 8eb064dea1f312865dc5d5588d8a317a80efbb49 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 20:21:33 +0200 Subject: [PATCH 14/16] add little note to duckduckgo engine --- searx/engines/duckduckgo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 4810174ab..cd7d93c51 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -9,7 +9,7 @@ # @parse url, title, content # # @todo rewrite to api -# @todo language support +# @todo language support (the current used site does not support language-change) from urllib import urlencode from lxml.html import fromstring From bb628469d31d9ce61b2188aae3f570441eec8803 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 21:01:24 +0200 Subject: [PATCH 15/16] fix wikipedia engine and add comments * add paging support * make number_of_results changable * make result calculation more clear * add comments --- searx/engines/wikipedia.py | 57 +++++++++++++++++++++++++++++++------- searx/settings.yml | 3 +- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 1e2a798cc..ce9429776 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -1,30 +1,67 @@ +## Wikipedia (Web) +# +# @website http://www.wikipedia.org +# @provide-api yes (http://www.mediawiki.org/wiki/API:Search) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title +# +# @todo content + from json import loads from urllib import urlencode, quote -url = 'https://{language}.wikipedia.org/' - -search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}' # noqa - -number_of_results = 10 - +# engine dependent config +categories = ['general'] language_support = True +paging = True +number_of_results = 1 + +# search-url +url = 'https://{language}.wikipedia.org/' +search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}&srlimit={limit}' # noqa +# do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + offset = (params['pageno'] - 1) * number_of_results + if params['language'] == 'all': language = 'en' else: language = params['language'].split('_')[0] + + # write search-language back to params, required in response params['language'] = language + params['url'] = search_url.format(query=urlencode({'srsearch': query}), offset=offset, + limit=number_of_results, language=language) + return params +# get response from search-request def response(resp): + results = [] + search_results = loads(resp.text) - res = search_results.get('query', {}).get('search', []) - return [{'url': url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')), # noqa - 'title': result['title']} for result in res[:int(number_of_results)]] + + # return empty array if there are no results + if not search_results.get('query', {}).get('search'): + return [] + + # parse results + for result in search_results['query']['search']: + res_url = url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')) + + # append result + results.append({'url': res_url, + 'title': result['title'], + 'content': ''}) + + # return results + return results diff --git a/searx/settings.yml b/searx/settings.yml index aebae4f3c..8938dfb80 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -11,9 +11,8 @@ server: engines: - name : wikipedia engine : wikipedia - number_of_results : 1 - paging : False shortcut : wp +# number_of_results : 1 # default is 1 - name : bing engine : bing From 629a05e149eaaab05a724dd3915ed363c364c796 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 2 Sep 2014 21:19:20 +0200 Subject: [PATCH 16/16] fix youtube engine and add comments * add language-support * decrease search-results/site to 5 * add comments --- searx/engines/youtube.py | 43 +++++++++++++++++++++++++++++++++------- searx/settings.yml | 3 --- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py index 895b55918..a3c3980af 100644 --- a/searx/engines/youtube.py +++ b/searx/engines/youtube.py @@ -1,42 +1,69 @@ +## Youtube (Videos) +# +# @website https://www.youtube.com/ +# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content, publishedDate, thumbnail + from json import loads from urllib import urlencode from dateutil import parser +# engine dependent config categories = ['videos'] - -search_url = ('https://gdata.youtube.com/feeds/api/videos' - '?alt=json&{query}&start-index={index}&max-results=25') # noqa - paging = True +language_support = True + +# search-url +base_url = 'https://gdata.youtube.com/feeds/api/videos' +search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5' # noqa +# do search-request def request(query, params): - index = (params['pageno'] - 1) * 25 + 1 + index = (params['pageno'] - 1) * 5 + 1 + params['url'] = search_url.format(query=urlencode({'q': query}), index=index) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&lr=' + params['language'].split('_')[0] + return params +# get response from search-request def response(resp): results = [] + search_results = loads(resp.text) + + # return empty array if there are no results if not 'feed' in search_results: - return results + return [] + feed = search_results['feed'] + # parse results for result in feed['entry']: url = [x['href'] for x in result['link'] if x['type'] == 'text/html'] + if not url: return + # remove tracking url = url[0].replace('feature=youtube_gdata', '') if url.endswith('&'): url = url[:-1] + title = result['title']['$t'] content = '' thumbnail = '' -#"2013-12-31T15:22:51.000Z" pubdate = result['published']['$t'] publishedDate = parser.parse(pubdate) @@ -49,6 +76,7 @@ def response(resp): else: content = result['content']['$t'] + # append result results.append({'url': url, 'title': title, 'content': content, @@ -56,4 +84,5 @@ def response(resp): 'publishedDate': publishedDate, 'thumbnail': thumbnail}) + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index 8938dfb80..552a5f7b9 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -131,13 +131,10 @@ engines: - name : youtube engine : youtube - categories : videos shortcut : yt - name : dailymotion engine : dailymotion - locale : en_US - categories : videos shortcut : dm - name : vimeo