From d748b8419ad1ef875f34783bbbcf773ebc4cfb5e Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Thu, 24 Mar 2016 01:57:27 +0600 Subject: [PATCH 1/2] Add Arch Linux Wiki search engine --- AUTHORS.rst | 1 + searx/engines/archlinux.py | 144 +++++++++++++++++++++++++++ searx/settings.yml | 4 + tests/unit/engines/test_archlinux.py | 111 +++++++++++++++++++++ 4 files changed, 260 insertions(+) create mode 100644 searx/engines/archlinux.py create mode 100644 tests/unit/engines/test_archlinux.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 632e7f090..c5047438a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -41,3 +41,4 @@ generally made searx better: - @GreenLunar - Noemi Vanyi - Kang-min Liu +- Kirill Isakov diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py new file mode 100644 index 000000000..f12c4328a --- /dev/null +++ b/searx/engines/archlinux.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- + +""" + Arch Linux Wiki + + @website https://wiki.archlinux.org + @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content +""" + +from urlparse import urljoin +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text + +# engine dependent config +categories = ['it'] +language_support = True +paging = True +base_url = 'https://wiki.archlinux.org' + +# xpath queries +xpath_results = '//ul[@class="mw-search-results"]/li' +xpath_link = './/div[@class="mw-search-result-heading"]/a' +xpath_content = './/div[@class="searchresult"]' + + +# cut 'en' from 'en_US', 'de' from 'de_CH', and so on +def locale_to_lang_code(locale): + if locale.find('_') >= 0: + locale = locale.split('_')[0] + return locale + +# wikis for some languages were moved off from the main site, we need to make +# requests to correct URLs to be able to get results in those languages +lang_urls = { + 'all': { + 'base': 'https://wiki.archlinux.org', + 'search': '/index.php?title=Special:Search&offset={offset}&{query}' + }, + 'de': { + 'base': 'https://wiki.archlinux.de', + 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}' + }, + 'fr': { + 'base': 'https://wiki.archlinux.fr', + 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}' + }, + 'ja': { + 'base': 'https://wiki.archlinuxjp.org', + 'search': '/index.php?title=特別:検索&offset={offset}&{query}' + }, + 'ro': { + 'base': 'http://wiki.archlinux.ro', + 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}' + }, + 'tr': { + 'base': 'http://archtr.org/wiki', + 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}' + } +} + + +# get base & search URLs for selected language +def get_lang_urls(language): + if language in lang_urls: + return lang_urls[language] + return lang_urls['all'] + +# Language names to build search requests for +# those languages which are hosted on the main site. +main_langs = { + 'ar': 'العربية', + 'bg': 'Български', + 'cs': 'Česky', + 'da': 'Dansk', + 'el': 'Ελληνικά', + 'es': 'Español', + 'he': 'עברית', + 'hr': 'Hrvatski', + 'hu': 'Magyar', + 'it': 'Italiano', + 'ko': '한국어', + 'lt': 'Lietuviškai', + 'nl': 'Nederlands', + 'pl': 'Polski', + 'pt': 'Português', + 'ru': 'Русский', + 'sl': 'Slovenský', + 'th': 'ไทย', + 'uk': 'Українська', + 'zh': '简体中文' +} + + +# do search-request +def request(query, params): + # translate the locale (e.g. 'en_US') to language code ('en') + language = locale_to_lang_code(params['language']) + + # if our language is hosted on the main site, we need to add its name + # to the query in order to narrow the results to that language + if language in main_langs: + query += '(' + main_langs[language] + ')' + + # prepare the request parameters + query = urlencode({'search': query}) + offset = (params['pageno'] - 1) * 20 + + # get request URLs for our language of choice + urls = get_lang_urls(language) + search_url = urls['base'] + urls['search'] + + params['url'] = search_url.format(query=query, offset=offset) + + return params + + +# get response from search-request +def response(resp): + # get the base URL for the language in which request was made + language = locale_to_lang_code(resp.search_params['language']) + base_url = get_lang_urls(language)['base'] + + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(xpath_results): + link = result.xpath(xpath_link)[0] + href = urljoin(base_url, link.attrib.get('href')) + title = escape(extract_text(link)) + content = escape(extract_text(result.xpath(xpath_content))) + + results.append({'url': href, + 'title': title, + 'content': content}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index e8e442f23..5ef74d955 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -34,6 +34,10 @@ outgoing: # communication with search engines # - 1.1.1.2 engines: + - name : arch linux wiki + engine : archlinux + shortcut : al + - name : wikipedia engine : mediawiki shortcut : wp diff --git a/tests/unit/engines/test_archlinux.py b/tests/unit/engines/test_archlinux.py new file mode 100644 index 000000000..66959857a --- /dev/null +++ b/tests/unit/engines/test_archlinux.py @@ -0,0 +1,111 @@ +from collections import defaultdict +import mock +from searx.engines import archlinux +from searx.testing import SearxTestCase + +domains = { + 'all': 'https://wiki.archlinux.org', + 'de': 'https://wiki.archlinux.de', + 'fr': 'https://wiki.archlinux.fr', + 'ja': 'https://wiki.archlinuxjp.org', + 'ro': 'http://wiki.archlinux.ro', + 'tr': 'http://archtr.org/wiki' +} + + +class TestArchLinuxEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 0 + dic['language'] = 'en_US' + params = archlinux.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('wiki.archlinux.org' in params['url']) + + for lang, domain in domains.iteritems(): + dic['language'] = lang + params = archlinux.request(query, dic) + self.assertTrue(domain in params['url']) + + def test_response(self): + response = mock.Mock(text='') + response.search_params = { + 'language': 'en_US' + } + self.assertEqual(archlinux.response(response), []) + + html = """ + + """ + + expected = [ + { + 'title': 'ATI', + 'url': 'https://wiki.archlinux.org/index.php/ATI', + 'content': 'Lorem ipsum dolor sit amet' + }, + { + 'title': 'Frequently asked questions', + 'url': 'https://wiki.archlinux.org/index.php/Frequently_asked_questions', + 'content': 'CPUs with AMDs instruction set "AMD64"' + }, + { + 'title': 'CPU frequency scaling', + 'url': 'https://wiki.archlinux.org/index.php/CPU_frequency_scaling', + 'content': 'ondemand for AMD and older Intel CPU' + } + ] + + response = mock.Mock(text=html) + response.search_params = { + 'language': 'en_US' + } + results = archlinux.response(response) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), len(expected)) + + i = 0 + for exp in expected: + res = results[i] + i += 1 + for key, value in exp.iteritems(): + self.assertEqual(res[key], value) From 8b7dc2acb9f670ba65e4b98eb310f04e4c212bd8 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Fri, 25 Mar 2016 00:38:48 +0600 Subject: [PATCH 2/2] Remove content field from ArchWiki results; reformat code in archlinux.py Content field in Arch Wiki search results is of no real use, more often than not it contains no usable information and includes too many markup tags which make the text unreadable. It is safe to remove it. --- searx/engines/archlinux.py | 17 +++++++---------- tests/unit/engines/test_archlinux.py | 17 ++++++----------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index f12c4328a..84e0d0fba 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -3,12 +3,12 @@ """ Arch Linux Wiki - @website https://wiki.archlinux.org - @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content + @website https://wiki.archlinux.org + @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title """ from urlparse import urljoin @@ -26,7 +26,6 @@ base_url = 'https://wiki.archlinux.org' # xpath queries xpath_results = '//ul[@class="mw-search-results"]/li' xpath_link = './/div[@class="mw-search-result-heading"]/a' -xpath_content = './/div[@class="searchresult"]' # cut 'en' from 'en_US', 'de' from 'de_CH', and so on @@ -135,10 +134,8 @@ def response(resp): link = result.xpath(xpath_link)[0] href = urljoin(base_url, link.attrib.get('href')) title = escape(extract_text(link)) - content = escape(extract_text(result.xpath(xpath_content))) results.append({'url': href, - 'title': title, - 'content': content}) + 'title': title}) return results diff --git a/tests/unit/engines/test_archlinux.py b/tests/unit/engines/test_archlinux.py index 66959857a..d0009d63a 100644 --- a/tests/unit/engines/test_archlinux.py +++ b/tests/unit/engines/test_archlinux.py @@ -18,7 +18,7 @@ class TestArchLinuxEngine(SearxTestCase): def test_request(self): query = 'test_query' dic = defaultdict(dict) - dic['pageno'] = 0 + dic['pageno'] = 1 dic['language'] = 'en_US' params = archlinux.request(query, dic) self.assertTrue('url' in params) @@ -31,10 +31,8 @@ class TestArchLinuxEngine(SearxTestCase): self.assertTrue(domain in params['url']) def test_response(self): - response = mock.Mock(text='') - response.search_params = { - 'language': 'en_US' - } + response = mock.Mock(text='', + search_params={'language': 'en_US'}) self.assertEqual(archlinux.response(response), []) html = """ @@ -79,18 +77,15 @@ class TestArchLinuxEngine(SearxTestCase): expected = [ { 'title': 'ATI', - 'url': 'https://wiki.archlinux.org/index.php/ATI', - 'content': 'Lorem ipsum dolor sit amet' + 'url': 'https://wiki.archlinux.org/index.php/ATI' }, { 'title': 'Frequently asked questions', - 'url': 'https://wiki.archlinux.org/index.php/Frequently_asked_questions', - 'content': 'CPUs with AMDs instruction set "AMD64"' + 'url': 'https://wiki.archlinux.org/index.php/Frequently_asked_questions' }, { 'title': 'CPU frequency scaling', - 'url': 'https://wiki.archlinux.org/index.php/CPU_frequency_scaling', - 'content': 'ondemand for AMD and older Intel CPU' + 'url': 'https://wiki.archlinux.org/index.php/CPU_frequency_scaling' } ]