From c5d83059d537d8efb296ffbe743828a884ac4e10 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Tue, 2 Sep 2014 17:28:35 +0200
Subject: [PATCH] update generalfile engine and add comments

---
Review notes (this region is ignored by `git am` / `git apply`):

* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Hunk line counts were re-checked against the hunk
  headers and the diffstat; column alignment inside individual lines is
  best-effort and should be confirmed against the target tree.
* request(): `query[0]` raises IndexError for an empty query string; confirm
  that callers never pass one.
* response(): `result.xpath(url_xpath)[0]` raises IndexError when a result
  table contains no `h2/a` link.
* The header comment names http://www.general-files.org while base_url is
  http://www.general-file.com; confirm which host is intended.

 searx/engines/generalfile.py | 31 ++++++++++++++++++++++++++++---
 searx/settings.yml           |  1 -
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py
index d249c00c7..11d8b6955 100644
--- a/searx/engines/generalfile.py
+++ b/searx/engines/generalfile.py
@@ -1,35 +1,60 @@
+## General Files (Files)
+#
+# @website http://www.general-files.org
+# @provide-api no (nothing found)
+#
+# @using-api no (because nothing found)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo detect torrents?
+
 from lxml import html
 
+# engine dependent config
+categories = ['files']
+paging = True
 
+# search-url
 base_url = 'http://www.general-file.com'
 search_url = base_url + '/files-{letter}/{query}/{pageno}'
 
+# specific xpath variables
 result_xpath = '//table[@class="block-file"]'
 title_xpath = './/h2/a//text()'
 url_xpath = './/h2/a/@href'
 content_xpath = './/p//text()'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
+
     params['url'] = search_url.format(query=query,
                                       letter=query[0],
                                       pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
-
     results = []
+
     dom = html.fromstring(resp.text)
+
+    # parse results
     for result in dom.xpath(result_xpath):
         url = result.xpath(url_xpath)[0]
+
         # skip fast download links
         if not url.startswith('/'):
             continue
+
+        # append result
         results.append({'url': base_url + url,
                         'title': ''.join(result.xpath(title_xpath)),
                         'content': ''.join(result.xpath(content_xpath))})
 
+    # return results
     return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 5a9254070..c6227212e 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -62,7 +62,6 @@ engines:
 
   - name : general-file
     engine : generalfile
-    categories : files
     shortcut : gf
 
   - name : github