Merge branch 'master' into conditional-sigusr1

Dr. Rolf Jansen 2021-03-15 17:03:36 -03:00 committed by GitHub
commit 4a27dabcf7
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 189 additions and 126 deletions

@@ -192,6 +192,7 @@ PYLINT_FILES=\
 searx/engines/google_images.py \
 searx/engines/mediathekviewweb.py \
 searx/engines/solidtorrents.py \
+searx/engines/solr.py \
 searx/engines/google_scholar.py \
 searx/engines/yahoo_news.py \
 searx/engines/apkmirror.py \

@@ -1,74 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Acgsou (Japanese Animation/Music/Comics Bittorrent tracker)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, get_torrent_size, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://www.acgsou.com/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'images', 'videos', 'music']
paging = True
# search-url
base_url = 'https://www.acgsou.com/'
search_url = base_url + 'search.php?{query}&page={offset}'
# xpath queries
xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]'
xpath_category = './/td[2]/a[1]'
xpath_title = './/td[3]/a[last()]'
xpath_torrent_links = './/td[3]/a'
xpath_filesize = './/td[4]/text()'
def request(query, params):
    query = urlencode({'keyword': query})
    params['url'] = search_url.format(query=query, offset=params['pageno'])
    return params
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, xpath_results):
        # defaults
        filesize = 0
        magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce"
        category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[]))
        page_a = eval_xpath_getindex(result, xpath_title, 0)
        title = extract_text(page_a)
        href = base_url + page_a.attrib.get('href')
        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])
        filesize_info = eval_xpath_getindex(result, xpath_filesize, 0, default=None)
        if filesize_info:
            try:
                filesize = filesize_info[:-2]
                filesize_multiplier = filesize_info[-2:]
                filesize = get_torrent_size(filesize, filesize_multiplier)
            except:
                pass
        # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime
        content = 'Category: "{category}".'
        content = content.format(category=category)
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'filesize': filesize,
                        'magnetlink': magnet_link,
                        'template': 'torrent.html'})
    return results

@@ -3,10 +3,7 @@
 Microsoft Academic (Science)
 """
-from datetime import datetime
-from json import loads
-from uuid import uuid4
-from urllib.parse import urlencode
+from json import dumps, loads
 from searx.utils import html_to_text
 # about
@@ -21,26 +18,25 @@ about = {
 categories = ['images']
 paging = True
-result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
+search_url = 'https://academic.microsoft.com/api/search'
+_paper_url = 'https://academic.microsoft.com/paper/{id}/reference'
 def request(query, params):
-    correlation_id = uuid4()
-    msacademic = uuid4()
-    time_now = datetime.now()
-    params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
-    params['cookies']['msacademic'] = str(msacademic)
-    params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
+    params['url'] = search_url
     params['method'] = 'POST'
-    params['data'] = {
-        'Query': '@{query}@'.format(query=query),
-        'Limit': 10,
-        'Offset': params['pageno'] - 1,
-        'Filters': '',
-        'OrderBy': '',
-        'SortAscending': False,
-    }
+    params['headers']['content-type'] = 'application/json; charset=utf-8'
+    params['data'] = dumps({
+        'query': query,
+        'queryExpression': '',
+        'filters': [],
+        'orderBy': 0,
+        'skip': (params['pageno'] - 1) * 10,
+        'sortAscending': True,
+        'take': 10,
+        'includeCitationContexts': False,
+        'profileId': '',
+    })
     return params
@@ -51,10 +47,13 @@ def response(resp):
     if not response_data:
         return results
-    for result in response_data['results']:
-        url = _get_url(result)
-        title = result['e']['dn']
-        content = _get_content(result)
+    for result in response_data['pr']:
+        if 'dn' not in result['paper']:
+            continue
+        title = result['paper']['dn']
+        content = _get_content(result['paper'])
+        url = _paper_url.format(id=result['paper']['id'])
         results.append({
             'url': url,
             'title': html_to_text(title),
@@ -64,15 +63,9 @@ def response(resp):
     return results
-def _get_url(result):
-    if 's' in result['e']:
-        return result['e']['s'][0]['u']
-    return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
 def _get_content(result):
-    if 'd' in result['e']:
-        content = result['e']['d']
+    if 'd' in result:
+        content = result['d']
         if len(content) > 300:
             return content[:300] + '...'
         return content

searx/engines/solr.py (new file, 74 lines)
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Solr
"""
# pylint: disable=global-statement, missing-function-docstring
from json import loads
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
base_url = 'http://localhost:8983'
collection = ''
rows = 10
sort = '' # sorting: asc or desc
field_list = 'name' # list of field names to display on the UI
default_fields = '' # default field to query
query_fields = '' # query fields
_search_url = ''
paging = True
def init(_):
    if collection == '':
        raise ValueError('collection cannot be empty')
    global _search_url
    _search_url = base_url + '/solr/' + collection + '/select?{params}'
def request(query, params):
    query_params = {'q': query, 'rows': rows}
    if field_list != '':
        query_params['fl'] = field_list
    if query_fields != '':
        query_params['qf'] = query_fields
    if default_fields != '':
        query_params['df'] = default_fields
    if sort != '':
        query_params['sort'] = sort
    if 'pageno' in params:
        query_params['start'] = rows * (params['pageno'] - 1)
    params['url'] = _search_url.format(params=urlencode(query_params))
    return params
def response(resp):
    resp_json = __get_response(resp)
    results = []
    for result in resp_json['response']['docs']:
        r = {key: str(value) for key, value in result.items()}
        if len(r) == 0:
            continue
        r['template'] = 'key-value.html'
        results.append(r)
    return results
def __get_response(resp):
    try:
        resp_json = loads(resp.text)
    except Exception as e:
        raise SearxEngineAPIException("failed to parse response") from e
    if 'error' in resp_json:
        raise SearxEngineAPIException(resp_json['error']['msg'])
    return resp_json
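The new engine talks to Solr's standard select handler: it URL-encodes q, rows, start and the optional fl/qf/df/sort parameters, then reads the hits from response.docs in the JSON reply. The following standalone sketch shows that same request/response flow outside of searx; the base URL, the collection name 'my_collection' and the helper name solr_select are assumptions for illustration only, not part of this change.

    # Minimal sketch of the Solr select request the engine builds
    # (assumes a local Solr at http://localhost:8983 and a hypothetical
    # collection called 'my_collection').
    from json import loads
    from urllib.parse import urlencode
    from urllib.request import urlopen

    SOLR_BASE_URL = 'http://localhost:8983'
    COLLECTION = 'my_collection'  # hypothetical collection name

    def solr_select(query, pageno=1, rows=10):
        # 'wt': 'json' makes the JSON output format explicit on older Solr versions
        qs = urlencode({'q': query, 'rows': rows, 'start': rows * (pageno - 1), 'wt': 'json'})
        url = SOLR_BASE_URL + '/solr/' + COLLECTION + '/select?' + qs
        with urlopen(url) as resp:
            data = loads(resp.read().decode('utf-8'))
        # the engine iterates over the same structure: response -> docs
        return data['response']['docs']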

@@ -22,13 +22,6 @@ from searx.utils import (
 from searx.engines.yahoo import parse_url
-# pylint: disable=unused-import
-from searx.engines.yahoo import (
-    _fetch_supported_languages,
-    supported_languages_url,
-)
-# pylint: enable=unused-import
 logger = logger.getChild('yahoo_news engine')
 # about

@@ -4,7 +4,7 @@
 """
 from functools import reduce
-from json import loads
+from json import loads, dumps
 from urllib.parse import quote_plus
 # about
@@ -20,12 +20,15 @@ about = {
 # engine dependent config
 categories = ['videos', 'music']
 paging = True
+language_support = False
 time_range_support = True
 # search-url
 base_url = 'https://www.youtube.com/results'
 search_url = base_url + '?search_query={query}&page={page}'
 time_range_url = '&sp=EgII{time_range}%253D%253D'
+# the key seems to be constant
+next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
 time_range_dict = {'day': 'Ag',
                    'week': 'Aw',
                    'month': 'BA',
@@ -40,21 +43,73 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
 # do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=quote_plus(query),
-                                      page=params['pageno'])
-    if params['time_range'] in time_range_dict:
-        params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    if not params['engine_data'].get('next_page_token'):
+        params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
+        if params['time_range'] in time_range_dict:
+            params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    else:
+        print(params['engine_data']['next_page_token'])
+        params['url'] = next_page_url
+        params['method'] = 'POST'
+        params['data'] = dumps({
+            'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}},
+            'continuation': params['engine_data']['next_page_token'],
+        })
+        params['headers']['Content-Type'] = 'application/json'
     return params
 # get response from search-request
 def response(resp):
+    if resp.search_params.get('engine_data'):
+        return parse_next_page_response(resp.text)
+    return parse_first_page_response(resp.text)
+def parse_next_page_response(response_text):
     results = []
+    result_json = loads(response_text)
+    with open("/tmp/x", "w") as f:
+        f.write(response_text)
+    for section in (result_json['onResponseReceivedCommands'][0]
+                    .get('appendContinuationItemsAction')['continuationItems'][0]
+                    .get('itemSectionRenderer')['contents']):
+        if 'videoRenderer' not in section:
+            continue
+        section = section['videoRenderer']
+        content = "-"
+        if 'descriptionSnippet' in section:
+            content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs'])
+        results.append({
+            'url': base_youtube_url + section['videoId'],
+            'title': ' '.join(x['text'] for x in section['title']['runs']),
+            'content': content,
+            'author': section['ownerText']['runs'][0]['text'],
+            'length': section['lengthText']['simpleText'],
+            'template': 'videos.html',
+            'embedded': embedded_url.format(videoid=section['videoId']),
+            'thumbnail': section['thumbnail']['thumbnails'][-1]['url'],
+        })
+    try:
+        token = result_json['onResponseReceivedCommands'][0]\
+            .get('appendContinuationItemsAction')['continuationItems'][1]\
+            .get('continuationItemRenderer')['continuationEndpoint']\
+            .get('continuationCommand')['token']
+        results.append({
+            "engine_data": token,
+            "key": "next_page_token",
+        })
+    except:
+        pass
+    return results
+def parse_first_page_response(response_text):
+    results = []
-    results_data = resp.text[resp.text.find('ytInitialData'):]
+    results_data = response_text[response_text.find('ytInitialData'):]
     results_data = results_data[results_data.find('{'):results_data.find(';</script>')]
     results_json = loads(results_data) if results_data else {}
     sections = results_json.get('contents', {})\
                .get('twoColumnSearchResultsRenderer', {})\
@@ -63,6 +118,16 @@ def response(resp):
                .get('contents', [])
     for section in sections:
+        if "continuationItemRenderer" in section:
+            next_page_token = section["continuationItemRenderer"]\
+                .get("continuationEndpoint", {})\
+                .get("continuationCommand", {})\
+                .get("token", "")
+            if next_page_token:
+                results.append({
+                    "engine_data": next_page_token,
+                    "key": "next_page_token",
+                })
         for video_container in section.get('itemSectionRenderer', {}).get('contents', []):
             video = video_container.get('videoRenderer', {})
             videoid = video.get('videoId')

@@ -82,6 +82,8 @@ outgoing: # communication with search engines
 # https:
 # - http://proxy1:8080
 # - http://proxy2:8080
+# using_tor_proxy : True
+# extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
 # uncomment below section only if you have more than one network interface
 # which can be the source of outgoing search requests
 # source_ips:
@@ -159,6 +161,7 @@ engines:
   - name : ahmia
     engine : ahmia
     categories : onions
+    enable_http : True
     shortcut : ah
   - name : arch linux wiki
@@ -730,6 +733,8 @@ engines:
 # Requires Tor
   - name : not evil
     engine : not_evil
+    categories : onions
+    enable_http : True
     shortcut : ne
   - name : nyaa
@@ -737,12 +742,6 @@ engines:
     shortcut : nt
     disabled : True
-  - name : acgsou
-    engine : acgsou
-    shortcut : acg
-    disabled : True
-    timeout: 5.0
   - name : openairedatasets
     engine : json_engine
     paging : True
@@ -943,6 +942,17 @@ engines:
 # api_client_id : *******
 # api_client_secret : *******
+# - name : solr
+#   engine : solr
+#   shortcut : slr
+#   base_url : http://localhost:8983
+#   collection : collection_name
+#   sort : '' # sorting: asc or desc
+#   field_list : '' # comma separated list of field names to display on the UI
+#   default_fields : '' # default field to query
+#   query_fields : '' # query fields
+#   enable_http : True
   - name : startpage
     engine : startpage
     shortcut : sp
@@ -979,6 +989,7 @@ engines:
     title_xpath : ./td[2]/b
     content_xpath : ./td[2]/small
     categories : onions
+    enable_http : True
     shortcut : tch
 # maybe in a fun category