mirror of
https://github.com/searxng/searxng.git
synced 2024-12-29 20:50:28 +00:00
Implementing https rewrite support #71
* parsing XML files which contain target, exclusions and rules * convert regex if required (is a little hack, probably does not work for all rules) * check if a target rule applies to an http url, and use the rules to rewrite it * add a piece of code to check that the domain name has not changed during the rewrite (should be rewritten using publicsuffix instead of this little hack)
This commit is contained in:
parent
d1d55f2ca4
commit
9b9f097adb
3 changed files with 187 additions and 14 deletions
|
@ -1,5 +1,6 @@
|
||||||
from os import environ
|
from os import environ
|
||||||
from os.path import realpath, dirname, join, abspath
|
from os.path import realpath, dirname, join, abspath
|
||||||
|
from searx.https_rewrite import load_https_rules
|
||||||
try:
|
try:
|
||||||
from yaml import load
|
from yaml import load
|
||||||
except:
|
except:
|
||||||
|
@ -15,6 +16,13 @@ if 'SEARX_SETTINGS_PATH' in environ:
|
||||||
else:
|
else:
|
||||||
settings_path = join(searx_dir, 'settings.yml')
|
settings_path = join(searx_dir, 'settings.yml')
|
||||||
|
|
||||||
|
if 'SEARX_HTTPS_REWRITE_PATH' in environ:
|
||||||
|
https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
|
||||||
|
else:
|
||||||
|
https_rewrite_path = join(searx_dir, 'https_rules')
|
||||||
|
|
||||||
with open(settings_path) as settings_yaml:
|
with open(settings_path) as settings_yaml:
|
||||||
settings = load(settings_yaml)
|
settings = load(settings_yaml)
|
||||||
|
|
||||||
|
# load https rules
|
||||||
|
load_https_rules(https_rewrite_path)
|
||||||
|
|
|
@ -1,14 +1,139 @@
|
||||||
|
'''
|
||||||
|
searx is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
searx is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
||||||
|
|
||||||
|
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
|
||||||
|
'''
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from os import listdir
|
||||||
|
from os.path import isfile, join
|
||||||
|
|
||||||
|
|
||||||
# https://gitweb.torproject.org/\
|
# https://gitweb.torproject.org/\
|
||||||
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
|
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
|
||||||
|
|
||||||
# HTTPS rewrite rules
|
# HTTPS rewrite rules
|
||||||
https_rules = (
|
https_rules = []
|
||||||
# from
|
|
||||||
(re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
|
|
||||||
# to
|
# load single ruleset from a xml file
|
||||||
r'https://\1xkcd.com/'),
|
def load_single_https_ruleset(filepath):
|
||||||
(re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
|
ruleset = ()
|
||||||
r'https://sslimgs.xkcd.com/'),
|
|
||||||
)
|
# init parser
|
||||||
|
parser = etree.XMLParser()
|
||||||
|
|
||||||
|
# load and parse xml-file
|
||||||
|
try:
|
||||||
|
tree = etree.parse(filepath, parser)
|
||||||
|
except:
|
||||||
|
# TODO, error message
|
||||||
|
return ()
|
||||||
|
|
||||||
|
# get root node
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
#print(etree.tostring(tree))
|
||||||
|
|
||||||
|
# check if root is a node with the name ruleset
|
||||||
|
# TODO improve parsing
|
||||||
|
if root.tag != 'ruleset':
|
||||||
|
return ()
|
||||||
|
|
||||||
|
# check if rule is deactivated by default
|
||||||
|
if root.attrib.get('default_off'):
|
||||||
|
return ()
|
||||||
|
|
||||||
|
# check if rule does only work for specific platforms
|
||||||
|
if root.attrib.get('platform'):
|
||||||
|
return ()
|
||||||
|
|
||||||
|
hosts = []
|
||||||
|
rules = []
|
||||||
|
exclusions = []
|
||||||
|
|
||||||
|
# parse childs from ruleset
|
||||||
|
for ruleset in root:
|
||||||
|
# this child define a target
|
||||||
|
if ruleset.tag == 'target':
|
||||||
|
# check if required tags available
|
||||||
|
if not ruleset.attrib.get('host'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# convert host-rule to valid regex
|
||||||
|
host = ruleset.attrib.get('host').replace('.', '\.').replace('*', '.*')
|
||||||
|
|
||||||
|
# append to host list
|
||||||
|
hosts.append(host)
|
||||||
|
|
||||||
|
# this child define a rule
|
||||||
|
elif ruleset.tag == 'rule':
|
||||||
|
# check if required tags available
|
||||||
|
if not ruleset.attrib.get('from')\
|
||||||
|
or not ruleset.attrib.get('to'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# TODO hack, which convert a javascript regex group into a valid python regex group
|
||||||
|
rule_from = ruleset.attrib.get('from').replace('$', '\\')
|
||||||
|
rule_to = ruleset.attrib.get('to').replace('$', '\\')
|
||||||
|
|
||||||
|
# TODO, not working yet because of the hack above, currently doing that in webapp.py
|
||||||
|
#rule_from_rgx = re.compile(rule_from, re.I)
|
||||||
|
|
||||||
|
# append rule
|
||||||
|
rules.append((rule_from, rule_to))
|
||||||
|
|
||||||
|
# this child define an exclusion
|
||||||
|
elif ruleset.tag == 'exclusion':
|
||||||
|
# check if required tags available
|
||||||
|
if not ruleset.attrib.get('pattern'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
|
||||||
|
|
||||||
|
# append exclusion
|
||||||
|
exclusions.append(exclusion_rgx)
|
||||||
|
|
||||||
|
# convert list of possible hosts to a simple regex
|
||||||
|
# TODO compress regex to improve performance
|
||||||
|
try:
|
||||||
|
target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
|
||||||
|
except:
|
||||||
|
return ()
|
||||||
|
|
||||||
|
# return ruleset
|
||||||
|
return (target_hosts, rules, exclusions)
|
||||||
|
|
||||||
|
|
||||||
|
# load all https rewrite rules
|
||||||
|
def load_https_rules(rules_path):
|
||||||
|
# add / to path if not set yet
|
||||||
|
if rules_path[-1:] != '/':
|
||||||
|
rules_path += '/'
|
||||||
|
|
||||||
|
# search all xml files which are stored in the https rule directory
|
||||||
|
xml_files = [ join(rules_path,f) for f in listdir(rules_path) if isfile(join(rules_path,f)) and f[-4:] == '.xml' ]
|
||||||
|
|
||||||
|
# load xml-files
|
||||||
|
for ruleset_file in xml_files:
|
||||||
|
# calculate rewrite-rules
|
||||||
|
ruleset = load_single_https_ruleset(ruleset_file)
|
||||||
|
|
||||||
|
# skip if no ruleset returned
|
||||||
|
if not ruleset:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# append ruleset
|
||||||
|
https_rules.append(ruleset)
|
||||||
|
|
|
@ -49,6 +49,9 @@ from searx.languages import language_codes
|
||||||
from searx.search import Search
|
from searx.search import Search
|
||||||
from searx.autocomplete import backends as autocomplete_backends
|
from searx.autocomplete import backends as autocomplete_backends
|
||||||
|
|
||||||
|
from urlparse import urlparse
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
static_path, templates_path, themes =\
|
static_path, templates_path, themes =\
|
||||||
get_themes(settings['themes_path']
|
get_themes(settings['themes_path']
|
||||||
|
@ -197,16 +200,53 @@ def index():
|
||||||
if not search.paging and engines[result['engine']].paging:
|
if not search.paging and engines[result['engine']].paging:
|
||||||
search.paging = True
|
search.paging = True
|
||||||
|
|
||||||
|
# check if HTTPS rewrite is required
|
||||||
if settings['server']['https_rewrite']\
|
if settings['server']['https_rewrite']\
|
||||||
and result['parsed_url'].scheme == 'http':
|
and result['parsed_url'].scheme == 'http':
|
||||||
|
|
||||||
for http_regex, https_url in https_rules:
|
skip_https_rewrite = False
|
||||||
if http_regex.match(result['url']):
|
|
||||||
result['url'] = http_regex.sub(https_url, result['url'])
|
# check if HTTPS rewrite is possible
|
||||||
# TODO result['parsed_url'].scheme
|
for target, rules, exclusions in https_rules:
|
||||||
break
|
|
||||||
|
# check if target regex match with url
|
||||||
|
if target.match(result['url']):
|
||||||
|
# process exclusions
|
||||||
|
for exclusion in exclusions:
|
||||||
|
# check if exclusion match with url
|
||||||
|
if exclusion.match(result['url']):
|
||||||
|
skip_https_rewrite = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# skip https rewrite if required
|
||||||
|
if skip_https_rewrite:
|
||||||
|
break
|
||||||
|
|
||||||
|
# process rules
|
||||||
|
for rule in rules:
|
||||||
|
# TODO, precompile rule
|
||||||
|
p = re.compile(rule[0])
|
||||||
|
# rewrite url if possible
|
||||||
|
new_result_url = p.sub(rule[1], result['url'])
|
||||||
|
|
||||||
|
# parse new url
|
||||||
|
new_parsed_url = urlparse(new_result_url)
|
||||||
|
|
||||||
|
# continue if nothing was rewritten
|
||||||
|
if result['url'] == new_result_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# get domainname from result
|
||||||
|
# TODO, does only work correct with TLD's like asdf.com, not for asdf.com.de
|
||||||
|
# TODO, using publicsuffix instead of this rewrite rule
|
||||||
|
old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
|
||||||
|
new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
|
||||||
|
|
||||||
|
# check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules
|
||||||
|
if old_result_domainname == new_result_domainname:
|
||||||
|
# set new url
|
||||||
|
result['url'] = new_result_url
|
||||||
|
|
||||||
# HTTPS rewrite
|
|
||||||
if search.request_data.get('format', 'html') == 'html':
|
if search.request_data.get('format', 'html') == 'html':
|
||||||
if 'content' in result:
|
if 'content' in result:
|
||||||
result['content'] = highlight_content(result['content'],
|
result['content'] = highlight_content(result['content'],
|
||||||
|
|
Loading…
Reference in a new issue