commit: 9b9f097adbf39d7908931203e9d8966748900bde
parent d1d55f2ca41fbaf10a66bfc66d69e0fccf673413
Author: Thomas Pointhuber <thomas.pointhuber@gmx.at>
Date: Sun, 14 Sep 2014 11:09:44 +0200
Implementing https rewrite support #71
* parse XML files which contain targets, exclusions and rules
* convert regexes if required (a little hack which probably does not
work for all rules)
* check if a target rule applies to an http url, and use the rules to
rewrite it
* add a piece of code to check that the domain name has not changed
during the rewrite (should be rewritten using publicsuffix instead of
this little hack)
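A rough sketch of the regex conversion hack, with illustrative values
(not taken from any real rule file):

    rule_to = 'https://$1.example.com/'   # javascript-style group reference
    rule_to = rule_to.replace('$', '\\')  # -> 'https://\1.example.com/'
    # naive: this also mangles a '$' used as an end-of-string anchor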
Diffstat:
3 files changed, 187 insertions(+), 14 deletions(-)
diff --git a/searx/__init__.py b/searx/__init__.py
@@ -1,5 +1,6 @@
from os import environ
from os.path import realpath, dirname, join, abspath
+from searx.https_rewrite import load_https_rules
try:
from yaml import load
except:
@@ -15,6 +16,13 @@ if 'SEARX_SETTINGS_PATH' in environ:
else:
settings_path = join(searx_dir, 'settings.yml')
+if 'SEARX_HTTPS_REWRITE_PATH' in environ:
+ https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
+else:
+ https_rewrite_path = join(searx_dir, 'https_rules')
with open(settings_path) as settings_yaml:
settings = load(settings_yaml)
+
+# load https rules
+load_https_rules(https_rewrite_path)
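
A usage sketch for the new environment variable (the path below is
hypothetical):

    import os
    os.environ['SEARX_HTTPS_REWRITE_PATH'] = '/etc/searx/https_rules'
    import searx  # settings and https rules are loaded at import time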
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
@@ -1,14 +1,139 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
import re
+from lxml import etree
+from os import listdir
+from os.path import isfile, join
+
# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
# HTTPS rewrite rules
-https_rules = (
- # from
- (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
- # to
- r'https://\1xkcd.com/'),
- (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
- r'https://sslimgs.xkcd.com/'),
-)
+https_rules = []
+
+
+# load a single ruleset from an xml file
+def load_single_https_ruleset(filepath):
+
+ # init parser
+ parser = etree.XMLParser()
+
+ # load and parse xml-file
+ try:
+ tree = etree.parse(filepath, parser)
+ except:
+ # TODO: log an error message
+ return ()
+
+ # get root node
+ root = tree.getroot()
+
+
+ # check that the root node is named 'ruleset'
+ # TODO improve parsing
+ if root.tag != 'ruleset':
+ return ()
+
+ # check if rule is deactivated by default
+ if root.attrib.get('default_off'):
+ return ()
+
+ # check if the rule only applies to specific platforms
+ if root.attrib.get('platform'):
+ return ()
+
+ hosts = []
+ rules = []
+ exclusions = []
+
+ # parse the children of the ruleset
+ for element in root:
+ # this child defines a target
+ if element.tag == 'target':
+ # check if the required attribute is present
+ if not element.attrib.get('host'):
+ continue
+
+ # convert the host rule into a valid regex
+ host = element.attrib.get('host').replace('.', r'\.').replace('*', '.*')
+
+ # append to host list
+ hosts.append(host)
+
+ # this child defines a rule
+ elif element.tag == 'rule':
+ # check if the required attributes are present
+ if not element.attrib.get('from')\
+ or not element.attrib.get('to'):
+ continue
+
+ # TODO: hack which converts javascript regex groups
+ # into valid python regex groups
+ rule_from = element.attrib.get('from').replace('$', '\\')
+ rule_to = element.attrib.get('to').replace('$', '\\')
+
+ # TODO: not usable yet because of the hack above;
+ # currently done in webapp.py instead
+ #rule_from_rgx = re.compile(rule_from, re.I)
+
+ # append rule
+ rules.append((rule_from, rule_to))
+
+ # this child defines an exclusion
+ elif element.tag == 'exclusion':
+ # check if the required attribute is present
+ if not element.attrib.get('pattern'):
+ continue
+
+ exclusion_rgx = re.compile(element.attrib.get('pattern'))
+
+ # append exclusion
+ exclusions.append(exclusion_rgx)
+
+ # convert the list of possible hosts into a single regex
+ # TODO: compress the regex to improve performance
+ try:
+ target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
+ except:
+ return ()
+
+ # return ruleset
+ return (target_hosts, rules, exclusions)
+
+
+# load all https rewrite rules
+def load_https_rules(rules_path):
+ # append / to the path if it is missing
+ if rules_path[-1:] != '/':
+ rules_path += '/'
+
+ # collect all xml files stored in the https rule directory
+ xml_files = [join(rules_path, f) for f in listdir(rules_path)
+ if isfile(join(rules_path, f)) and f[-4:] == '.xml']
+
+ # load xml-files
+ for ruleset_file in xml_files:
+ # parse the rewrite rules from the file
+ ruleset = load_single_https_ruleset(ruleset_file)
+
+ # skip if no ruleset returned
+ if not ruleset:
+ continue
+
+ # append ruleset
+ https_rules.append(ruleset)
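
For reference, a minimal sketch of what the loader consumes and
returns, assuming a working searx checkout; the file path is made up
and the rule is modelled on the xkcd entry from the removed hard-coded
table:

    from searx.https_rewrite import load_single_https_ruleset

    # write a minimal ruleset in the HTTPS Everywhere xml format
    with open('/tmp/xkcd.xml', 'w') as f:
        f.write('<ruleset name="XKCD">'
                '<target host="xkcd.com" />'
                '<target host="*.xkcd.com" />'
                '<rule from="^http://(www\\.)?xkcd\\.com/"'
                ' to="https://xkcd.com/" />'
                '</ruleset>')

    target_hosts, rules, exclusions = load_single_https_ruleset('/tmp/xkcd.xml')
    # target_hosts matches xkcd.com and any subdomain of it,
    # rules == [('^http://(www\\.)?xkcd\\.com/', 'https://xkcd.com/')],
    # exclusions == []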
diff --git a/searx/webapp.py b/searx/webapp.py
@@ -49,6 +49,9 @@ from searx.languages import language_codes
from searx.search import Search
from searx.autocomplete import backends as autocomplete_backends
+from urlparse import urlparse
+import re
+
static_path, templates_path, themes =\
get_themes(settings['themes_path']
@@ -197,16 +200,53 @@ def index():
if not search.paging and engines[result['engine']].paging:
search.paging = True
+ # check if HTTPS rewrite is required
if settings['server']['https_rewrite']\
and result['parsed_url'].scheme == 'http':
- for http_regex, https_url in https_rules:
- if http_regex.match(result['url']):
- result['url'] = http_regex.sub(https_url, result['url'])
- # TODO result['parsed_url'].scheme
- break
+ skip_https_rewrite = False
+
+ # check if HTTPS rewrite is possible
+ for target, rules, exclusions in https_rules:
+
+ # check if the target regex matches the url
+ if target.match(result['url']):
+ # process exclusions
+ for exclusion in exclusions:
+ # check if an exclusion matches the url
+ if exclusion.match(result['url']):
+ skip_https_rewrite = True
+ break
+
+ # skip https rewrite if required
+ if skip_https_rewrite:
+ break
+
+ # process rules
+ for rule in rules:
+ # TODO: precompile the rules
+ p = re.compile(rule[0])
+ # rewrite url if possible
+ new_result_url = p.sub(rule[1], result['url'])
+
+ # parse new url
+ new_parsed_url = urlparse(new_result_url)
+
+ # continue if nothing was rewritten
+ if result['url'] == new_result_url:
+ continue
+
+ # get the domain name from the result
+ # TODO: only works correctly for TLDs like asdf.com, not for asdf.com.de
+ # TODO: use publicsuffix instead of this hack
+ old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
+ new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
+
+ # check if the rewritten domain name is the same,
+ # to protect against wrong or malicious rewrite rules
+ if old_result_domainname == new_result_domainname:
+ # set new url
+ result['url'] = new_result_url
- # HTTPS rewrite
if search.request_data.get('format', 'html') == 'html':
if 'content' in result:
result['content'] = highlight_content(result['content'],
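
Read as a standalone helper, the per-result rewrite flow above looks
roughly like this (the function name is hypothetical, and unlike the
inline loop this sketch returns after the first successful rewrite):

    from urlparse import urlparse
    import re

    def rewrite_to_https(url, parsed_url, https_rules):
        for target, rules, exclusions in https_rules:
            # only consider rulesets whose target matches the url
            if not target.match(url):
                continue
            # an exclusion pattern vetoes the rewrite for this url
            if any(exclusion.match(url) for exclusion in exclusions):
                return url
            for rule_from, rule_to in rules:
                new_url = re.sub(rule_from, rule_to, url)
                # nothing was rewritten by this rule
                if new_url == url:
                    continue
                # accept the rewrite only if the second-level domain is
                # unchanged (same caveat as above: no publicsuffix yet)
                old_domain = '.'.join(parsed_url.hostname.split('.')[-2:])
                new_domain = '.'.join(urlparse(new_url).hostname.split('.')[-2:])
                if old_domain == new_domain:
                    return new_url
        return url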