commit: d2a636f75d24953f5094ea97ab54a8a4353a65ff
parent: 146928a74980b90de614b71512334ac0a6373048
Author: Adam Tauber <asciimoo@gmail.com>
Date: Mon, 13 Apr 2015 00:30:12 +0200
[mod] https rewrite pluginification
Diffstat:
42 files changed, 234 insertions(+), 231 deletions(-)
diff --git a/searx/__init__.py b/searx/__init__.py
@@ -36,11 +36,6 @@ if 'SEARX_SETTINGS_PATH' in environ:
else:
settings_path = join(searx_dir, 'settings.yml')
-if 'SEARX_HTTPS_REWRITE_PATH' in environ:
- https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
-else:
- https_rewrite_path = join(searx_dir, 'https_rules')
-
# load settings
with open(settings_path) as settings_yaml:
settings = load(settings_yaml)
@@ -52,10 +47,4 @@ else:
logger = logging.getLogger('searx')
-# load https rules only if https rewrite is enabled
-if settings.get('server', {}).get('https_rewrite'):
- # loade https rules
- from searx.https_rewrite import load_https_rules
- load_https_rules(https_rewrite_path)
-
logger.info('Initialisation done')
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
@@ -1,209 +0,0 @@
-'''
-searx is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-searx is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with searx. If not, see < http://www.gnu.org/licenses/ >.
-
-(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
-'''
-
-import re
-from urlparse import urlparse
-from lxml import etree
-from os import listdir
-from os.path import isfile, isdir, join
-from searx import logger
-
-
-logger = logger.getChild("https_rewrite")
-
-# https://gitweb.torproject.org/\
-# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
-
-# HTTPS rewrite rules
-https_rules = []
-
-
-# load single ruleset from a xml file
-def load_single_https_ruleset(filepath):
- ruleset = ()
-
- # init parser
- parser = etree.XMLParser()
-
- # load and parse xml-file
- try:
- tree = etree.parse(filepath, parser)
- except:
- # TODO, error message
- return ()
-
- # get root node
- root = tree.getroot()
-
- # check if root is a node with the name ruleset
- # TODO improve parsing
- if root.tag != 'ruleset':
- return ()
-
- # check if rule is deactivated by default
- if root.attrib.get('default_off'):
- return ()
-
- # check if rule does only work for specific platforms
- if root.attrib.get('platform'):
- return ()
-
- hosts = []
- rules = []
- exclusions = []
-
- # parse childs from ruleset
- for ruleset in root:
- # this child define a target
- if ruleset.tag == 'target':
- # check if required tags available
- if not ruleset.attrib.get('host'):
- continue
-
- # convert host-rule to valid regex
- host = ruleset.attrib.get('host')\
- .replace('.', '\.').replace('*', '.*')
-
- # append to host list
- hosts.append(host)
-
- # this child define a rule
- elif ruleset.tag == 'rule':
- # check if required tags available
- if not ruleset.attrib.get('from')\
- or not ruleset.attrib.get('to'):
- continue
-
- # TODO hack, which convert a javascript regex group
- # into a valid python regex group
- rule_from = ruleset.attrib['from'].replace('$', '\\')
- if rule_from.endswith('\\'):
- rule_from = rule_from[:-1]+'$'
- rule_to = ruleset.attrib['to'].replace('$', '\\')
- if rule_to.endswith('\\'):
- rule_to = rule_to[:-1]+'$'
-
- # TODO, not working yet because of the hack above,
- # currently doing that in webapp.py
- # rule_from_rgx = re.compile(rule_from, re.I)
-
- # append rule
- try:
- rules.append((re.compile(rule_from, re.I | re.U), rule_to))
- except:
- # TODO log regex error
- continue
-
- # this child define an exclusion
- elif ruleset.tag == 'exclusion':
- # check if required tags available
- if not ruleset.attrib.get('pattern'):
- continue
-
- exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
-
- # append exclusion
- exclusions.append(exclusion_rgx)
-
- # convert list of possible hosts to a simple regex
- # TODO compress regex to improve performance
- try:
- target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
- except:
- return ()
-
- # return ruleset
- return (target_hosts, rules, exclusions)
-
-
-# load all https rewrite rules
-def load_https_rules(rules_path):
- # check if directory exists
- if not isdir(rules_path):
- logger.error("directory not found: '" + rules_path + "'")
- return
-
- # search all xml files which are stored in the https rule directory
- xml_files = [join(rules_path, f)
- for f in listdir(rules_path)
- if isfile(join(rules_path, f)) and f[-4:] == '.xml']
-
- # load xml-files
- for ruleset_file in xml_files:
- # calculate rewrite-rules
- ruleset = load_single_https_ruleset(ruleset_file)
-
- # skip if no ruleset returned
- if not ruleset:
- continue
-
- # append ruleset
- https_rules.append(ruleset)
-
- logger.info('{n} rules loaded'.format(n=len(https_rules)))
-
-
-def https_url_rewrite(result):
- skip_https_rewrite = False
- # check if HTTPS rewrite is possible
- for target, rules, exclusions in https_rules:
-
- # check if target regex match with url
- if target.match(result['parsed_url'].netloc):
- # process exclusions
- for exclusion in exclusions:
- # check if exclusion match with url
- if exclusion.match(result['url']):
- skip_https_rewrite = True
- break
-
- # skip https rewrite if required
- if skip_https_rewrite:
- break
-
- # process rules
- for rule in rules:
- try:
- new_result_url = rule[0].sub(rule[1], result['url'])
- except:
- break
-
- # parse new url
- new_parsed_url = urlparse(new_result_url)
-
- # continiue if nothing was rewritten
- if result['url'] == new_result_url:
- continue
-
- # get domainname from result
- # TODO, does only work correct with TLD's like
- # asdf.com, not for asdf.com.de
- # TODO, using publicsuffix instead of this rewrite rule
- old_result_domainname = '.'.join(
- result['parsed_url'].hostname.split('.')[-2:])
- new_result_domainname = '.'.join(
- new_parsed_url.hostname.split('.')[-2:])
-
- # check if rewritten hostname is the same,
- # to protect against wrong or malicious rewrite rules
- if old_result_domainname == new_result_domainname:
- # set new url
- result['url'] = new_result_url
-
- # target has matched, do not search over the other rules
- break
- return result
diff --git a/searx/plugins/__init__.py b/searx/plugins/__init__.py
@@ -14,13 +14,15 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
'''
-from searx.plugins import (self_ip,
- search_on_category_select)
-from searx import logger
from sys import exit
+from searx import logger
logger = logger.getChild('plugins')
+from searx.plugins import (https_rewrite,
+ self_ip,
+ search_on_category_select)
+
required_attrs = (('name', str),
('description', str),
('default_on', bool))
@@ -68,5 +70,6 @@ class PluginStore():
plugins = PluginStore()
+plugins.register(https_rewrite)
plugins.register(self_ip)
plugins.register(search_on_category_select)
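
Taken together with the required_attrs tuple above, a searx plugin is just a module carrying three metadata attributes plus optional hook functions. A minimal, hypothetical sketch of that contract (the attribute names come from required_attrs; the on_result semantics are inferred from the https_rewrite plugin below and from webapp.py's plugins.call('on_result', request, locals()) — what a False return value means is not shown in this diff):

    # example_plugin.py -- hypothetical module illustrating the contract
    from flask.ext.babel import gettext

    # the three attributes listed in required_attrs
    name = 'Example plugin'
    description = gettext('Demonstrate the searx plugin interface')
    default_on = False


    def on_result(request, ctx):
        # called once per search result; ctx is the caller's locals(),
        # so the current result dict is reachable as ctx['result']
        ctx['result'].setdefault('example', True)
        return True  # the https_rewrite plugin below always returns True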
diff --git a/searx/plugins/https_rewrite.py b/searx/plugins/https_rewrite.py
@@ -0,0 +1,227 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
+import re
+from urlparse import urlparse
+from lxml import etree
+from os import listdir, environ
+from os.path import isfile, isdir, join
+from searx.plugins import logger
+from flask.ext.babel import gettext
+from searx import searx_dir
+
+
+name = "HTTPS rewrite"
+description = gettext('Rewrite HTTP links to HTTPS if possible')
+default_on = True
+
+if 'SEARX_HTTPS_REWRITE_PATH' in environ:
+    rules_path = environ['SEARX_HTTPS_REWRITE_PATH']
+else:
+ rules_path = join(searx_dir, 'plugins/https_rules')
+
+logger = logger.getChild("https_rewrite")
+
+# https://gitweb.torproject.org/\
+# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
+
+# HTTPS rewrite rules
+https_rules = []
+
+
+# load a single ruleset from an xml file
+def load_single_https_ruleset(filepath):
+ ruleset = ()
+
+ # init parser
+ parser = etree.XMLParser()
+
+    # load and parse the xml file
+    try:
+        tree = etree.parse(filepath, parser)
+    except Exception:
+        # TODO: emit a proper error message
+        return ()
+
+ # get root node
+ root = tree.getroot()
+
+ # check if root is a node with the name ruleset
+ # TODO improve parsing
+ if root.tag != 'ruleset':
+ return ()
+
+ # check if rule is deactivated by default
+ if root.attrib.get('default_off'):
+ return ()
+
+    # check if the rule only works on specific platforms
+ if root.attrib.get('platform'):
+ return ()
+
+ hosts = []
+ rules = []
+ exclusions = []
+
+    # parse the children of the ruleset
+    for element in root:
+        # this child defines a target
+        if element.tag == 'target':
+            # check if the required host attribute is available
+            if not element.attrib.get('host'):
+                continue
+
+            # convert the host rule to a valid regex
+            host = element.attrib.get('host')\
+                .replace('.', r'\.').replace('*', '.*')
+
+            # append to host list
+            hosts.append(host)
+
+        # this child defines a rule
+        elif element.tag == 'rule':
+            # check if the required from/to attributes are available
+            if not element.attrib.get('from')\
+               or not element.attrib.get('to'):
+                continue
+
+            # TODO: hack which converts a javascript regex group
+            # into a valid python regex group
+            rule_from = element.attrib['from'].replace('$', '\\')
+            if rule_from.endswith('\\'):
+                rule_from = rule_from[:-1] + '$'
+            rule_to = element.attrib['to'].replace('$', '\\')
+            if rule_to.endswith('\\'):
+                rule_to = rule_to[:-1] + '$'
+
+            # TODO: not working yet because of the hack above,
+            # currently doing that in webapp.py
+            # rule_from_rgx = re.compile(rule_from, re.I)
+
+            # append rule
+            try:
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
+            except re.error:
+                # TODO: log the regex error
+                continue
+
+        # this child defines an exclusion
+        elif element.tag == 'exclusion':
+            # check if the required pattern attribute is available
+            if not element.attrib.get('pattern'):
+                continue
+
+            exclusion_rgx = re.compile(element.attrib.get('pattern'))
+
+            # append exclusion
+            exclusions.append(exclusion_rgx)
+
+ # convert list of possible hosts to a simple regex
+ # TODO compress regex to improve performance
+ try:
+ target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
+    except re.error:
+ return ()
+
+ # return ruleset
+ return (target_hosts, rules, exclusions)
+
+
+# load all https rewrite rules
+def load_https_rules(rules_path):
+ # check if directory exists
+ if not isdir(rules_path):
+ logger.error("directory not found: '" + rules_path + "'")
+ return
+
+ # search all xml files which are stored in the https rule directory
+ xml_files = [join(rules_path, f)
+ for f in listdir(rules_path)
+                 if isfile(join(rules_path, f)) and f.endswith('.xml')]
+
+ # load xml-files
+ for ruleset_file in xml_files:
+ # calculate rewrite-rules
+ ruleset = load_single_https_ruleset(ruleset_file)
+
+ # skip if no ruleset returned
+ if not ruleset:
+ continue
+
+ # append ruleset
+ https_rules.append(ruleset)
+
+ logger.info('{n} rules loaded'.format(n=len(https_rules)))
+
+
+def https_url_rewrite(result):
+ skip_https_rewrite = False
+ # check if HTTPS rewrite is possible
+ for target, rules, exclusions in https_rules:
+
+        # check if the target regex matches the url
+ if target.match(result['parsed_url'].netloc):
+ # process exclusions
+ for exclusion in exclusions:
+                # check if the exclusion matches the url
+ if exclusion.match(result['url']):
+ skip_https_rewrite = True
+ break
+
+ # skip https rewrite if required
+ if skip_https_rewrite:
+ break
+
+ # process rules
+ for rule in rules:
+ try:
+ new_result_url = rule[0].sub(rule[1], result['url'])
+            except Exception:
+ break
+
+ # parse new url
+ new_parsed_url = urlparse(new_result_url)
+
+            # continue if nothing was rewritten
+ if result['url'] == new_result_url:
+ continue
+
+            # get the domain name from the result
+            # TODO: only works correctly with TLDs like
+            # asdf.com, not with asdf.com.de
+            # TODO: use publicsuffix instead of this rewrite heuristic
+ old_result_domainname = '.'.join(
+ result['parsed_url'].hostname.split('.')[-2:])
+ new_result_domainname = '.'.join(
+ new_parsed_url.hostname.split('.')[-2:])
+
+ # check if rewritten hostname is the same,
+ # to protect against wrong or malicious rewrite rules
+ if old_result_domainname == new_result_domainname:
+ # set new url
+ result['url'] = new_result_url
+
+            # the target has matched, do not check the other rulesets
+ break
+ return result
+
+
+def on_result(request, ctx):
+ result = ctx['result']
+ if result['parsed_url'].scheme == 'http':
+ https_url_rewrite(result)
+ return True
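
The rulesets consumed by load_https_rules() follow the HTTPS Everywhere XML schema implied by the parser above: a ruleset root with target, rule and exclusion children. A standalone sketch of the load-and-rewrite flow, using a hypothetical example.com ruleset rather than one of the real files shipped under searx/plugins/https_rules/:

    import re
    from lxml import etree

    ruleset_xml = """
    <ruleset name="Example (hypothetical)">
      <target host="example.com" />
      <target host="*.example.com" />
      <rule from="^http://(\\w+)\\.example\\.com/"
            to="https://$1.example.com/" />
      <exclusion pattern="^http://plain\\.example\\.com/" />
    </ruleset>
    """

    root = etree.fromstring(ruleset_xml)

    # target hosts are globs; convert them exactly as the loader does
    hosts = [t.attrib['host'].replace('.', r'\.').replace('*', '.*')
             for t in root if t.tag == 'target']
    target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)

    # convert the javascript-style group reference ($1) to python's (\1)
    rule = root.find('rule')
    rule_to = rule.attrib['to'].replace('$', '\\')
    if rule_to.endswith('\\'):
        rule_to = rule_to[:-1] + '$'
    rule_from_rgx = re.compile(rule.attrib['from'], re.I | re.U)

    url = 'http://www.example.com/'
    if target_hosts.match('www.example.com'):
        print(rule_from_rgx.sub(rule_to, url))
        # -> https://www.example.com/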
diff --git a/searx/https_rules/00README b/searx/plugins/https_rules/00README
diff --git a/searx/https_rules/Bing.xml b/searx/plugins/https_rules/Bing.xml
diff --git a/searx/https_rules/Dailymotion.xml b/searx/plugins/https_rules/Dailymotion.xml
diff --git a/searx/https_rules/Deviantart.xml b/searx/plugins/https_rules/Deviantart.xml
diff --git a/searx/https_rules/DuckDuckGo.xml b/searx/plugins/https_rules/DuckDuckGo.xml
diff --git a/searx/https_rules/Flickr.xml b/searx/plugins/https_rules/Flickr.xml
diff --git a/searx/https_rules/Github-Pages.xml b/searx/plugins/https_rules/Github-Pages.xml
diff --git a/searx/https_rules/Github.xml b/searx/plugins/https_rules/Github.xml
diff --git a/searx/https_rules/Google-mismatches.xml b/searx/plugins/https_rules/Google-mismatches.xml
diff --git a/searx/https_rules/Google.org.xml b/searx/plugins/https_rules/Google.org.xml
diff --git a/searx/https_rules/GoogleAPIs.xml b/searx/plugins/https_rules/GoogleAPIs.xml
diff --git a/searx/https_rules/GoogleCanada.xml b/searx/plugins/https_rules/GoogleCanada.xml
diff --git a/searx/https_rules/GoogleImages.xml b/searx/plugins/https_rules/GoogleImages.xml
diff --git a/searx/https_rules/GoogleMainSearch.xml b/searx/plugins/https_rules/GoogleMainSearch.xml
diff --git a/searx/https_rules/GoogleMaps.xml b/searx/plugins/https_rules/GoogleMaps.xml
diff --git a/searx/https_rules/GoogleMelange.xml b/searx/plugins/https_rules/GoogleMelange.xml
diff --git a/searx/https_rules/GoogleSearch.xml b/searx/plugins/https_rules/GoogleSearch.xml
diff --git a/searx/https_rules/GoogleServices.xml b/searx/plugins/https_rules/GoogleServices.xml
diff --git a/searx/https_rules/GoogleShopping.xml b/searx/plugins/https_rules/GoogleShopping.xml
diff --git a/searx/https_rules/GoogleSorry.xml b/searx/plugins/https_rules/GoogleSorry.xml
diff --git a/searx/https_rules/GoogleTranslate.xml b/searx/plugins/https_rules/GoogleTranslate.xml
diff --git a/searx/https_rules/GoogleVideos.xml b/searx/plugins/https_rules/GoogleVideos.xml
diff --git a/searx/https_rules/GoogleWatchBlog.xml b/searx/plugins/https_rules/GoogleWatchBlog.xml
diff --git a/searx/https_rules/Google_App_Engine.xml b/searx/plugins/https_rules/Google_App_Engine.xml
diff --git a/searx/https_rules/Googleplex.com.xml b/searx/plugins/https_rules/Googleplex.com.xml
diff --git a/searx/https_rules/OpenStreetMap.xml b/searx/plugins/https_rules/OpenStreetMap.xml
diff --git a/searx/https_rules/Rawgithub.com.xml b/searx/plugins/https_rules/Rawgithub.com.xml
diff --git a/searx/https_rules/Soundcloud.xml b/searx/plugins/https_rules/Soundcloud.xml
diff --git a/searx/https_rules/ThePirateBay.xml b/searx/plugins/https_rules/ThePirateBay.xml
diff --git a/searx/https_rules/Torproject.xml b/searx/plugins/https_rules/Torproject.xml
diff --git a/searx/https_rules/Twitter.xml b/searx/plugins/https_rules/Twitter.xml
diff --git a/searx/https_rules/Vimeo.xml b/searx/plugins/https_rules/Vimeo.xml
diff --git a/searx/https_rules/WikiLeaks.xml b/searx/plugins/https_rules/WikiLeaks.xml
diff --git a/searx/https_rules/Wikimedia.xml b/searx/plugins/https_rules/Wikimedia.xml
diff --git a/searx/https_rules/Yahoo.xml b/searx/plugins/https_rules/Yahoo.xml
diff --git a/searx/https_rules/YouTube.xml b/searx/plugins/https_rules/YouTube.xml
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -6,7 +6,6 @@ server:
base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/"
themes_path : "" # Custom ui themes path - leave it blank if you didn't change
default_theme : oscar # ui theme
- https_rewrite : True # Force rewrite result urls. See searx/https_rewrite.py
useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator
image_proxy : False # Proxying image results through searx
default_locale : "" # Default interface locale - leave blank to detect from browser information or use codes from the 'locales' config section
diff --git a/searx/webapp.py b/searx/webapp.py
@@ -59,7 +59,6 @@ from searx.utils import (
)
from searx.version import VERSION_STRING
from searx.languages import language_codes
-from searx.https_rewrite import https_url_rewrite
from searx.search import Search
from searx.query import Query
from searx.autocomplete import searx_bang, backends as autocomplete_backends
@@ -359,15 +358,10 @@ def index():
for result in search.results:
+ plugins.call('on_result', request, locals())
if not search.paging and engines[result['engine']].paging:
search.paging = True
- # check if HTTPS rewrite is required
- if settings['server']['https_rewrite']\
- and result['parsed_url'].scheme == 'http':
-
- result = https_url_rewrite(result)
-
if search.request_data.get('format', 'html') == 'html':
if 'content' in result:
result['content'] = highlight_content(result['content'],
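
PluginStore.call() itself is not shown in this diff; a plausible sketch of the dispatch it performs, inferred purely from the call site above and from the on_result(request, ctx) hook signature (an assumption, not the verbatim implementation in searx/plugins/__init__.py):

    class PluginStore(object):

        def __init__(self):
            self.plugins = []

        def register(self, *plugins):
            for plugin in plugins:
                self.plugins.append(plugin)

        def call(self, plugin_type, request, ctx):
            # invoke the named hook on every plugin that defines it
            ret = True
            for plugin in self.plugins:
                if hasattr(plugin, plugin_type):
                    ret = getattr(plugin, plugin_type)(request, ctx)
                    if not ret:
                        break
            return ret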