logo

searx

My custom branch(es) on searx, a meta-search engine
commit: f14177381433618a5b4f5bcff83e4e1a19487f02
parent: 813247b37ab00a1496468df4cff33199ae04d6b4
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Fri, 19 Dec 2014 22:40:37 +0100

[mod][fix] https rewrite refactor ++ fixes

Diffstat:

M searx/https_rewrite.py           | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M searx/https_rules/Soundcloud.xml |  2 +-
M searx/webapp.py                  | 59 ++---------------------------------------------------------
3 files changed, 68 insertions(+), 61 deletions(-)

diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py @@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. ''' import re +from urlparse import urlparse from lxml import etree from os import listdir from os.path import isfile, isdir, join @@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath): # TODO hack, which convert a javascript regex group # into a valid python regex group - rule_from = ruleset.attrib.get('from').replace('$', '\\') - rule_to = ruleset.attrib.get('to').replace('$', '\\') + rule_from = ruleset.attrib['from'].replace('$', '\\') + if rule_from.endswith('\\'): + rule_from = rule_from[:-1]+'$' + rule_to = ruleset.attrib['to'].replace('$', '\\') + if rule_to.endswith('\\'): + rule_to = rule_to[:-1]+'$' # TODO, not working yet because of the hack above, # currently doing that in webapp.py # rule_from_rgx = re.compile(rule_from, re.I) # append rule - rules.append((rule_from, rule_to)) + try: + rules.append((re.compile(rule_from, re.I | re.U), rule_to)) + except: + # TODO log regex error + continue # this child define an exclusion elif ruleset.tag == 'exclusion': @@ -143,3 +152,56 @@ def load_https_rules(rules_path): https_rules.append(ruleset) print(' * {n} https-rules loaded'.format(n=len(https_rules))) + + + +def https_url_rewrite(result): + skip_https_rewrite = False + # check if HTTPS rewrite is possible + for target, rules, exclusions in https_rules: + + # check if target regex match with url + if target.match(result['parsed_url'].netloc): + # process exclusions + for exclusion in exclusions: + # check if exclusion match with url + if exclusion.match(result['url']): + skip_https_rewrite = True + break + + # skip https rewrite if required + if skip_https_rewrite: + break + + # process rules + for rule in rules: + try: + new_result_url = rule[0].sub(rule[1], result['url']) + except: + break + + # parse new url + new_parsed_url = urlparse(new_result_url) + + # continiue if nothing was rewritten + if 
result['url'] == new_result_url: + continue + + # get domainname from result + # TODO, does only work correct with TLD's like + # asdf.com, not for asdf.com.de + # TODO, using publicsuffix instead of this rewrite rule + old_result_domainname = '.'.join( + result['parsed_url'].hostname.split('.')[-2:]) + new_result_domainname = '.'.join( + new_parsed_url.hostname.split('.')[-2:]) + + # check if rewritten hostname is the same, + # to protect against wrong or malicious rewrite rules + if old_result_domainname == new_result_domainname: + # set new url + result['url'] = new_result_url + + # target has matched, do not search over the other rules + break + return result diff --git a/searx/https_rules/Soundcloud.xml b/searx/https_rules/Soundcloud.xml @@ -89,7 +89,7 @@ <rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/" to="https://$1.sndcdn.com/" /> - <rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/" + <rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/" to="https://$1soundcloud.com/" /> <rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/" diff --git a/searx/webapp.py b/searx/webapp.py @@ -41,15 +41,12 @@ from searx.utils import ( UnicodeWriter, highlight_content, html_to_text, get_themes ) from searx.version import VERSION_STRING -from searx.https_rewrite import https_rules from searx.languages import language_codes +from searx.https_rewrite import https_url_rewrite from searx.search import Search from searx.query import Query from searx.autocomplete import backends as autocomplete_backends -from urlparse import urlparse -import re - static_path, templates_path, themes =\ get_themes(settings['themes_path'] @@ -215,59 +212,7 @@ def index(): if settings['server']['https_rewrite']\ and result['parsed_url'].scheme == 'http': - skip_https_rewrite = False - - # check if HTTPS rewrite is possible - 
for target, rules, exclusions in https_rules: - - # check if target regex match with url - if target.match(result['url']): - # process exclusions - for exclusion in exclusions: - # check if exclusion match with url - if exclusion.match(result['url']): - skip_https_rewrite = True - break - - # skip https rewrite if required - if skip_https_rewrite: - break - - # process rules - for rule in rules: - try: - # TODO, precompile rule - p = re.compile(rule[0]) - - # rewrite url if possible - new_result_url = p.sub(rule[1], result['url']) - except: - break - - # parse new url - new_parsed_url = urlparse(new_result_url) - - # continiue if nothing was rewritten - if result['url'] == new_result_url: - continue - - # get domainname from result - # TODO, does only work correct with TLD's like - # asdf.com, not for asdf.com.de - # TODO, using publicsuffix instead of this rewrite rule - old_result_domainname = '.'.join( - result['parsed_url'].hostname.split('.')[-2:]) - new_result_domainname = '.'.join( - new_parsed_url.hostname.split('.')[-2:]) - - # check if rewritten hostname is the same, - # to protect against wrong or malicious rewrite rules - if old_result_domainname == new_result_domainname: - # set new url - result['url'] = new_result_url - - # target has matched, do not search over the other rules - break + result = https_url_rewrite(result) if search.request_data.get('format', 'html') == 'html': if 'content' in result: