
searx

My custom branch(es) on searx, a meta-search engine

git clone https://hacktivis.me/git/searx.git

https_rewrite.py (7155B)


'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import re
import sys
from lxml import etree
from os import listdir, environ
from os.path import isfile, isdir, join
from searx.plugins import logger
from flask_babel import gettext
from searx import searx_dir
from searx.url_utils import urlparse

if sys.version_info[0] == 3:
    unicode = str

name = "HTTPS rewrite"
description = gettext('Rewrite HTTP links to HTTPS if possible')
default_on = True
preference_section = 'privacy'

if 'SEARX_HTTPS_REWRITE_PATH' in environ:
    rules_path = environ['SEARX_HTTPS_REWRITE_PATH']
else:
    rules_path = join(searx_dir, 'plugins/https_rules')

logger = logger.getChild("https_rewrite")

# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules
https_rules = []
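
# Example (illustrative): a minimal ruleset in the HTTPS Everywhere xml
# format that load_single_https_ruleset() below understands:
#
#   <ruleset name="Example">
#     <target host="example.com" />
#     <target host="*.example.com" />
#     <exclusion pattern="^http://example\.com/insecure/" />
#     <rule from="^http://(www\.)?example\.com/"
#           to="https://$1example.com/" />
#   </ruleset>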


# load a single ruleset from an xml file
def load_single_https_ruleset(rules_path):
    ruleset = ()

    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(rules_path, parser)
    except:
        # TODO, error message
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if the rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if the rule only works on specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse the children of the ruleset
    for ruleset in root:
        # this child defines a target
        if ruleset.tag == 'target':
            # check if the required attributes are available
            if not ruleset.attrib.get('host'):
                continue

            # convert host-rule to a valid regex
            host = ruleset.attrib.get('host')\
                .replace('.', r'\.').replace('*', '.*')

            # append to host list
            hosts.append(host)

        # this child defines a rule
        elif ruleset.tag == 'rule':
            # check if the required attributes are available
            if not ruleset.attrib.get('from')\
               or not ruleset.attrib.get('to'):
                continue

            # TODO hack, which converts a javascript regex group
            # into a valid python regex group
            rule_from = ruleset.attrib['from'].replace('$', '\\')
            if rule_from.endswith('\\'):
                rule_from = rule_from[:-1] + '$'
            rule_to = ruleset.attrib['to'].replace('$', '\\')
            if rule_to.endswith('\\'):
                rule_to = rule_to[:-1] + '$'
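
            # Example (illustrative): the replace() calls above turn the
            # JavaScript-style replacement 'https://$1example.com/' into
            # 'https://\1example.com/', the group syntax re.sub() expects;
            # a trailing '$' anchor in the pattern first becomes '\' and
            # is then restored by the endswith() checks.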

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append rule
            try:
                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
            except:
                # TODO log regex error
                continue

        # this child defines an exclusion
        elif ruleset.tag == 'exclusion':
            # check if the required attributes are available
            if not ruleset.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))

            # append exclusion
            exclusions.append(exclusion_rgx)

    # convert the list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except:
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)
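

# Example (illustrative): for the sample ruleset above, hosts ends up as
# ['example\\.com', '.*\\.example\\.com'], so target_hosts compiles to
# r'^(example\.com|.*\.example\.com)' with re.I | re.U.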


# load all https rewrite rules
def load_https_rules(rules_path):
    # check if directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # search all xml files which are stored in the https rule directory
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f[-4:] == '.xml']

    # load xml-files
    for ruleset_file in xml_files:
        # calculate rewrite-rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no ruleset returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))
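
# Example (illustrative): with the default rules_path this scans
# <searx_dir>/plugins/https_rules/ and loads every file ending in '.xml',
# e.g. a file containing the sample ruleset shown earlier.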


def https_url_rewrite(result):
    skip_https_rewrite = False

    # check if an HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if the target regex matches the url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:

                # check if the exclusion matches the url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get the domain name from the result
                # TODO, only works correctly with TLDs like
                #  asdf.com, not for asdf.com.de
                # TODO, use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if the rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set the new url
                    result['url'] = new_result_url

                # the target has matched, do not search the other rules
                break

    return result
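
# Example (illustrative): with the sample rule above loaded,
#
#   result = {'url': 'http://www.example.com/page',
#             'parsed_url': urlparse('http://www.example.com/page')}
#   https_url_rewrite(result)
#
# rewrites result['url'] to 'https://www.example.com/page'; the rewrite is
# kept because the second-level domain ('example.com') is unchanged.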


def on_result(request, search, result):
    if result['parsed_url'].scheme == 'http':
        https_url_rewrite(result)
    return True


load_https_rules(rules_path)
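
# Usage sketch (illustrative): how the on_result hook above would fire for
# a single http:// result; request and search are unused by this plugin.
#
#   result = {'url': 'http://example.com/',
#             'parsed_url': urlparse('http://example.com/')}
#   on_result(None, None, result)
#   # result['url'] is now 'https://example.com/' if a loaded rule matched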