logo

searx

My custom branch(es) of searx, a meta-search engine. git clone https://hacktivis.me/git/searx.git

results.py (11359B)


  1. import re
  2. import sys
  3. from collections import defaultdict
  4. from operator import itemgetter
  5. from threading import RLock
  6. from searx.engines import engines
  7. from searx.url_utils import urlparse, unquote
  8. if sys.version_info[0] == 3:
  9. basestring = str
  10. CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
  11. WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
  12. # return the meaningful length of the content for a result
  13. def result_content_len(content):
  14. if isinstance(content, basestring):
  15. return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
  16. else:
  17. return 0
  18. def compare_urls(url_a, url_b):
  19. # ignore www. in comparison
  20. if url_a.netloc.startswith('www.'):
  21. host_a = url_a.netloc.replace('www.', '', 1)
  22. else:
  23. host_a = url_a.netloc
  24. if url_b.netloc.startswith('www.'):
  25. host_b = url_b.netloc.replace('www.', '', 1)
  26. else:
  27. host_b = url_b.netloc
  28. if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
  29. return False
  30. # remove / from the end of the url if required
  31. path_a = url_a.path[:-1]\
  32. if url_a.path.endswith('/')\
  33. else url_a.path
  34. path_b = url_b.path[:-1]\
  35. if url_b.path.endswith('/')\
  36. else url_b.path
  37. return unquote(path_a) == unquote(path_b)
def merge_two_infoboxes(infobox1, infobox2):
    """Merge *infobox2* into *infobox1*, in place.

    The engine weight (``engines[name].weight``, defaulting to 1) decides
    which box wins on conflicting single-valued fields (engine name,
    ``img_src``); URLs and attributes are unioned with deduplication.
    """
    # get engines weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    # the heavier engine becomes the displayed source of the merged box
    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        # append only urls not already present; compare_urls ignores the
        # scheme, a leading "www." and a trailing slash, so near-identical
        # links do not duplicate
        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        # take infobox2's image when infobox1 has none, or when infobox2
        # comes from the heavier engine
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        # attributes are deduplicated by their 'label' value
        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        # keep whichever content has the larger meaningful length;
        # if infobox1 had none, always take infobox2's
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
  90. def result_score(result):
  91. weight = 1.0
  92. for result_engine in result['engines']:
  93. if hasattr(engines[result_engine], 'weight'):
  94. weight *= float(engines[result_engine].weight)
  95. occurences = len(result['positions'])
  96. return sum((occurences * weight) / position for position in result['positions'])
class ResultContainer(object):
    """Collect, deduplicate and order the results of all engines for one query."""

    def __init__(self):
        super(ResultContainer, self).__init__()
        self.results = defaultdict(list)   # raw results, keyed by engine name
        self._merged_results = []          # deduplicated results across engines
        self.infoboxes = []
        self.suggestions = set()
        self.answers = set()
        self.corrections = set()
        self._number_of_results = []       # per-engine reported totals
        self._ordered = False              # True once order_results() has run
        self.paging = False                # True if any responding engine pages
        self.unresponsive_engines = set()

    def extend(self, engine_name, results):
        """Add *results* from *engine_name*.

        One-field special results (suggestion / answer / correction /
        infobox / number_of_results) are routed to their own containers
        and removed from *results*; the remainder is merged into the
        deduplicated result list.
        """
        # iterate over a copy because special results are removed from `results`
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
                results.remove(result)
            elif 'answer' in result:
                self.answers.add(result['answer'])
                results.remove(result)
            elif 'correction' in result:
                self.corrections.add(result['correction'])
                results.remove(result)
            elif 'infobox' in result:
                self._merge_infobox(result)
                results.remove(result)
            elif 'number_of_results' in result:
                self._number_of_results.append(result['number_of_results'])
                results.remove(result)

        # NOTE(review): `with RLock():` creates a fresh lock on every call,
        # so it does not synchronize with other threads — verify intent
        if engine_name in engines:
            with RLock():
                engines[engine_name].stats['search_count'] += 1
                engines[engine_name].stats['result_count'] += len(results)

        if not results:
            return

        self.results[engine_name].extend(results)

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

        for i, result in enumerate(results):
            # skip results whose url/title/content are not strings
            if 'url' in result and not isinstance(result['url'], basestring):
                continue
            # best-effort bytes -> unicode conversion of the url (Python 2)
            # NOTE(review): bare except silently swallows any failure here
            try:
                result['url'] = result['url'].decode('utf-8')
            except:
                pass
            if 'title' in result and not isinstance(result['title'], basestring):
                continue
            if 'content' in result and not isinstance(result['content'], basestring):
                continue
            # positions are 1-based ranks within this engine's result page
            position = i + 1
            self._merge_result(result, position)

    def _merge_infobox(self, infobox):
        """Merge *infobox* into an existing box with the same id, else append it."""
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            for existingIndex in self.infoboxes:
                # ids are compared as URLs (www./trailing-slash insensitive)
                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
                    merge_two_infoboxes(existingIndex, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _merge_result(self, result, position):
        """Insert *result* into _merged_results, merging it with a duplicate
        (same URL and template) when one is already present."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = set([result['engine']])

        # strip multiple spaces and cariage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates
        duplicated = False
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
                    and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # merge all result's parameters not found in duplicate
            for key in result.keys():
                if not duplicated.get(key):
                    duplicated[key] = result.get(key)

            # add the new position
            duplicated['positions'].append(position)

            # add engine to list of result-engines
            duplicated['engines'].add(result['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if there is no duplicate found, append result
        else:
            result['positions'] = [position]
            # NOTE(review): per-call RLock() — see extend()
            with RLock():
                self._merged_results.append(result)

    def order_results(self):
        """Score and sort _merged_results, then regroup them so results of the
        same category/template stay together; marks the container ordered."""
        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
                    engines[result_engine].stats['score_count'] += score

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for i, res in enumerate(results):
            # FIXME : handle more than one category per engine
            res['category'] = engines[res['engine']].categories[0]

            # FIXME : handle more than one category per engine
            category = engines[res['engine']].categories[0]\
                + ':' + res.get('template', '')\
                + ':' + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')

            current = None if category not in categoryPositions\
                else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more result and is not too far
            # from the current position
            if current is not None and (current['count'] > 0)\
                    and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # same category
                gresults.append(res)

                # update categoryIndex; a group accepts up to 8 more results
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._ordered = True
        self._merged_results = gresults

    def get_ordered_results(self):
        """Return the merged results, ordering them first if needed."""
        if not self._ordered:
            self.order_results()
        return self._merged_results

    def results_length(self):
        """Number of merged (deduplicated) results."""
        return len(self._merged_results)

    def results_number(self):
        """Average of the totals reported by the engines, or 0 if none.

        NOTE(review): true division — returns a float on Python 3.
        """
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0
        return resultnum_sum / len(self._number_of_results)

    def add_unresponsive_engine(self, engine_error):
        """Record an engine (and its error) that failed to respond."""
        self.unresponsive_engines.add(engine_error)