commit: 9688495b9fd3f03c2faaf95ec23f03b9533682b1
parent: fa6b980edd44ea0d62b4dcad89ddbde8846b1c67
Author: asciimoo <asciimoo@gmail.com>
Date: Tue, 12 Nov 2013 21:34:28 +0100
[mod] result scoring separated
Diffstat:
1 file changed, 43 insertions(+), 38 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
@@ -123,6 +123,46 @@ def highlight_content(content, query):
return content
+def score_results(results):
+ flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
+ flat_len = len(flat_res)
+ engines_len = len(results)
+ results = []
+ # deduplication + scoring
+ for i,res in enumerate(flat_res):
+ res['parsed_url'] = urlparse(res['url'])
+ res['engines'] = [res['engine']]
+ weight = 1.0
+ if hasattr(engines[res['engine']], 'weight'):
+ weight = float(engines[res['engine']].weight)
+ elif res['engine'] in settings.weights:
+ weight = float(settings.weights[res['engine']])
+ score = int((flat_len - i)/engines_len)*weight+1
+ duplicated = False
+ for new_res in results:
+ p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
+ p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
+ if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+ p1 == p2 and\
+ res['parsed_url'].query == new_res['parsed_url'].query and\
+ res.get('template') == new_res.get('template'):
+ duplicated = new_res
+ break
+ if duplicated:
+ if len(res.get('content', '')) > len(duplicated.get('content', '')):
+ duplicated['content'] = res['content']
+ duplicated['score'] += score
+ duplicated['engines'].append(res['engine'])
+ if duplicated['parsed_url'].scheme == 'https':
+ continue
+ elif res['parsed_url'].scheme == 'https':
+ duplicated['url'] = res['parsed_url'].geturl()
+ duplicated['parsed_url'] = res['parsed_url']
+ else:
+ res['score'] = score
+ results.append(res)
+ return sorted(results, key=itemgetter('score'), reverse=True)
+
def search(query, request, selected_engines):
global engines, categories, number_of_searches
requests = []
@@ -165,43 +205,8 @@ def search(query, request, selected_engines):
for engine_name,engine_results in results.items():
engines[engine_name].stats['search_count'] += 1
engines[engine_name].stats['result_count'] += len(engine_results)
- flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
- flat_len = len(flat_res)
- engines_len = len(selected_engines)
- results = []
- # deduplication + scoring
- for i,res in enumerate(flat_res):
- res['parsed_url'] = urlparse(res['url'])
- res['engines'] = [res['engine']]
- weight = 1.0
- if hasattr(engines[res['engine']], 'weight'):
- weight = float(engines[res['engine']].weight)
- elif res['engine'] in settings.weights:
- weight = float(settings.weights[res['engine']])
- score = int((flat_len - i)/engines_len)*weight+1
- duplicated = False
- for new_res in results:
- p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
- p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
- if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
- p1 == p2 and\
- res['parsed_url'].query == new_res['parsed_url'].query and\
- res.get('template') == new_res.get('template'):
- duplicated = new_res
- break
- if duplicated:
- if len(res.get('content', '')) > len(duplicated.get('content', '')):
- duplicated['content'] = res['content']
- duplicated['score'] += score
- duplicated['engines'].append(res['engine'])
- if duplicated['parsed_url'].scheme == 'https':
- continue
- elif res['parsed_url'].scheme == 'https':
- duplicated['url'] = res['parsed_url'].geturl()
- duplicated['parsed_url'] = res['parsed_url']
- else:
- res['score'] = score
- results.append(res)
+
+ results = score_results(results)
for result in results:
if 'content' in result:
@@ -209,7 +214,7 @@ def search(query, request, selected_engines):
for res_engine in result['engines']:
engines[result['engine']].stats['score_count'] += result['score']
- return sorted(results, key=itemgetter('score'), reverse=True)
+ return results
def get_engines_stats():
pageloads = []