logo

searx

My custom branch(es) of searx, a meta-search engine
commit: 70cbc09e9390d02686882786c20c201b3a08edef
parent: 34941aca3f3c9e204309dbf0426b932e35412238
Author: asciimoo <asciimoo@gmail.com>
Date:   Sat, 19 Oct 2013 17:36:44 +0200

[enh] better url comparison

Diffstat:

M searx/engines/__init__.py | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
@@ -22,6 +22,7 @@ from imp import load_source
 import grequests
 from itertools import izip_longest, chain
 from operator import itemgetter
+from urlparse import urlparse
 
 engine_dir = dirname(realpath(__file__))
 
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
     results = []
     # deduplication + scoring
     for i,res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
         score = flat_len - i
         duplicated = False
         for new_res in results:
-            if res['url'] == new_res['url']:
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+               res['parsed_url'].path == new_res['parsed_url'].path:
                 duplicated = new_res
                 break
         if duplicated:
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
                 duplicated['content'] = res['content']
             duplicated['score'] += score
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['parsed_url'].scheme == 'https'
+                duplicated['url'] = duplicated['parsed_url'].geturl()
         else:
             res['score'] = score
             results.append(res)