logo

searx

My custom branch(es) on searx, a meta-search engine
commit: a07b2b514c38cff031e0e36b99878a6041873842
parent: cebf5868b33659d9b66f949c9c78f4101bd3b925
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Sat,  5 Jul 2014 17:33:19 +0200

[fix] url path unquoted check to avoid duplications

Diffstat:

M searx/engines/__init__.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
@@ -21,7 +21,7 @@ import sys
 from imp import load_source
 from itertools import izip_longest, chain
 from operator import itemgetter
-from urlparse import urlparse
+from urlparse import urlparse, unquote
 from datetime import datetime
 import grequests
 from flask.ext.babel import gettext
@@ -153,7 +153,9 @@ def score_results(results):
     results = []
     # deduplication + scoring
     for i, res in enumerate(flat_res):
+
         res['parsed_url'] = urlparse(res['url'])
+
         res['host'] = res['parsed_url'].netloc
 
         if res['host'].startswith('www.'):
@@ -172,7 +174,7 @@ def score_results(results):
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
             if res['host'] == new_res['host'] and\
-               p1 == p2 and\
+               unquote(p1) == unquote(p2) and\
                res['parsed_url'].query == new_res['parsed_url'].query and\
                res.get('template') == new_res.get('template'):
                 duplicated = new_res
@@ -222,6 +224,10 @@ def search(query, request, selected_engines, pageno=1, lang='all'):
         request_params['language'] = lang
         request_params = engine.request(query.encode('utf-8'), request_params)
 
+        if request_params['url'] is None:
+            # TODO add support of offline engines
+            pass
+
         callback = make_callback(
             selected_engine['name'],
             results,