commit: 6b058962e1f87a17ce2d9c2bcb4faa73df285df3
parent bd2db71fa6921a757ff5df559535092f45010652
Author: Dalf <alex@al-f.net>
Date: Mon, 22 Sep 2014 22:55:51 +0200
[fix] when two results are merged, really use the content with more text
Diffstat:
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/searx/search.py b/searx/search.py
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
'''
import grequests
+import re
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
@@ -76,6 +77,13 @@ def make_callback(engine_name, results, suggestions, callback, params):
return process_callback
+# return the meaningful length of the content for a result
def content_result_len(result):
    """Return the meaningful length of a result's ``content`` field.

    Punctuation and separator characters (``, ; : ! ? . / \\ ( ) - _`` and
    the space) are removed before counting, so that padding made of such
    characters does not make one content look longer than another.

    :param result: mapping describing a search result; only the
        ``'content'`` key is read.
    :returns: number of remaining characters, or ``0`` when ``'content'``
        is missing or is not a string.
    """
    content = result.get('content')

    # ``basestring`` only exists on Python 2 (covers str and unicode);
    # fall back to ``str`` elsewhere so the check keeps working.
    try:
        text_types = basestring
    except NameError:
        text_types = str

    if not isinstance(content, text_types):
        return 0

    # The hyphen is placed last in the class so it is a literal '-'.
    # Written as ")-_" it is a range from ')' to '_', which also matches
    # every digit and every uppercase ASCII letter.
    return len(re.sub(r'[,;:!?./\\ ()_-]', '', content))
# score results and remove duplications
def score_results(results):
@@ -110,6 +118,9 @@ def score_results(results):
duplicated = False
# check for duplicates
+ if 'content' in res:
+ res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
+
for new_res in results:
# remove / from the end of the url if required
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
@@ -126,7 +137,7 @@ def score_results(results):
# merge duplicates together
if duplicated:
# using content with more text
- if res.get('content') > duplicated.get('content'):
+ if content_result_len(res) > content_result_len(duplicated):
duplicated['content'] = res['content']
# increase result-score