commit: c2e40142879fcb08291471f0306a793fce63c124
parent a0a1284998946bdc446283552674263240b4fd0f
Author: marc <a01200356@itesm.mx>
Date: Fri, 24 Jun 2016 00:38:17 -0500
[fix] urls merge in infobox (#593)
TODO:
merge attributes
Diffstat:
2 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
@@ -99,9 +99,8 @@ def response(resp):
return []
# link to wikipedia article
- # parenthesis are not quoted to make infobox mergeable with wikidata's
wikipedia_link = url_lang(resp.search_params['language']) \
- + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+ + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
results.append({'url': wikipedia_link, 'title': title})
diff --git a/searx/results.py b/searx/results.py
@@ -18,7 +18,17 @@ def result_content_len(content):
def compare_urls(url_a, url_b):
- if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
+ # ignore www. in comparison
+ if url_a.netloc.startswith('www.'):
+ host_a = url_a.netloc.replace('www.', '', 1)
+ else:
+ host_a = url_a.netloc
+ if url_b.netloc.startswith('www.'):
+ host_b = url_b.netloc.replace('www.', '', 1)
+ else:
+ host_b = url_b.netloc
+
+ if host_a != host_b or url_a.query != url_b.query:
return False
# remove / from the end of the url if required
@@ -37,15 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
urls1 = infobox1.get('urls', None)
if urls1 is None:
urls1 = []
- infobox1['urls'] = urls1
- urlSet = set()
- for url in infobox1.get('urls', []):
- urlSet.add(url.get('url', None))
+ for url2 in infobox2.get('urls', []):
+ unique_url = True
+ for url1 in infobox1.get('urls', []):
+ if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
+ unique_url = False
+ break
+ if unique_url:
+ urls1.append(url2)
- for url in infobox2.get('urls', []):
- if url.get('url', None) not in urlSet:
- urls1.append(url)
+ infobox1['urls'] = urls1
if 'img_src' in infobox2:
img1 = infobox1.get('img_src', None)
@@ -97,7 +109,6 @@ class ResultContainer(object):
self.results = defaultdict(list)
self._merged_results = []
self.infoboxes = []
- self._infobox_ids = {}
self.suggestions = set()
self.answers = set()
self._number_of_results = []
@@ -138,14 +149,13 @@ class ResultContainer(object):
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id is not None:
- existingIndex = self._infobox_ids.get(infobox_id, None)
- if existingIndex is not None:
- merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
- add_infobox = False
+ for existingIndex in self.infoboxes:
+ if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
+ merge_two_infoboxes(existingIndex, infobox)
+ add_infobox = False
if add_infobox:
self.infoboxes.append(infobox)
- self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
def _merge_result(self, result, position):
result['parsed_url'] = urlparse(result['url'])
@@ -155,11 +165,6 @@ class ResultContainer(object):
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
result['url'] = result['parsed_url'].geturl()
- result['host'] = result['parsed_url'].netloc
-
- if result['host'].startswith('www.'):
- result['host'] = result['host'].replace('www.', '', 1)
-
result['engines'] = [result['engine']]
# strip multiple spaces and carriage returns from content