commit: 8d335dbdaedd6113242e785e8fabac86128d069a
parent: 817c74e52317126128c6f8740df01b8bdc51c3cf
Author: a01200356 <a01200356@itesm.mx>
Date: Mon, 14 Mar 2016 00:32:36 -0600
[enh] wikipedia infobox
creates a simple multilingual infobox using Wikipedia's API
Diffstat:
7 files changed, 297 insertions(+), 14 deletions(-)
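For orientation, a rough sketch of the request the new wikipedia engine builds (the query and language below are made-up examples; the URL shape follows from base_url and search_postfix in the diff):

    # illustrative only: a lowercase query also gets a title-cased variant appended
    from searx.engines import wikipedia
    params = wikipedia.request('eiffel tower', {'language': 'fr_FR'})
    # params['url'] ->
    #   https://fr.wikipedia.org/w/api.php?action=query&format=json
    #   &titles=eiffel+tower%7CEiffel+Tower&prop=extracts|pageimages
    #   &exintro&explaintext&pithumbsize=300&redirects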
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -43,3 +43,4 @@ generally made searx better:
- Kang-min Liu
- Kirill Isakov
- Guilhem Bonnefille
+- Marc Abonce Seguin
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
@@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
         results.append({'title': title, 'url': official_website})
 
     wikipedia_link_count = 0
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    wikipedia_link_count += add_url(urls,
+                                    'Wikipedia (' + language + ')',
+                                    wikipedia_link)
     if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
         wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (' + language + ')',
-                                        get_wikilink(result, language +
-                                                     'wiki'))
-    wikipedia_en_link = get_wikilink(result, 'enwiki')
-    wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (en)',
-                                    wikipedia_en_link)
+                                        'Wikipedia (en)',
+                                        wikipedia_en_link)
     if wikipedia_link_count == 0:
         misc_language = get_wiki_firstlanguage(result, 'wiki')
         if misc_language is not None:
@@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     else:
         results.append({
                        'infobox': title,
-                       'id': wikipedia_en_link,
+                       'id': wikipedia_link,
                        'content': description,
                        'attributes': attributes,
                        'urls': urls
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
@@ -0,0 +1,114 @@
+"""
+ Wikipedia (Web)
+
+ @website https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, infobox
+"""
+
+from json import loads
+from urllib import urlencode, quote
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?'\
+ 'action=query'\
+ '&format=json'\
+ '&{query}'\
+ '&prop=extracts|pageimages'\
+ '&exintro'\
+ '&explaintext'\
+ '&pithumbsize=300'\
+ '&redirects'
+
+
+# set language in base_url
+def url_lang(lang):
+ if lang == 'all':
+ language = 'en'
+ else:
+ language = lang.split('_')[0]
+
+ return base_url.format(language=language)
+
+
+# do search-request
+def request(query, params):
+ if query.islower():
+ query += '|' + query.title()
+
+ params['url'] = url_lang(params['language']) \
+ + search_postfix.format(query=urlencode({'titles': query}))
+
+ return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+ first_paragraph = None
+
+ failed_attempts = 0
+ for paragraph in content.split('\n'):
+
+ starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+ length = len(paragraph)
+
+ if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+ first_paragraph = paragraph
+ break
+
+ failed_attempts += 1
+ if failed_attempts > 3:
+ return None
+
+ return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_result = loads(resp.content)
+
+ # wikipedia article's unique id
+ # first valid id is assumed to be the requested article
+ for article_id in search_result['query']['pages']:
+ page = search_result['query']['pages'][article_id]
+ if int(article_id) > 0:
+ break
+
+ if int(article_id) < 0:
+ return []
+
+ title = page.get('title')
+
+ image = page.get('thumbnail')
+ if image:
+ image = image.get('source')
+
+ extract = page.get('extract')
+
+ summary = extract_first_paragraph(extract, title, image)
+ if not summary:
+ return []
+
+ # link to wikipedia article
+    # parentheses are not quoted to make infobox mergeable with wikidata's
+    wikipedia_link = url_lang(resp.search_params['language']) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
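Two hypothetical illustrations of the helpers above (titles and queries are invented; the behaviour is read off the code in this hunk):

    from searx.engines import wikipedia

    # extract_first_paragraph() gives up after four short lines, which is what
    # filters out disambiguation-style extracts
    wikipedia.extract_first_paragraph(
        'Foo can refer to:\nFoo (film)\nFoo (band)\nFoo (song)\nFoo (album)',
        'Foo', None)
    # -> None: each line fails the 150/200-character thresholds

    # the article URL keeps parentheses literal, so a title like
    # 'Python (programming language)' with language 'en' would become
    # https://en.wikipedia.org/wiki/Python_(programming_language)
    # matching the link form the wikidata engine uses, so the infobox ids line up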
diff --git a/searx/results.py b/searx/results.py
@@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2):
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
             urls1 = []
-            infobox1.set('urls', urls1)
+            infobox1['urls'] = urls1
 
         urlSet = set()
         for url in infobox1.get('urls', []):
@@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
         if url.get('url', None) not in urlSet:
             urls1.append(url)
 
+    if 'img_src' in infobox2:
+        img1 = infobox1.get('img_src', None)
+        img2 = infobox2.get('img_src')
+        if img1 is None:
+            infobox1['img_src'] = img2
+
     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
         if attributes1 is None:
             attributes1 = []
-            infobox1.set('attributes', attributes1)
+            infobox1['attributes'] = attributes1
 
         attributeSet = set()
         for attribute in infobox1.get('attributes', []):
@@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2):
             if result_content_len(content2) > result_content_len(content1):
                 infobox1['content'] = content2
         else:
-            infobox1.set('content', content2)
+            infobox1['content'] = content2
 
 
 def result_score(result):
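A minimal sketch of the merge this enables (values are hypothetical; it assumes merge_two_infoboxes can be called directly as shown in the hunks above). Because wikidata.py now uses the language-specific wikipedia_link as the infobox 'id', both engines can produce infoboxes with the same id, and the wikidata one then inherits the image:

    from searx.results import merge_two_infoboxes

    infobox1 = {'infobox': 'The Title',          # e.g. built by the wikidata engine
                'id': 'https://fr.wikipedia.org/wiki/The_Title',
                'urls': [{'title': 'Wikidata', 'url': 'https://www.wikidata.org/wiki/Q42'}]}
    infobox2 = {'infobox': 'The Title',          # built by the new wikipedia engine
                'id': 'https://fr.wikipedia.org/wiki/The_Title',
                'img_src': 'img_src.jpg',
                'urls': [{'title': 'Wikipedia', 'url': 'https://fr.wikipedia.org/wiki/The_Title'}]}

    merge_two_infoboxes(infobox1, infobox2)
    # infobox1 now lists both urls and, via the new img_src branch,
    # picks up 'img_src': 'img_src.jpg' because it had no image of its own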
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -43,10 +43,9 @@ engines:
     shortcut : bs
 
   - name : wikipedia
-    engine : mediawiki
+    engine : wikipedia
     shortcut : wp
     base_url : 'https://{language}.wikipedia.org/'
-    number_of_results : 1
 
   - name : bing
     engine : bing
@@ -93,6 +92,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    disabled : True
 
   - name : digg
     engine : digg
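For reference, the wikipedia engine entry as it reads after this hunk (reassembled from the context lines above):

      - name : wikipedia
        engine : wikipedia
        shortcut : wp
        base_url : 'https://{language}.wikipedia.org/'

A search prefixed with the !wp bang should therefore now go through the new dedicated engine instead of the generic mediawiki one.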
diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html
@@ -1,8 +1,9 @@
<div class="panel panel-default infobox">
<div class="panel-heading">
- <h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4>
+ <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi>
</div>
<div class="panel-body">
+ <bdi>
{% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %}
{% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %}
@@ -28,5 +29,6 @@
             {% endfor %}
         </div>
         {% endif %}
+        </bdi>
     </div>
 </div>
diff --git a/tests/unit/engines/test_wikipedia.py b/tests/unit/engines/test_wikipedia.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import wikipedia
+from searx.testing import SearxTestCase
+
+
+class TestWikipediaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr_FR'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('test_query', params['url'])
+        self.assertIn('Test_Query', params['url'])
+        self.assertIn('fr.wikipedia.org', params['url'])
+
+        query = 'Test_Query'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('Test_Query', params['url'])
+        self.assertNotIn('test_query', params['url'])
+
+        dicto['language'] = 'all'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('en', params['url'])
+
+    def test_response(self):
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr'
+
+        self.assertRaises(AttributeError, wikipedia.response, None)
+        self.assertRaises(AttributeError, wikipedia.response, [])
+        self.assertRaises(AttributeError, wikipedia.response, '')
+        self.assertRaises(AttributeError, wikipedia.response, '[]')
+
+        # page not found
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "-1": {
+                        "ns": 0,
+                        "title": "",
+                        "missing": ""
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        self.assertEqual(wikipedia.response(response), [])
+
+        # normal case
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is...",
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url'])
+        self.assertEqual(results[1]['infobox'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id'])
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], 'img_src.jpg')
+
+        # disambiguation page
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+ "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """ # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 0)
+
+        # no image
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+ "extract": "The Title is......................................................................................................................................................................................." """ # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], None)
+
+        # title not in first paragraph
+        json = u"""
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "披頭四樂隊",
+ "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """ # noqa
+        json += """
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[1]['infobox'], u'披頭四樂隊')
+        self.assertIn(u'披头士乐队...', results[1]['content'])