searx

My custom branch(es) on searx, a meta-search engine
git clone https://hacktivis.me/git/searx.git
commit: a0a1284998946bdc446283552674263240b4fd0f
parent a4c77f88d0ca52b9c236c95a16b2a527f811c0e6
Author: marc <a01200356@itesm.mx>
Date:   Mon,  6 Jun 2016 01:08:36 -0500

wikidata refactor and more attributes (see issue #560)
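
In short, the refactor drops the wbgetentities JSON API and instead fetches each item page through action=parse, then scrapes the rendered HTML with XPath; the additional infobox attributes and the social/media links come from that scraped markup. A rough, non-authoritative sketch of the per-entity request the patched engine builds (the real code is in searx/engines/wikidata.py in the diff below; the Python 2 original imports urlencode from urllib):

    from urllib.parse import urlencode  # urllib.urlencode in the Python 2 original

    wikidata_api = 'https://www.wikidata.org/w/api.php'
    url_detail = wikidata_api \
        + '?action=parse&format=json&{query}' \
        + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid' \
        + '&disableeditsection=1&disabletidy=1&preview=1' \
        + '&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'

    # one such request is made for each id found on Special:ItemDisambiguation;
    # 'Q42' is only a placeholder entity id for illustration
    url = url_detail.format(query=urlencode({'page': 'Q42', 'uselang': 'en'}))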

Diffstat:

M  .gitignore                             |   1 +
M  searx/engines/wikidata.py              | 602 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M  searx/templates/default/infobox.html   |  14 +++++++-------
M  searx/templates/oscar/infobox.html     |  12 ++++++------
A  tests/unit/engines/test_wikidata.py    | 502 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 883 insertions(+), 248 deletions(-)
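
One behavior in the new helpers is worth calling out before the diff: add_attribute() joins every distinct value of a property with ", ", unless trim=True (or date=True, which sets trim), in which case only rows carrying a preferred rank are kept and the first scraped row is held back as a fallback. The sketch below only illustrates that selection logic with made-up row tuples; the real function works on lxml elements via XPath:

    def pick_values(rows, trim=False):
        # rows are (value, is_preferred_rank) pairs standing in for the
        # wikibase-statementview elements the patched code walks with XPath
        values, first_value = [], None
        for value, preferred in rows:
            if first_value and trim and not preferred:
                continue                     # trim: ignore non-preferred rows after the first
            if trim and not first_value:
                first_value = value          # fallback if no preferred-rank row exists
            elif value not in values:        # skip duplicate values
                values.append(value)
        return ', '.join(values) if values else first_value

    # mirrors the P6 (head of government) cases in the new unit tests
    pick_values([('Old Prime Minister', False), ('Actual Prime Minister', True)])
    # -> 'Old Prime Minister, Actual Prime Minister'
    pick_values([('Old Prime Minister', False), ('Actual Prime Minister', True)], trim=True)
    # -> 'Actual Prime Minister'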

diff --git a/.gitignore b/.gitignore @@ -1,4 +1,5 @@ .coverage +coverage/ .installed.cfg engines.cfg env diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py @@ -1,33 +1,57 @@ -import json +# -*- coding: utf-8 -*- +""" + Wikidata + + @website https://wikidata.org + @provide-api yes (https://wikidata.org/w/api.php) + + @using-api partially (most things require scraping) + @results JSON, HTML + @stable no (html can change) + @parse url, infobox +""" from searx import logger from searx.poolrequests import get -from searx.utils import format_date_by_locale +from searx.engines.xpath import extract_text -from datetime import datetime -from dateutil.parser import parse as dateutil_parse -from urllib import urlencode +from json import loads from lxml.html import fromstring - +from urllib import urlencode logger = logger.getChild('wikidata') result_count = 1 + +# urls wikidata_host = 'https://www.wikidata.org' url_search = wikidata_host \ + '/wiki/Special:ItemDisambiguation?{query}' wikidata_api = wikidata_host + '/w/api.php' url_detail = wikidata_api\ - + '?action=wbgetentities&format=json'\ - + '&props=labels%7Cinfo%7Csitelinks'\ - + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\ - + '&{query}' + + '?action=parse&format=json&{query}'\ + + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\ + + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2' + url_map = 'https://www.openstreetmap.org/'\ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' -url_entity_label = wikidata_api\ - + '?action=wbgetentities&format=json&props=labels&{query}' +url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500' +# xpaths wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' +title_xpath = '//*[contains(@class,"wikibase-title-label")]' +description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' +property_xpath = '//div[@id="{propertyid}"]' +label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' +url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' +wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ + + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' +property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' +preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' +value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ + + '/*/div[contains(@class,"wikibase-snakview-value")]' +language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' +calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): @@ -50,13 +74,13 @@ def response(resp): if language == 'all': language = 'en' - url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids), - 'languages': language + '|en'})) - - htmlresponse = get(url) - jsonresponse = json.loads(htmlresponse.content) + # TODO: make requests asynchronous to avoid timeout when result_count > 1 for wikidata_id in wikidata_ids[:result_count]: - results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) + url = url_detail.format(query=urlencode({'page': wikidata_id, + 'uselang': language})) + htmlresponse = get(url) + jsonresponse = loads(htmlresponse.content) + results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) return 
results @@ -66,125 +90,194 @@ def getDetail(jsonresponse, wikidata_id, language, locale): urls = [] attributes = [] - result = jsonresponse.get('entities', {}).get(wikidata_id, {}) + title = jsonresponse.get('parse', {}).get('displaytitle', {}) + result = jsonresponse.get('parse', {}).get('text', {}) - title = result.get('labels', {}).get(language, {}).get('value', None) - if title is None: - title = result.get('labels', {}).get('en', {}).get('value', None) - if title is None: + if not title or not result: return results - description = result\ - .get('descriptions', {})\ - .get(language, {})\ - .get('value', None) + title = fromstring(title) + for elem in title.xpath(language_fallback_xpath): + elem.getparent().remove(elem) + title = extract_text(title.xpath(title_xpath)) - if description is None: - description = result\ - .get('descriptions', {})\ - .get('en', {})\ - .get('value', '') + result = fromstring(result) + for elem in result.xpath(language_fallback_xpath): + elem.getparent().remove(elem) - claims = result.get('claims', {}) - official_website = get_string(claims, 'P856', None) - if official_website is not None: - urls.append({'title': get_label('P856', language), 'url': official_website}) - results.append({'title': title, 'url': official_website}) + description = extract_text(result.xpath(description_xpath)) - wikipedia_link_count = 0 - wikipedia_link = get_wikilink(result, language + 'wiki') - wikipedia_link_count += add_url(urls, - 'Wikipedia (' + language + ')', - wikipedia_link) - if language != 'en': - wikipedia_en_link = get_wikilink(result, 'enwiki') - wikipedia_link_count += add_url(urls, - 'Wikipedia (en)', - wikipedia_en_link) - if wikipedia_link_count == 0: - misc_language = get_wiki_firstlanguage(result, 'wiki') - if misc_language is not None: - add_url(urls, - 'Wikipedia (' + misc_language + ')', - get_wikilink(result, misc_language + 'wiki')) + # URLS - if language != 'en': - add_url(urls, - 'Wiki voyage (' + language + ')', - get_wikilink(result, language + 'wikivoyage')) + # official website + add_url(urls, result, 'P856', results=results) - add_url(urls, - 'Wiki voyage (en)', - get_wikilink(result, 'enwikivoyage')) + # wikipedia + wikipedia_link_count = 0 + wikipedia_link = get_wikilink(result, language + 'wiki') + if wikipedia_link: + wikipedia_link_count += 1 + urls.append({'title': 'Wikipedia (' + language + ')', + 'url': wikipedia_link}) if language != 'en': - add_url(urls, - 'Wikiquote (' + language + ')', - get_wikilink(result, language + 'wikiquote')) - - add_url(urls, - 'Wikiquote (en)', - get_wikilink(result, 'enwikiquote')) - - add_url(urls, - 'Commons wiki', - get_wikilink(result, 'commonswiki')) - - # Location - add_url(urls, - get_label('P625', language), - get_geolink(claims, 'P625', None)) - - add_url(urls, - 'Wikidata', - 'https://www.wikidata.org/wiki/' - + wikidata_id + '?uselang=' + language) - - musicbrainz_work_id = get_string(claims, 'P435') - if musicbrainz_work_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/work/' - + musicbrainz_work_id) - - musicbrainz_artist_id = get_string(claims, 'P434') - if musicbrainz_artist_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/artist/' - + musicbrainz_artist_id) - - musicbrainz_release_group_id = get_string(claims, 'P436') - if musicbrainz_release_group_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/release-group/' - + musicbrainz_release_group_id) - - musicbrainz_label_id = get_string(claims, 'P966') - if 
musicbrainz_label_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/label/' - + musicbrainz_label_id) - - # musicbrainz_area_id = get_string(claims, 'P982') - # P1407 MusicBrainz series ID - # P1004 MusicBrainz place ID - # P1330 MusicBrainz instrument ID - # P1407 MusicBrainz series ID - - postal_code = get_string(claims, 'P281', None) - if postal_code is not None: - attributes.append({'label': get_label('P281', language), 'value': postal_code}) - - date_of_birth = get_time(claims, 'P569', locale, None) - if date_of_birth is not None: - attributes.append({'label': get_label('P569', language), 'value': date_of_birth}) - - date_of_death = get_time(claims, 'P570', locale, None) - if date_of_death is not None: - attributes.append({'label': get_label('P570', language), 'value': date_of_death}) + wikipedia_en_link = get_wikilink(result, 'enwiki') + if wikipedia_en_link: + wikipedia_link_count += 1 + urls.append({'title': 'Wikipedia (en)', + 'url': wikipedia_en_link}) + + # TODO: get_wiki_firstlanguage + # if wikipedia_link_count == 0: + + # more wikis + add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') + add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') + add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki') + + add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo') + + # musicbrainz + add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') + add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') + add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') + add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') + + # IMDb + add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') + # source code repository + add_url(urls, result, 'P1324') + # blog + add_url(urls, result, 'P1581') + # social media links + add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') + add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') + add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/') + add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/') + add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/') + + urls.append({'title': 'Wikidata', + 'url': 'https://www.wikidata.org/wiki/' + + wikidata_id + '?uselang=' + language}) + + # INFOBOX ATTRIBUTES (ROWS) + + # inception date + add_attribute(attributes, result, 'P571', date=True) + # dissolution date + add_attribute(attributes, result, 'P576', date=True) + # start date + add_attribute(attributes, result, 'P580', date=True) + # end date + add_attribute(attributes, result, 'P582', date=True) + + # date of birth + add_attribute(attributes, result, 'P569', date=True) + # date of death + add_attribute(attributes, result, 'P570', date=True) + + # nationality + add_attribute(attributes, result, 'P27') + # country of origin + add_attribute(attributes, result, 'P495') + # country + add_attribute(attributes, result, 'P17') + # headquarters + add_attribute(attributes, result, 'Q180') + + # PLACES + # capital + add_attribute(attributes, result, 'P36', trim=True) + # head of state + add_attribute(attributes, result, 'P35', trim=True) + # head of government + add_attribute(attributes, result, 'P6', trim=True) + # type of government + add_attribute(attributes, result, 'P122') + # official language + 
add_attribute(attributes, result, 'P37') + # population + add_attribute(attributes, result, 'P1082', trim=True) + # area + add_attribute(attributes, result, 'P2046') + # currency + add_attribute(attributes, result, 'P38') + # heigth (building) + add_attribute(attributes, result, 'P2048') + + # MEDIA + # platform (videogames) + add_attribute(attributes, result, 'P400') + # author + add_attribute(attributes, result, 'P50') + # creator + add_attribute(attributes, result, 'P170') + # director + add_attribute(attributes, result, 'P57') + # performer + add_attribute(attributes, result, 'P175') + # developer + add_attribute(attributes, result, 'P178') + # producer + add_attribute(attributes, result, 'P162') + # manufacturer + add_attribute(attributes, result, 'P176') + # screenwriter + add_attribute(attributes, result, 'P58') + # production company + add_attribute(attributes, result, 'P272') + # record label + add_attribute(attributes, result, 'P264') + # publisher + add_attribute(attributes, result, 'P123') + # composer + add_attribute(attributes, result, 'P86') + # publication date + add_attribute(attributes, result, 'P577', date=True) + # genre + add_attribute(attributes, result, 'P136') + # original language + add_attribute(attributes, result, 'P364') + # isbn + add_attribute(attributes, result, 'Q33057') + # software license + add_attribute(attributes, result, 'P275') + # programming language + add_attribute(attributes, result, 'P277') + # version + add_attribute(attributes, result, 'P348', trim=True) + # narrative location + add_attribute(attributes, result, 'P840') + + # LANGUAGES + # number of speakers + add_attribute(attributes, result, 'P1098') + # writing system + add_attribute(attributes, result, 'P282') + # regulatory body + add_attribute(attributes, result, 'P1018') + # language code + add_attribute(attributes, result, 'P218') + + # OTHER + # ceo + add_attribute(attributes, result, 'P169', trim=True) + # founder + add_attribute(attributes, result, 'P112') + # legal form (company/organization) + add_attribute(attributes, result, 'P1454') + # taxon + add_attribute(attributes, result, 'P225') + # chemical formula + add_attribute(attributes, result, 'P274') + # winner (sports/contests) + add_attribute(attributes, result, 'P1346') + # number of deaths + add_attribute(attributes, result, 'P1120') + # currency code + add_attribute(attributes, result, 'P498') + + image = add_image(result) if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: results.append({ @@ -197,6 +290,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): 'infobox': title, 'id': wikipedia_link, 'content': description, + 'img_src': image, 'attributes': attributes, 'urls': urls }) @@ -204,92 +298,149 @@ def getDetail(jsonresponse, wikidata_id, language, locale): return results -def add_url(urls, title, url): - if url is not None: - urls.append({'title': title, 'url': url}) - return 1 +# only returns first match +def add_image(result): + # P18: image, P154: logo, P242: map, P41: flag, P2716: collage, P2910: icon + property_ids = ['P18', 'P154', 'P242', 'P41', 'P2716', 'P2910'] + + for property_id in property_ids: + image = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if image: + image_name = image[0].xpath(value_xpath) + image_src = url_image.replace('{filename}', extract_text(image_name[0])) + return image_src + + +# setting trim will only returned high ranked rows OR the first row +def add_attribute(attributes, result, property_id, default_label=None, date=False, 
trim=False): + attribute = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if attribute: + + if default_label: + label = default_label + else: + label = extract_text(attribute[0].xpath(label_xpath)) + + if date: + trim = True + # remove calendar name + calendar_name = attribute[0].xpath(calendar_name_xpath) + for calendar in calendar_name: + calendar.getparent().remove(calendar) + + concat_values = "" + values = [] + first_value = None + for row in attribute[0].xpath(property_row_xpath): + if not first_value or not trim or row.xpath(preferred_rank_xpath): + + value = row.xpath(value_xpath) + if not value: + continue + value = extract_text(value) + + # save first value in case no ranked row is found + if trim and not first_value: + first_value = value + else: + # to avoid duplicate values + if value not in values: + concat_values += value + ", " + values.append(value) + + if trim and not values: + attributes.append({'label': label, + 'value': first_value}) + else: + attributes.append({'label': label, + 'value': concat_values[:-2]}) + + +# requires property_id unless it's a wiki link (defined in link_type) +def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None): + links = [] + + # wiki links don't have property in wikidata page + if link_type and 'wiki' in link_type: + links.append(get_wikilink(result, link_type)) else: - return 0 + dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if dom_element: + dom_element = dom_element[0] + if not default_label: + label = extract_text(dom_element.xpath(label_xpath)) + + if link_type == 'geo': + links.append(get_geolink(dom_element)) + + elif link_type == 'imdb': + links.append(get_imdblink(dom_element, url_prefix)) + + else: + url_results = dom_element.xpath(url_xpath) + for link in url_results: + if link is not None: + if url_prefix: + link = url_prefix + extract_text(link) + else: + link = extract_text(link) + links.append(link) + + # append urls + for url in links: + if url is not None: + urls.append({'title': default_label or label, + 'url': url}) + if results is not None: + results.append({'title': default_label or label, + 'url': url}) + + +def get_imdblink(result, url_prefix): + imdb_id = result.xpath(value_xpath) + if imdb_id: + imdb_id = extract_text(imdb_id) + id_prefix = imdb_id[:2] + if id_prefix == 'tt': + url = url_prefix + 'title/' + imdb_id + elif id_prefix == 'nm': + url = url_prefix + 'name/' + imdb_id + elif id_prefix == 'ch': + url = url_prefix + 'character/' + imdb_id + elif id_prefix == 'co': + url = url_prefix + 'company/' + imdb_id + elif id_prefix == 'ev': + url = url_prefix + 'event/' + imdb_id + else: + url = None + return url -def get_mainsnak(claims, propertyName): - propValue = claims.get(propertyName, {}) - if len(propValue) == 0: +def get_geolink(result): + coordinates = result.xpath(value_xpath) + if not coordinates: return None - - propValue = propValue[0].get('mainsnak', None) - return propValue - - -def get_string(claims, propertyName, defaultValue=None): - propValue = claims.get(propertyName, {}) - if len(propValue) == 0: - return defaultValue - - result = [] - for e in propValue: - mainsnak = e.get('mainsnak', {}) - - datavalue = mainsnak.get('datavalue', {}) - if datavalue is not None: - result.append(datavalue.get('value', '')) - - if len(result) == 0: - return defaultValue - else: - # TODO handle multiple urls - return result[0] - - -def get_time(claims, propertyName, locale, defaultValue=None): - 
propValue = claims.get(propertyName, {}) - if len(propValue) == 0: - return defaultValue - - result = [] - for e in propValue: - mainsnak = e.get('mainsnak', {}) - - datavalue = mainsnak.get('datavalue', {}) - if datavalue is not None: - value = datavalue.get('value', '') - result.append(value.get('time', '')) - - if len(result) == 0: - date_string = defaultValue - else: - date_string = ', '.join(result) - - try: - parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ") - except: - if date_string.startswith('-'): - return date_string.split('T')[0] - try: - parsed_date = dateutil_parse(date_string, fuzzy=False, default=False) - except: - logger.debug('could not parse date %s', date_string) - return date_string.split('T')[0] - - return format_date_by_locale(parsed_date, locale) - - -def get_geolink(claims, propertyName, defaultValue=''): - mainsnak = get_mainsnak(claims, propertyName) - - if mainsnak is None: - return defaultValue - - datatype = mainsnak.get('datatype', '') - datavalue = mainsnak.get('datavalue', {}) - - if datatype != 'globe-coordinate': - return defaultValue - - value = datavalue.get('value', {}) - - precision = value.get('precision', 0.0002) - + coordinates = extract_text(coordinates[0]) + latitude, longitude = coordinates.split(',') + + # convert to decimal + lat = int(latitude[:latitude.find(u'°')]) + if latitude.find('\'') >= 0: + lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0 + if latitude.find('"') >= 0: + lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 + if latitude.find('S') >= 0: + lat *= -1 + lon = int(longitude[:longitude.find(u'°')]) + if longitude.find('\'') >= 0: + lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0 + if longitude.find('"') >= 0: + lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 + if longitude.find('W') >= 0: + lon *= -1 + + # TODO: get precision + precision = 0.0002 # there is no zoom information, deduce from precision (error prone) # samples : # 13 --> 5 @@ -305,39 +456,20 @@ def get_geolink(claims, propertyName, defaultValue=''): zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447) url = url_map\ - .replace('{latitude}', str(value.get('latitude', 0)))\ - .replace('{longitude}', str(value.get('longitude', 0)))\ + .replace('{latitude}', str(lat))\ + .replace('{longitude}', str(lon))\ .replace('{zoom}', str(zoom)) return url def get_wikilink(result, wikiid): - url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None) - if url is None: - return url - elif url.startswith('http://'): + url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid)) + if not url: + return None + url = url[0] + if url.startswith('http://'): url = url.replace('http://', 'https://') elif url.startswith('//'): url = 'https:' + url return url - - -def get_wiki_firstlanguage(result, wikipatternid): - for k in result.get('sitelinks', {}).keys(): - if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)): - return k[0:2] - return None - - -def get_label(entity_id, language): - url = url_entity_label.format(query=urlencode({'ids': entity_id, - 'languages': language + '|en'})) - - response = get(url) - jsonresponse = json.loads(response.text) - label = jsonresponse.get('entities', {}).get(entity_id, {}).get('labels', {}).get(language, {}).get('value', None) - if label is None: - label = jsonresponse['entities'][entity_id]['labels']['en']['value'] - - return label diff --git 
a/searx/templates/default/infobox.html b/searx/templates/default/infobox.html @@ -1,18 +1,18 @@ <div class="infobox"> - <h2>{{ infobox.infobox }}</h2> +<h2><bdi>{{ infobox.infobox }}</bdi></h2> {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %} - <p>{{ infobox.entity }}</p> - <p>{{ infobox.content | safe }}</p> + <p><bdi>{{ infobox.entity }}</bdi></p> + <p><bdi>{{ infobox.content | safe }}</bdi></p> {% if infobox.attributes %} <div class="attributes"> <table> {% for attribute in infobox.attributes %} <tr> - <td>{{ attribute.label }}</td> + <td><bdi>{{ attribute.label }}</bdi></td> {% if attribute.image %} <td><img src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> {% else %} - <td>{{ attribute.value }}</td> + <td><bdi>{{ attribute.value }}</bdi></td> {% endif %} </tr> {% endfor %} @@ -24,7 +24,7 @@ <div class="urls"> <ul> {% for url in infobox.urls %} - <li class="url"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></li> + <li class="url"><bdi><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></bdi></li> {% endfor %} </ul> </div> @@ -34,7 +34,7 @@ <div class="relatedTopics"> {% for topic in infobox.relatedTopics %} <div> - <h3>{{ topic.name }}</h3> + <h3><bdi>{{ topic.name }}</bdi></h3> {% for suggestion in topic.suggestions %} <form method="{{ method or 'POST' }}" action="{{ url_for('index') }}"> <input type="hidden" name="q" value="{{ suggestion }}"> diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html @@ -1,21 +1,20 @@ <div class="panel panel-default infobox"> <div class="panel-heading"> - <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi> + <h4 class="panel-title infobox_part"><bdi>{{ infobox.infobox }}</bdi></h4> </div> <div class="panel-body"> - <bdi> {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %} - {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %} + {% if infobox.content %}<bdi><p class="infobox_part">{{ infobox.content }}</bdi></p>{% endif %} {% if infobox.attributes %} <table class="table table-striped infobox_part"> {% for attribute in infobox.attributes %} <tr> - <td>{{ attribute.label }}</td> + <td><bdi>{{ attribute.label }}</bdi></td> {% if attribute.image %} <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> {% else %} - <td>{{ attribute.value }}</td> + <td><bdi>{{ attribute.value }}</bdi></td> {% endif %} </tr> {% endfor %} @@ -24,11 +23,12 @@ {% if infobox.urls %} <div class="infobox_part"> + <bdi> {% for url in infobox.urls %} <p class="btn btn-default btn-xs"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></p> {% endfor %} + </bdi> </div> {% endif %} - </bdi> </div> </div> diff --git a/tests/unit/engines/test_wikidata.py b/tests/unit/engines/test_wikidata.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +from json import loads +from lxml.html import fromstring +from collections import defaultdict +import mock +from searx.engines import wikidata +from searx.testing import SearxTestCase + + +class TestWikidataEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['language'] = 'all' + params = wikidata.request(query, dicto) + self.assertIn('url', params) + 
self.assertIn(query, params['url']) + self.assertIn('wikidata.org', params['url']) + self.assertIn('en', params['url']) + + dicto['language'] = 'es_ES' + params = wikidata.request(query, dicto) + self.assertIn(query, params['url']) + self.assertIn('es', params['url']) + + # successful cases are not tested here to avoid sending additional requests + def test_response(self): + self.assertRaises(AttributeError, wikidata.response, None) + self.assertRaises(AttributeError, wikidata.response, []) + self.assertRaises(AttributeError, wikidata.response, '') + self.assertRaises(AttributeError, wikidata.response, '[]') + + response = mock.Mock(content='<html></html>', search_params={"language": "all"}) + self.assertEqual(wikidata.response(response), []) + + def test_getDetail(self): + response = {} + results = wikidata.getDetail(response, "Q123", "en", "en-US") + self.assertEqual(results, []) + + title_html = '<div><div class="wikibase-title-label">Test</div></div>' + html = """ + <div> + <div class="wikibase-entitytermsview-heading-description"> + </div> + <div> + <ul class="wikibase-sitelinklistview-listview"> + <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> + </ul> + </div> + </div> + """ + response = {"parse": {"displaytitle": title_html, "text": html}} + + results = wikidata.getDetail(response, "Q123", "en", "en-US") + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test') + + title_html = """ + <div> + <div class="wikibase-title-label"> + <span lang="en">Test</span> + <sup class="wb-language-fallback-indicator">English</sup> + </div> + </div> + """ + html = """ + <div> + <div class="wikibase-entitytermsview-heading-description"> + <span lang="en">Description</span> + <sup class="wb-language-fallback-indicator">English</sup> + </div> + <div id="P856"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P856"> + <span lang="en">official website</span> + <sup class="wb-language-fallback-indicator">English</sup> + </a> + </div> + <div class="wikibase-statementview-mainsnak"> + <a class="external free" href="https://officialsite.com"> + https://officialsite.com + </a> + </div> + </div> + <div> + <ul class="wikibase-sitelinklistview-listview"> + <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> + </ul> + </div> + </div> + """ + response = {"parse": {"displaytitle": title_html, "text": html}} + + results = wikidata.getDetail(response, "Q123", "yua", "yua_MX") + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], 'official website') + self.assertEqual(results[0]['url'], 'https://officialsite.com') + + self.assertEqual(results[1]['infobox'], 'Test') + self.assertEqual(results[1]['id'], None) + self.assertEqual(results[1]['content'], 'Description') + self.assertEqual(results[1]['attributes'], []) + self.assertEqual(results[1]['urls'][0]['title'], 'official website') + self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com') + self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)') + self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test') + + def test_add_image(self): + image_src = wikidata.add_image(fromstring("<div></div>")) + self.assertEqual(image_src, None) + + html = u""" + <div> + <div id="P18"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P18"> + image + </a> + </div> + <div class="wikibase-statementlistview"> + <div 
class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="https://commons.wikimedia.org/wiki/File:image.png"> + image.png + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + + image_src = wikidata.add_image(html_etree) + self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500") + + html = u""" + <div> + <div id="P2910"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P2910"> + icon + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="https://commons.wikimedia.org/wiki/File:icon.png"> + icon.png + </a> + </div> + </div> + </div> + </div> + </div> + </div> + <div id="P154"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P154"> + logo + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="https://commons.wikimedia.org/wiki/File:logo.png"> + logo.png + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + + image_src = wikidata.add_image(html_etree) + self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500") + + def test_add_attribute(self): + html = u""" + <div> + <div id="P27"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P27"> + country of citizenship + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="/wiki/Q145"> + United Kingdom + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + attributes = [] + html_etree = fromstring(html) + + wikidata.add_attribute(attributes, html_etree, "Fail") + self.assertEqual(attributes, []) + + wikidata.add_attribute(attributes, html_etree, "P27") + self.assertEqual(len(attributes), 1) + self.assertEqual(attributes[0]["label"], "country of citizenship") + self.assertEqual(attributes[0]["value"], "United Kingdom") + + html = u""" + <div> + <div id="P569"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P569"> + date of birth + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + 27 January 1832 + <sup class="wb-calendar-name"> + Gregorian + </sup> + </div> + </div> + </div> + </div> + </div> + 
</div> + </div> + """ + attributes = [] + html_etree = fromstring(html) + wikidata.add_attribute(attributes, html_etree, "P569", date=True) + self.assertEqual(len(attributes), 1) + self.assertEqual(attributes[0]["label"], "date of birth") + self.assertEqual(attributes[0]["value"], "27 January 1832") + + html = u""" + <div> + <div id="P6"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P27"> + head of government + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-normal"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="/wiki/Q206"> + Old Prime Minister + </a> + </div> + </div> + </div> + </div> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-rankselector"> + <span class="wikibase-rankselector-preferred"></span> + </div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a href="/wiki/Q3099714"> + Actual Prime Minister + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + attributes = [] + html_etree = fromstring(html) + wikidata.add_attribute(attributes, html_etree, "P6") + self.assertEqual(len(attributes), 1) + self.assertEqual(attributes[0]["label"], "head of government") + self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister") + + attributes = [] + html_etree = fromstring(html) + wikidata.add_attribute(attributes, html_etree, "P6", trim=True) + self.assertEqual(len(attributes), 1) + self.assertEqual(attributes[0]["value"], "Actual Prime Minister") + + def test_add_url(self): + html = u""" + <div> + <div id="P856"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P856"> + official website + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a class="external free" href="https://searx.me"> + https://searx.me/ + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + urls = [] + html_etree = fromstring(html) + wikidata.add_url(urls, html_etree, 'P856') + self.assertEquals(len(urls), 1) + self.assertIn({'title': 'official website', 'url': 'https://searx.me/'}, urls) + urls = [] + results = [] + wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results) + self.assertEquals(len(urls), 1) + self.assertEquals(len(results), 1) + self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls) + self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results) + + html = u""" + <div> + <div id="P856"> + <div class="wikibase-statementgroupview-property-label"> + <a href="/wiki/Property:P856"> + official website + </a> + </div> + <div class="wikibase-statementlistview"> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a class="external free" href="http://www.worldofwarcraft.com"> + http://www.worldofwarcraft.com + </a> + </div> + </div> + </div> + </div> + <div class="wikibase-statementview listview-item"> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a class="external free" 
href="http://eu.battle.net/wow/en/"> + http://eu.battle.net/wow/en/ + </a> + </div> + </div> + </div> + </div> + </div> + </div> + </div> + """ + urls = [] + html_etree = fromstring(html) + wikidata.add_url(urls, html_etree, 'P856') + self.assertEquals(len(urls), 2) + self.assertIn({'title': 'official website', 'url': 'http://www.worldofwarcraft.com'}, urls) + self.assertIn({'title': 'official website', 'url': 'http://eu.battle.net/wow/en/'}, urls) + + def test_get_imdblink(self): + html = u""" + <div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a class="wb-external-id" href="http://www.imdb.com/tt0433664"> + tt0433664 + </a> + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/') + + html = u""" + <div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + <a class="wb-external-id" + href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994""> + nm4915994 + </a> + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/') + self.assertIn('https://www.imdb.com/name/nm4915994', imdblink) + + def test_get_geolink(self): + html = u""" + <div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + 60°N, 40°E + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + geolink = wikidata.get_geolink(html_etree) + self.assertIn('https://www.openstreetmap.org/', geolink) + self.assertIn('lat=60&lon=40', geolink) + + html = u""" + <div> + <div class="wikibase-statementview-mainsnak"> + <div> + <div class="wikibase-snakview-value"> + 34°35'59"S, 58°22'55"W + </div> + </div> + </div> + </div> + """ + html_etree = fromstring(html) + geolink = wikidata.get_geolink(html_etree) + self.assertIn('https://www.openstreetmap.org/', geolink) + self.assertIn('lat=-34.59', geolink) + self.assertIn('lon=-58.38', geolink) + + def test_get_wikilink(self): + html = """ + <div> + <div> + <ul class="wikibase-sitelinklistview-listview"> + <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li> + <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> + </ul> + </div> + <div> + <ul class="wikibase-sitelinklistview-listview"> + <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li> + </ul> + </div> + </div> + """ + html_etree = fromstring(html) + wikilink = wikidata.get_wikilink(html_etree, 'nowiki') + self.assertEqual(wikilink, None) + wikilink = wikidata.get_wikilink(html_etree, 'enwiki') + self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test') + wikilink = wikidata.get_wikilink(html_etree, 'arwiki') + self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test') + wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote') + self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test')