logo

searx

Unnamed repository; edit this file 'description' to name the repository.
commit: f1262ffa9e20a951eb001192d151beca39bc62fe
parent: 49403dbbda96d3c943a7c8d71979effebf53e447
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Tue,  9 Aug 2016 10:14:32 +0200

Merge pull request #588 from a01200356/wikidata

[enh] More data from Wikidata

Diffstat:

.gitignore | 1+
searx/engines/wikidata.py | 624++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
searx/engines/wikipedia.py | 3+--
searx/results.py | 61+++++++++++++++++++++++++++++++++++++++++--------------------
searx/settings.yml | 2++
searx/templates/default/infobox.html | 14+++++++-------
searx/templates/oscar/infobox.html | 12++++++------
searx/utils.py | 8+++++++-
tests/unit/engines/test_wikidata.py | 504+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
9 files changed, 964 insertions(+), 265 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,4 +1,5 @@ .coverage +coverage/ .installed.cfg engines.cfg env diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py @@ -1,56 +1,86 @@ -import json +# -*- coding: utf-8 -*- +""" + Wikidata + + @website https://wikidata.org + @provide-api yes (https://wikidata.org/w/api.php) + + @using-api partially (most things require scraping) + @results JSON, HTML + @stable no (html can change) + @parse url, infobox +""" from searx import logger from searx.poolrequests import get -from searx.utils import format_date_by_locale +from searx.engines.xpath import extract_text -from datetime import datetime -from dateutil.parser import parse as dateutil_parse +from json import loads +from lxml.html import fromstring from urllib import urlencode - logger = logger.getChild('wikidata') result_count = 1 + +# urls wikidata_host = 'https://www.wikidata.org' +url_search = wikidata_host \ + + '/wiki/Special:ItemDisambiguation?{query}' + wikidata_api = wikidata_host + '/w/api.php' -url_search = wikidata_api \ - + '?action=query&list=search&format=json'\ - + '&srnamespace=0&srprop=sectiontitle&{query}' url_detail = wikidata_api\ - + '?action=wbgetentities&format=json'\ - + '&props=labels%7Cinfo%7Csitelinks'\ - + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\ - + '&{query}' + + '?action=parse&format=json&{query}'\ + + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\ + + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2' + url_map = 'https://www.openstreetmap.org/'\ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' +url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' + +# xpaths +wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' +title_xpath = '//*[contains(@class,"wikibase-title-label")]' +description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' +property_xpath = 
'//div[@id="{propertyid}"]' +label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' +url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' +wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ + + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' +property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' +preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' +value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ + + '/*/div[contains(@class,"wikibase-snakview-value")]' +language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' +calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): + language = params['language'].split('_')[0] + if language == 'all': + language = 'en' + params['url'] = url_search.format( - query=urlencode({'srsearch': query, - 'srlimit': result_count})) + query=urlencode({'label': query, + 'language': language})) return params def response(resp): results = [] - search_res = json.loads(resp.text) - - wikidata_ids = set() - for r in search_res.get('query', {}).get('search', {}): - wikidata_ids.add(r.get('title', '')) + html = fromstring(resp.content) + wikidata_ids = html.xpath(wikidata_ids_xpath) language = resp.search_params['language'].split('_')[0] if language == 'all': language = 'en' - url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids), - 'languages': language + '|en'})) - - htmlresponse = get(url) - jsonresponse = json.loads(htmlresponse.content) - for wikidata_id in wikidata_ids: - results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) + # TODO: make requests asynchronous to avoid timeout when result_count > 1 + for wikidata_id in wikidata_ids[:result_count]: + url = url_detail.format(query=urlencode({'page': wikidata_id, + 'uselang': language})) + 
htmlresponse = get(url) + jsonresponse = loads(htmlresponse.content) + results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) return results @@ -60,124 +90,206 @@ def getDetail(jsonresponse, wikidata_id, language, locale): urls = [] attributes = [] - result = jsonresponse.get('entities', {}).get(wikidata_id, {}) + title = jsonresponse.get('parse', {}).get('displaytitle', {}) + result = jsonresponse.get('parse', {}).get('text', {}) - title = result.get('labels', {}).get(language, {}).get('value', None) - if title is None: - title = result.get('labels', {}).get('en', {}).get('value', None) - if title is None: + if not title or not result: return results - description = result\ - .get('descriptions', {})\ - .get(language, {})\ - .get('value', None) + title = fromstring(title) + for elem in title.xpath(language_fallback_xpath): + elem.getparent().remove(elem) + title = extract_text(title.xpath(title_xpath)) - if description is None: - description = result\ - .get('descriptions', {})\ - .get('en', {})\ - .get('value', '') + result = fromstring(result) + for elem in result.xpath(language_fallback_xpath): + elem.getparent().remove(elem) - claims = result.get('claims', {}) - official_website = get_string(claims, 'P856', None) - if official_website is not None: - urls.append({'title': 'Official site', 'url': official_website}) - results.append({'title': title, 'url': official_website}) + description = extract_text(result.xpath(description_xpath)) - wikipedia_link_count = 0 - wikipedia_link = get_wikilink(result, language + 'wiki') - wikipedia_link_count += add_url(urls, - 'Wikipedia (' + language + ')', - wikipedia_link) - if language != 'en': - wikipedia_en_link = get_wikilink(result, 'enwiki') - wikipedia_link_count += add_url(urls, - 'Wikipedia (en)', - wikipedia_en_link) - if wikipedia_link_count == 0: - misc_language = get_wiki_firstlanguage(result, 'wiki') - if misc_language is not None: - add_url(urls, - 'Wikipedia (' + 
misc_language + ')', - get_wikilink(result, misc_language + 'wiki')) + # URLS - if language != 'en': - add_url(urls, - 'Wiki voyage (' + language + ')', - get_wikilink(result, language + 'wikivoyage')) + # official website + add_url(urls, result, 'P856', results=results) - add_url(urls, - 'Wiki voyage (en)', - get_wikilink(result, 'enwikivoyage')) + # wikipedia + wikipedia_link_count = 0 + wikipedia_link = get_wikilink(result, language + 'wiki') + if wikipedia_link: + wikipedia_link_count += 1 + urls.append({'title': 'Wikipedia (' + language + ')', + 'url': wikipedia_link}) if language != 'en': - add_url(urls, - 'Wikiquote (' + language + ')', - get_wikilink(result, language + 'wikiquote')) - - add_url(urls, - 'Wikiquote (en)', - get_wikilink(result, 'enwikiquote')) - - add_url(urls, - 'Commons wiki', - get_wikilink(result, 'commonswiki')) - - add_url(urls, - 'Location', - get_geolink(claims, 'P625', None)) - - add_url(urls, - 'Wikidata', - 'https://www.wikidata.org/wiki/' - + wikidata_id + '?uselang=' + language) - - musicbrainz_work_id = get_string(claims, 'P435') - if musicbrainz_work_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/work/' - + musicbrainz_work_id) - - musicbrainz_artist_id = get_string(claims, 'P434') - if musicbrainz_artist_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/artist/' - + musicbrainz_artist_id) - - musicbrainz_release_group_id = get_string(claims, 'P436') - if musicbrainz_release_group_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/release-group/' - + musicbrainz_release_group_id) - - musicbrainz_label_id = get_string(claims, 'P966') - if musicbrainz_label_id is not None: - add_url(urls, - 'MusicBrainz', - 'http://musicbrainz.org/label/' - + musicbrainz_label_id) - - # musicbrainz_area_id = get_string(claims, 'P982') - # P1407 MusicBrainz series ID - # P1004 MusicBrainz place ID - # P1330 MusicBrainz instrument ID - # P1407 MusicBrainz series ID - - 
postal_code = get_string(claims, 'P281', None) - if postal_code is not None: - attributes.append({'label': 'Postal code(s)', 'value': postal_code}) - - date_of_birth = get_time(claims, 'P569', locale, None) - if date_of_birth is not None: - attributes.append({'label': 'Date of birth', 'value': date_of_birth}) - - date_of_death = get_time(claims, 'P570', locale, None) - if date_of_death is not None: - attributes.append({'label': 'Date of death', 'value': date_of_death}) + wikipedia_en_link = get_wikilink(result, 'enwiki') + if wikipedia_en_link: + wikipedia_link_count += 1 + urls.append({'title': 'Wikipedia (en)', + 'url': wikipedia_en_link}) + + # TODO: get_wiki_firstlanguage + # if wikipedia_link_count == 0: + + # more wikis + add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') + add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') + add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki') + + add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo') + + # musicbrainz + add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') + add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') + add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') + add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') + + # IMDb + add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') + # source code repository + add_url(urls, result, 'P1324') + # blog + add_url(urls, result, 'P1581') + # social media links + add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') + add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') + add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/') + add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/') + add_url(urls, result, 'P2003', 
'Instagram', 'https://instagram.com/') + + urls.append({'title': 'Wikidata', + 'url': 'https://www.wikidata.org/wiki/' + + wikidata_id + '?uselang=' + language}) + + # INFOBOX ATTRIBUTES (ROWS) + + # DATES + # inception date + add_attribute(attributes, result, 'P571', date=True) + # dissolution date + add_attribute(attributes, result, 'P576', date=True) + # start date + add_attribute(attributes, result, 'P580', date=True) + # end date + add_attribute(attributes, result, 'P582', date=True) + # date of birth + add_attribute(attributes, result, 'P569', date=True) + # date of death + add_attribute(attributes, result, 'P570', date=True) + # date of spacecraft launch + add_attribute(attributes, result, 'P619', date=True) + # date of spacecraft landing + add_attribute(attributes, result, 'P620', date=True) + + # nationality + add_attribute(attributes, result, 'P27') + # country of origin + add_attribute(attributes, result, 'P495') + # country + add_attribute(attributes, result, 'P17') + # headquarters + add_attribute(attributes, result, 'Q180') + + # PLACES + # capital + add_attribute(attributes, result, 'P36', trim=True) + # head of state + add_attribute(attributes, result, 'P35', trim=True) + # head of government + add_attribute(attributes, result, 'P6', trim=True) + # type of government + add_attribute(attributes, result, 'P122') + # official language + add_attribute(attributes, result, 'P37') + # population + add_attribute(attributes, result, 'P1082', trim=True) + # area + add_attribute(attributes, result, 'P2046') + # currency + add_attribute(attributes, result, 'P38', trim=True) + # heigth (building) + add_attribute(attributes, result, 'P2048') + + # MEDIA + # platform (videogames) + add_attribute(attributes, result, 'P400') + # author + add_attribute(attributes, result, 'P50') + # creator + add_attribute(attributes, result, 'P170') + # director + add_attribute(attributes, result, 'P57') + # performer + add_attribute(attributes, result, 'P175') + # developer + 
add_attribute(attributes, result, 'P178') + # producer + add_attribute(attributes, result, 'P162') + # manufacturer + add_attribute(attributes, result, 'P176') + # screenwriter + add_attribute(attributes, result, 'P58') + # production company + add_attribute(attributes, result, 'P272') + # record label + add_attribute(attributes, result, 'P264') + # publisher + add_attribute(attributes, result, 'P123') + # original network + add_attribute(attributes, result, 'P449') + # distributor + add_attribute(attributes, result, 'P750') + # composer + add_attribute(attributes, result, 'P86') + # publication date + add_attribute(attributes, result, 'P577', date=True) + # genre + add_attribute(attributes, result, 'P136') + # original language + add_attribute(attributes, result, 'P364') + # isbn + add_attribute(attributes, result, 'Q33057') + # software license + add_attribute(attributes, result, 'P275') + # programming language + add_attribute(attributes, result, 'P277') + # version + add_attribute(attributes, result, 'P348', trim=True) + # narrative location + add_attribute(attributes, result, 'P840') + + # LANGUAGES + # number of speakers + add_attribute(attributes, result, 'P1098') + # writing system + add_attribute(attributes, result, 'P282') + # regulatory body + add_attribute(attributes, result, 'P1018') + # language code + add_attribute(attributes, result, 'P218') + + # OTHER + # ceo + add_attribute(attributes, result, 'P169', trim=True) + # founder + add_attribute(attributes, result, 'P112') + # legal form (company/organization) + add_attribute(attributes, result, 'P1454') + # operator + add_attribute(attributes, result, 'P137') + # crew members (tripulation) + add_attribute(attributes, result, 'P1029') + # taxon + add_attribute(attributes, result, 'P225') + # chemical formula + add_attribute(attributes, result, 'P274') + # winner (sports/contests) + add_attribute(attributes, result, 'P1346') + # number of deaths + add_attribute(attributes, result, 'P1120') + # currency 
code + add_attribute(attributes, result, 'P498') + + image = add_image(result) if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: results.append({ @@ -190,6 +302,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): 'infobox': title, 'id': wikipedia_link, 'content': description, + 'img_src': image, 'attributes': attributes, 'urls': urls }) @@ -197,92 +310,151 @@ def getDetail(jsonresponse, wikidata_id, language, locale): return results -def add_url(urls, title, url): - if url is not None: - urls.append({'title': title, 'url': url}) - return 1 +# only returns first match +def add_image(result): + # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon + property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910'] + + for property_id in property_ids: + image = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if image: + image_name = image[0].xpath(value_xpath) + image_src = url_image.replace('{filename}', extract_text(image_name[0])) + return image_src + + +# setting trim will only returned high ranked rows OR the first row +def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False): + attribute = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if attribute: + + if default_label: + label = default_label + else: + label = extract_text(attribute[0].xpath(label_xpath)) + label = label[0].upper() + label[1:] + + if date: + trim = True + # remove calendar name + calendar_name = attribute[0].xpath(calendar_name_xpath) + for calendar in calendar_name: + calendar.getparent().remove(calendar) + + concat_values = "" + values = [] + first_value = None + for row in attribute[0].xpath(property_row_xpath): + if not first_value or not trim or row.xpath(preferred_rank_xpath): + + value = row.xpath(value_xpath) + if not value: + continue + value = extract_text(value) + + # save first value in case no ranked 
row is found + if trim and not first_value: + first_value = value + else: + # to avoid duplicate values + if value not in values: + concat_values += value + ", " + values.append(value) + + if trim and not values: + attributes.append({'label': label, + 'value': first_value}) + else: + attributes.append({'label': label, + 'value': concat_values[:-2]}) + + +# requires property_id unless it's a wiki link (defined in link_type) +def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None): + links = [] + + # wiki links don't have property in wikidata page + if link_type and 'wiki' in link_type: + links.append(get_wikilink(result, link_type)) else: - return 0 + dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id)) + if dom_element: + dom_element = dom_element[0] + if not default_label: + label = extract_text(dom_element.xpath(label_xpath)) + label = label[0].upper() + label[1:] + + if link_type == 'geo': + links.append(get_geolink(dom_element)) + + elif link_type == 'imdb': + links.append(get_imdblink(dom_element, url_prefix)) + + else: + url_results = dom_element.xpath(url_xpath) + for link in url_results: + if link is not None: + if url_prefix: + link = url_prefix + extract_text(link) + else: + link = extract_text(link) + links.append(link) + + # append urls + for url in links: + if url is not None: + urls.append({'title': default_label or label, + 'url': url}) + if results is not None: + results.append({'title': default_label or label, + 'url': url}) + + +def get_imdblink(result, url_prefix): + imdb_id = result.xpath(value_xpath) + if imdb_id: + imdb_id = extract_text(imdb_id) + id_prefix = imdb_id[:2] + if id_prefix == 'tt': + url = url_prefix + 'title/' + imdb_id + elif id_prefix == 'nm': + url = url_prefix + 'name/' + imdb_id + elif id_prefix == 'ch': + url = url_prefix + 'character/' + imdb_id + elif id_prefix == 'co': + url = url_prefix + 'company/' + imdb_id + elif id_prefix == 'ev': + 
url = url_prefix + 'event/' + imdb_id + else: + url = None + return url -def get_mainsnak(claims, propertyName): - propValue = claims.get(propertyName, {}) - if len(propValue) == 0: +def get_geolink(result): + coordinates = result.xpath(value_xpath) + if not coordinates: return None - - propValue = propValue[0].get('mainsnak', None) - return propValue - - -def get_string(claims, propertyName, defaultValue=None): - propValue = claims.get(propertyName, {}) - if len(propValue) == 0: - return defaultValue - - result = [] - for e in propValue: - mainsnak = e.get('mainsnak', {}) - - datavalue = mainsnak.get('datavalue', {}) - if datavalue is not None: - result.append(datavalue.get('value', '')) - - if len(result) == 0: - return defaultValue - else: - # TODO handle multiple urls - return result[0] - - -def get_time(claims, propertyName, locale, defaultValue=None): - propValue = claims.get(propertyName, {}) - if len(propValue) == 0: - return defaultValue - - result = [] - for e in propValue: - mainsnak = e.get('mainsnak', {}) - - datavalue = mainsnak.get('datavalue', {}) - if datavalue is not None: - value = datavalue.get('value', '') - result.append(value.get('time', '')) - - if len(result) == 0: - date_string = defaultValue - else: - date_string = ', '.join(result) - - try: - parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ") - except: - if date_string.startswith('-'): - return date_string.split('T')[0] - try: - parsed_date = dateutil_parse(date_string, fuzzy=False, default=False) - except: - logger.debug('could not parse date %s', date_string) - return date_string.split('T')[0] - - return format_date_by_locale(parsed_date, locale) - - -def get_geolink(claims, propertyName, defaultValue=''): - mainsnak = get_mainsnak(claims, propertyName) - - if mainsnak is None: - return defaultValue - - datatype = mainsnak.get('datatype', '') - datavalue = mainsnak.get('datavalue', {}) - - if datatype != 'globe-coordinate': - return defaultValue - - value = 
datavalue.get('value', {}) - - precision = value.get('precision', 0.0002) - + coordinates = extract_text(coordinates[0]) + latitude, longitude = coordinates.split(',') + + # convert to decimal + lat = int(latitude[:latitude.find(u'°')]) + if latitude.find('\'') >= 0: + lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0 + if latitude.find('"') >= 0: + lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 + if latitude.find('S') >= 0: + lat *= -1 + lon = int(longitude[:longitude.find(u'°')]) + if longitude.find('\'') >= 0: + lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0 + if longitude.find('"') >= 0: + lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 + if longitude.find('W') >= 0: + lon *= -1 + + # TODO: get precision + precision = 0.0002 # there is no zoom information, deduce from precision (error prone) # samples : # 13 --> 5 @@ -298,26 +470,20 @@ def get_geolink(claims, propertyName, defaultValue=''): zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447) url = url_map\ - .replace('{latitude}', str(value.get('latitude', 0)))\ - .replace('{longitude}', str(value.get('longitude', 0)))\ + .replace('{latitude}', str(lat))\ + .replace('{longitude}', str(lon))\ .replace('{zoom}', str(zoom)) return url def get_wikilink(result, wikiid): - url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None) - if url is None: - return url - elif url.startswith('http://'): + url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid)) + if not url: + return None + url = url[0] + if url.startswith('http://'): url = url.replace('http://', 'https://') elif url.startswith('//'): url = 'https:' + url return url - - -def get_wiki_firstlanguage(result, wikipatternid): - for k in result.get('sitelinks', {}).keys(): - if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)): - return k[0:2] - return None diff --git 
a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py @@ -99,9 +99,8 @@ def response(resp): return [] # link to wikipedia article - # parenthesis are not quoted to make infobox mergeable with wikidata's wikipedia_link = url_lang(resp.search_params['language']) \ - + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')') + + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')) results.append({'url': wikipedia_link, 'title': title}) diff --git a/searx/results.py b/searx/results.py @@ -18,7 +18,17 @@ def result_content_len(content): def compare_urls(url_a, url_b): - if url_a.netloc != url_b.netloc or url_a.query != url_b.query: + # ignore www. in comparison + if url_a.netloc.startswith('www.'): + host_a = url_a.netloc.replace('www.', '', 1) + else: + host_a = url_a.netloc + if url_b.netloc.startswith('www.'): + host_b = url_b.netloc.replace('www.', '', 1) + else: + host_b = url_b.netloc + + if host_a != host_b or url_a.query != url_b.query: return False # remove / from the end of the url if required @@ -33,25 +43,42 @@ def compare_urls(url_a, url_b): def merge_two_infoboxes(infobox1, infobox2): + # get engines weights + if hasattr(engines[infobox1['engine']], 'weight'): + weight1 = engines[infobox1['engine']].weight + else: + weight1 = 1 + if hasattr(engines[infobox2['engine']], 'weight'): + weight2 = engines[infobox2['engine']].weight + else: + weight2 = 1 + + if weight2 > weight1: + infobox1['engine'] = infobox2['engine'] + if 'urls' in infobox2: urls1 = infobox1.get('urls', None) if urls1 is None: urls1 = [] - infobox1['urls'] = urls1 - urlSet = set() - for url in infobox1.get('urls', []): - urlSet.add(url.get('url', None)) + for url2 in infobox2.get('urls', []): + unique_url = True + for url1 in infobox1.get('urls', []): + if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): + unique_url = False + break + if unique_url: + urls1.append(url2) - for url in infobox2.get('urls', []): - if 
url.get('url', None) not in urlSet: - urls1.append(url) + infobox1['urls'] = urls1 if 'img_src' in infobox2: img1 = infobox1.get('img_src', None) img2 = infobox2.get('img_src') if img1 is None: infobox1['img_src'] = img2 + elif weight2 > weight1: + infobox1['img_src'] = img2 if 'attributes' in infobox2: attributes1 = infobox1.get('attributes', None) @@ -65,7 +92,8 @@ def merge_two_infoboxes(infobox1, infobox2): attributeSet.add(attribute.get('label', None)) for attribute in infobox2.get('attributes', []): - attributes1.append(attribute) + if attribute.get('label', None) not in attributeSet: + attributes1.append(attribute) if 'content' in infobox2: content1 = infobox1.get('content', None) @@ -97,7 +125,6 @@ class ResultContainer(object): self.results = defaultdict(list) self._merged_results = [] self.infoboxes = [] - self._infobox_ids = {} self.suggestions = set() self.answers = set() self._number_of_results = [] @@ -138,14 +165,13 @@ class ResultContainer(object): add_infobox = True infobox_id = infobox.get('id', None) if infobox_id is not None: - existingIndex = self._infobox_ids.get(infobox_id, None) - if existingIndex is not None: - merge_two_infoboxes(self.infoboxes[existingIndex], infobox) - add_infobox = False + for existingIndex in self.infoboxes: + if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): + merge_two_infoboxes(existingIndex, infobox) + add_infobox = False if add_infobox: self.infoboxes.append(infobox) - self._infobox_ids[infobox_id] = len(self.infoboxes) - 1 def _merge_result(self, result, position): result['parsed_url'] = urlparse(result['url']) @@ -155,11 +181,6 @@ class ResultContainer(object): result['parsed_url'] = result['parsed_url']._replace(scheme="http") result['url'] = result['parsed_url'].geturl() - result['host'] = result['parsed_url'].netloc - - if result['host'].startswith('www.'): - result['host'] = result['host'].replace('www.', '', 1) - result['engines'] = [result['engine']] # strip multiple spaces and 
cariage returns from content diff --git a/searx/settings.yml b/searx/settings.yml @@ -105,6 +105,7 @@ engines: - name : ddg definitions engine : duckduckgo_definitions shortcut : ddd + weight : 2 disabled : True - name : digg @@ -127,6 +128,7 @@ engines: - name : wikidata engine : wikidata shortcut : wd + weight : 2 - name : duckduckgo engine : duckduckgo diff --git a/searx/templates/default/infobox.html b/searx/templates/default/infobox.html @@ -1,18 +1,18 @@ <div class="infobox"> - <h2>{{ infobox.infobox }}</h2> +<h2><bdi>{{ infobox.infobox }}</bdi></h2> {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %} - <p>{{ infobox.entity }}</p> - <p>{{ infobox.content | safe }}</p> + <p><bdi>{{ infobox.entity }}</bdi></p> + <p><bdi>{{ infobox.content | safe }}</bdi></p> {% if infobox.attributes %} <div class="attributes"> <table> {% for attribute in infobox.attributes %} <tr> - <td>{{ attribute.label }}</td> + <td><bdi>{{ attribute.label }}</bdi></td> {% if attribute.image %} <td><img src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> {% else %} - <td>{{ attribute.value }}</td> + <td><bdi>{{ attribute.value }}</bdi></td> {% endif %} </tr> {% endfor %} @@ -24,7 +24,7 @@ <div class="urls"> <ul> {% for url in infobox.urls %} - <li class="url"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></li> + <li class="url"><bdi><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></bdi></li> {% endfor %} </ul> </div> @@ -34,7 +34,7 @@ <div class="relatedTopics"> {% for topic in infobox.relatedTopics %} <div> - <h3>{{ topic.name }}</h3> + <h3><bdi>{{ topic.name }}</bdi></h3> {% for suggestion in topic.suggestions %} <form method="{{ method or 'POST' }}" action="{{ url_for('index') }}"> <input type="hidden" name="q" value="{{ suggestion }}"> diff --git a/searx/templates/oscar/infobox.html 
b/searx/templates/oscar/infobox.html @@ -1,21 +1,20 @@ <div class="panel panel-default infobox"> <div class="panel-heading"> - <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi> + <h4 class="panel-title infobox_part"><bdi>{{ infobox.infobox }}</bdi></h4> </div> <div class="panel-body"> - <bdi> {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %} - {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %} + {% if infobox.content %}<bdi><p class="infobox_part">{{ infobox.content }}</bdi></p>{% endif %} {% if infobox.attributes %} <table class="table table-striped infobox_part"> {% for attribute in infobox.attributes %} <tr> - <td>{{ attribute.label }}</td> + <td><bdi>{{ attribute.label }}</bdi></td> {% if attribute.image %} <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> {% else %} - <td>{{ attribute.value }}</td> + <td><bdi>{{ attribute.value }}</bdi></td> {% endif %} </tr> {% endfor %} @@ -24,11 +23,12 @@ {% if infobox.urls %} <div class="infobox_part"> + <bdi> {% for url in infobox.urls %} <p class="btn btn-default btn-xs"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></p> {% endfor %} + </bdi> </div> {% endif %} - </bdi> </div> </div> diff --git a/searx/utils.py b/searx/utils.py @@ -206,7 +206,13 @@ def format_date_by_locale(date, locale_string): if locale_string == 'all': locale_string = settings['ui']['default_locale'] or 'en_US' - return format_date(date, locale=locale_string) + # to avoid crashing if locale is not supported by babel + try: + formatted_date = format_date(date, locale=locale_string) + except: + formatted_date = format_date(date, "YYYY-MM-dd") + + return formatted_date def dict_subset(d, properties): diff --git a/tests/unit/engines/test_wikidata.py b/tests/unit/engines/test_wikidata.py @@ -0,0 +1,504 @@ 
# -*- coding: utf-8 -*-
"""Unit tests for the Wikidata engine (``searx.engines.wikidata``)."""
from json import loads
from lxml.html import fromstring
from collections import defaultdict
import mock
from searx.engines import wikidata
from searx.testing import SearxTestCase


class TestWikidataEngine(SearxTestCase):
    """Exercise the request builder, the response parser and the HTML
    scraping helpers of the Wikidata engine against inline HTML fixtures.

    No network access is performed: every helper is fed a hand-written
    HTML snippet parsed with ``lxml.html.fromstring``.
    """

    def test_request(self):
        """The request URL contains the query, the host and a language code."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['language'] = 'all'
        params = wikidata.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('wikidata.org', params['url'])
        self.assertIn('en', params['url'])

        dicto['language'] = 'es_ES'
        params = wikidata.request(query, dicto)
        self.assertIn(query, params['url'])
        # NOTE(review): 'es' is also a substring of 'test_query', so this
        # assertion already holds once the query is in the URL; it does not
        # really verify the language parameter.
        self.assertIn('es', params['url'])

    # successful cases are not tested here to avoid sending additional requests
    def test_response(self):
        """Invalid response objects raise; an empty page yields no results."""
        self.assertRaises(AttributeError, wikidata.response, None)
        self.assertRaises(AttributeError, wikidata.response, [])
        self.assertRaises(AttributeError, wikidata.response, '')
        self.assertRaises(AttributeError, wikidata.response, '[]')

        response = mock.Mock(content='<html></html>', search_params={"language": "all"})
        self.assertEqual(wikidata.response(response), [])

    def test_getDetail(self):
        """``getDetail`` builds url results and an infobox from a parse response."""
        # an empty API response produces no results
        response = {}
        results = wikidata.getDetail(response, "Q123", "en", "en-US")
        self.assertEqual(results, [])

        # entity with only a title and a Wikipedia sitelink
        title_html = '<div><div class="wikibase-title-label">Test</div></div>'
        html = """
        <div>
            <div class="wikibase-entitytermsview-heading-description">
            </div>
            <div>
                <ul class="wikibase-sitelinklistview-listview">
                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
                </ul>
            </div>
        </div>
        """
        response = {"parse": {"displaytitle": title_html, "text": html}}

        results = wikidata.getDetail(response, "Q123", "en", "en-US")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test')

        # entity with language-fallback markers, a description and an
        # official website, requested in a language ("yua") for which the
        # fixture only offers English content
        title_html = """
        <div>
            <div class="wikibase-title-label">
                <span lang="en">Test</span>
                <sup class="wb-language-fallback-indicator">English</sup>
            </div>
        </div>
        """
        html = """
        <div>
            <div class="wikibase-entitytermsview-heading-description">
                <span lang="en">Description</span>
                <sup class="wb-language-fallback-indicator">English</sup>
            </div>
            <div id="P856">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P856">
                        <span lang="en">official website</span>
                        <sup class="wb-language-fallback-indicator">English</sup>
                    </a>
                </div>
                <div class="wikibase-statementview-mainsnak">
                    <a class="external free" href="https://officialsite.com">
                        https://officialsite.com
                    </a>
                </div>
            </div>
            <div>
                <ul class="wikibase-sitelinklistview-listview">
                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
                </ul>
            </div>
        </div>
        """
        response = {"parse": {"displaytitle": title_html, "text": html}}

        results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Official website')
        self.assertEqual(results[0]['url'], 'https://officialsite.com')

        self.assertEqual(results[1]['infobox'], 'Test')
        self.assertEqual(results[1]['id'], None)
        self.assertEqual(results[1]['content'], 'Description')
        self.assertEqual(results[1]['attributes'], [])
        self.assertEqual(results[1]['urls'][0]['title'], 'Official website')
        self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
        self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
        self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')

    def test_add_image(self):
        """``add_image`` returns a Commons FilePath URL, or None without images."""
        image_src = wikidata.add_image(fromstring("<div></div>"))
        self.assertEqual(image_src, None)

        # single "image" (P18) statement
        html = u"""
        <div>
            <div id="P18">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P18">
                        image
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="https://commons.wikimedia.org/wiki/File:image.png">
                                        image.png
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)

        image_src = wikidata.add_image(html_etree)
        self.assertEqual(image_src,
                         "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")

        # both "icon" (P2910) and "logo" (P154) present: the logo is expected
        html = u"""
        <div>
            <div id="P2910">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P2910">
                        icon
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="https://commons.wikimedia.org/wiki/File:icon.png">
                                        icon.png
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
            <div id="P154">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P154">
                        logo
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="https://commons.wikimedia.org/wiki/File:logo.png">
                                        logo.png
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)

        image_src = wikidata.add_image(html_etree)
        self.assertEqual(image_src,
                         "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")

    def test_add_attribute(self):
        """``add_attribute`` appends label/value pairs for a property id."""
        html = u"""
        <div>
            <div id="P27">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P27">
                        country of citizenship
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="/wiki/Q145">
                                        United Kingdom
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        attributes = []
        html_etree = fromstring(html)

        # a property id that is absent from the page adds nothing
        wikidata.add_attribute(attributes, html_etree, "Fail")
        self.assertEqual(attributes, [])

        wikidata.add_attribute(attributes, html_etree, "P27")
        self.assertEqual(len(attributes), 1)
        self.assertEqual(attributes[0]["label"], "Country of citizenship")
        self.assertEqual(attributes[0]["value"], "United Kingdom")

        # date value followed by a calendar-name marker
        html = u"""
        <div>
            <div id="P569">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P569">
                        date of birth
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    27 January 1832
                                    <sup class="wb-calendar-name">
                                        Gregorian
                                    </sup>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        attributes = []
        html_etree = fromstring(html)
        wikidata.add_attribute(attributes, html_etree, "P569", date=True)
        self.assertEqual(len(attributes), 1)
        self.assertEqual(attributes[0]["label"], "Date of birth")
        self.assertEqual(attributes[0]["value"], "27 January 1832")

        # two statements, one normal rank and one preferred rank
        html = u"""
        <div>
            <div id="P6">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P27">
                        head of government
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-normal"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="/wiki/Q206">
                                        Old Prime Minister
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-rankselector">
                            <span class="wikibase-rankselector-preferred"></span>
                        </div>
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a href="/wiki/Q3099714">
                                        Actual Prime Minister
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        attributes = []
        html_etree = fromstring(html)
        wikidata.add_attribute(attributes, html_etree, "P6")
        self.assertEqual(len(attributes), 1)
        self.assertEqual(attributes[0]["label"], "Head of government")
        self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")

        # with trim=True only the preferred-rank statement's value is expected
        attributes = []
        html_etree = fromstring(html)
        wikidata.add_attribute(attributes, html_etree, "P6", trim=True)
        self.assertEqual(len(attributes), 1)
        self.assertEqual(attributes[0]["value"], "Actual Prime Minister")

    def test_add_url(self):
        """``add_url`` collects link statements into ``urls`` (and ``results``)."""
        html = u"""
        <div>
            <div id="P856">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P856">
                        official website
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a class="external free" href="https://searx.me">
                                        https://searx.me/
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        urls = []
        html_etree = fromstring(html)
        wikidata.add_url(urls, html_etree, 'P856')
        # assertEqual is used instead of the deprecated assertEquals alias
        self.assertEqual(len(urls), 1)
        self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)

        # a custom label overrides the scraped one; the entry is also
        # appended to the optional ``results`` list
        urls = []
        results = []
        wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
        self.assertEqual(len(urls), 1)
        self.assertEqual(len(results), 1)
        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls)
        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results)

        # two statements for the same property yield two url entries
        html = u"""
        <div>
            <div id="P856">
                <div class="wikibase-statementgroupview-property-label">
                    <a href="/wiki/Property:P856">
                        official website
                    </a>
                </div>
                <div class="wikibase-statementlistview">
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a class="external free" href="http://www.worldofwarcraft.com">
                                        http://www.worldofwarcraft.com
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                    <div class="wikibase-statementview listview-item">
                        <div class="wikibase-statementview-mainsnak">
                            <div>
                                <div class="wikibase-snakview-value">
                                    <a class="external free" href="http://eu.battle.net/wow/en/">
                                        http://eu.battle.net/wow/en/
                                    </a>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        urls = []
        html_etree = fromstring(html)
        wikidata.add_url(urls, html_etree, 'P856')
        self.assertEqual(len(urls), 2)
        self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
        self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)

    def test_get_imdblink(self):
        """``get_imdblink`` builds an IMDb URL from an external-id statement."""
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
                            tt0433664
                        </a>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        # NOTE(review): the result for this "tt..." id is computed but never
        # asserted; presumably an assertion was intended -- add one once the
        # expected URL is confirmed against the engine.
        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')

        # NOTE(review): the href attribute below is malformed (duplicated
        # `href="`), which looks like a paste error; confirm the engine only
        # reads the link text before cleaning up the fixture.
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        <a class="wb-external-id"
                           href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994"">
                            nm4915994
                        </a>
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
        self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)

    def test_get_geolink(self):
        """``get_geolink`` converts coordinates into an OpenStreetMap URL."""
        # whole degrees, northern / eastern hemisphere
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        60°N, 40°E
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        geolink = wikidata.get_geolink(html_etree)
        self.assertIn('https://www.openstreetmap.org/', geolink)
        self.assertIn('lat=60&lon=40', geolink)

        # degrees/minutes/seconds, southern / western hemisphere: both
        # coordinates are expected to be negative decimals
        html = u"""
        <div>
            <div class="wikibase-statementview-mainsnak">
                <div>
                    <div class="wikibase-snakview-value">
                        34°35'59"S, 58°22'55"W
                    </div>
                </div>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        geolink = wikidata.get_geolink(html_etree)
        self.assertIn('https://www.openstreetmap.org/', geolink)
        self.assertIn('lat=-34.59', geolink)
        self.assertIn('lon=-58.38', geolink)

    def test_get_wikilink(self):
        """``get_wikilink`` returns the https sitelink for a site id, or None."""
        html = """
        <div>
            <div>
                <ul class="wikibase-sitelinklistview-listview">
                    <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li>
                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
                </ul>
            </div>
            <div>
                <ul class="wikibase-sitelinklistview-listview">
                    <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li>
                </ul>
            </div>
        </div>
        """
        html_etree = fromstring(html)
        wikilink = wikidata.get_wikilink(html_etree, 'nowiki')
        self.assertEqual(wikilink, None)
        wikilink = wikidata.get_wikilink(html_etree, 'enwiki')
        self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test')
        wikilink = wikidata.get_wikilink(html_etree, 'arwiki')
        self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test')
        wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote')
        self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test')