logo

searx

My custom branch(es) of searx, a meta-search engine
commit: b5a3dfca60f23bac10ade068c40729f030bbad63
parent: 09b7673fbd271349b6878959bd2e1ae846981e13
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Tue, 19 Jan 2016 17:02:14 +0100

Merge pull request #486 from a01200356/master

[enh] WolframAlpha no API engine (and tests for both)

Diffstat:

M searx/engines/wolframalpha_api.py | 35 ++++++++++++++++++++++++++---------
A searx/engines/wolframalpha_noapi.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 16 +++++++++-------
A tests/unit/engines/test_wolframalpha_api.py | 307 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A tests/unit/engines/test_wolframalpha_noapi.py | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 621 insertions(+), 16 deletions(-)

diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py @@ -10,11 +10,18 @@ from urllib import urlencode from lxml import etree +from re import search # search-url base_url = 'http://api.wolframalpha.com/v2/query' search_url = base_url + '?appid={api_key}&{query}&format=plaintext' -api_key = '' +site_url = 'http://www.wolframalpha.com/input/?{query}' +api_key = '' # defined in settings.yml + +# xpath variables +failure_xpath = '/queryresult[attribute::success="false"]' +answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext' +input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext' # do search-request @@ -45,16 +52,26 @@ def response(resp): search_results = etree.XML(resp.content) # return empty array if there are no results - if search_results.xpath('/queryresult[attribute::success="false"]'): + if search_results.xpath(failure_xpath): return [] - # parse result - result = search_results.xpath('//pod[attribute::primary="true"]/subpod/plaintext')[0].text - result = replace_pua_chars(result) + # parse answers + answers = search_results.xpath(answer_xpath) + if answers: + for answer in answers: + answer = replace_pua_chars(answer.text) + + results.append({'answer': answer}) + + # if there's no input section in search_results, check if answer has the input embedded (before their "=" sign) + try: + query_input = search_results.xpath(input_xpath)[0].text + except IndexError: + query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1) - # append result - # TODO: shouldn't it bind the source too? 
- results.append({'answer': result}) + # append link to site + result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')})) + results.append({'url': result_url, + 'title': query_input + " - Wolfram|Alpha"}) - # return results return results diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py @@ -0,0 +1,86 @@ +# WolframAlpha (Maths) +# +# @website http://www.wolframalpha.com/ +# @provide-api yes (http://api.wolframalpha.com/v2/) +# +# @using-api no +# @results HTML +# @stable no +# @parse answer + +from re import search, sub +from json import loads +from urllib import urlencode +from lxml import html +import HTMLParser + +# search-url +url = 'http://www.wolframalpha.com/' +search_url = url + 'input/?{query}' + +# xpath variables +scripts_xpath = '//script' +title_xpath = '//title' +failure_xpath = '//p[attribute::class="pfail"]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'i': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + line = None + + dom = html.fromstring(resp.text) + scripts = dom.xpath(scripts_xpath) + + # the answer is inside a js function + # answer can be located in different 'pods', although by default it should be in pod_0200 + possible_locations = ['pod_0200\.push\((.*)', + 'pod_0100\.push\((.*)'] + + # failed result + if dom.xpath(failure_xpath): + return results + + # get line that matches the pattern + for pattern in possible_locations: + for script in scripts: + try: + line = search(pattern, script.text_content()).group(1) + break + except AttributeError: + continue + if line: + break + + if line: + # extract answer from json + answer = line[line.find('{'):line.rfind('}') + 1] + try: + answer = loads(answer) + except Exception: + answer = loads(answer.encode('unicode-escape')) + answer = answer['stringified'] + + # clean plaintext answer + h = HTMLParser.HTMLParser() + answer 
= h.unescape(answer.decode('unicode-escape')) + answer = sub(r'\\', '', answer) + + results.append({'answer': answer}) + + # user input is in first part of title + title = dom.xpath(title_xpath)[0].text.encode('utf-8') + result_url = request(title[:-16], {})['url'] + + # append result + results.append({'url': result_url, + 'title': title.decode('utf-8')}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -300,13 +300,15 @@ engines: engine : vimeo shortcut : vm -# You can use the engine using the official stable API, but you need an API key -# See : http://products.wolframalpha.com/api/ -# - name : wolframalpha -# shortcut : wa -# engine : wolframalpha_api -# api_key: 'apikey' # required! -# timeout: 6.0 + - name : wolframalpha + shortcut : wa + # You can use the engine using the official stable API, but you need an API key + # See : http://products.wolframalpha.com/api/ + # engine : wolframalpha_api + # api_key: 'apikey' # required! + engine : wolframalpha_noapi + timeout: 6.0 + disabled : True #The blekko technology and team have joined IBM Watson! 
-> https://blekko.com/ # - name : blekko images diff --git a/tests/unit/engines/test_wolframalpha_api.py b/tests/unit/engines/test_wolframalpha_api.py @@ -0,0 +1,307 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import wolframalpha_api +from searx.testing import SearxTestCase + + +class TestWolframAlphaAPIEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + api_key = 'XXXXXX-XXXXXXXXXX' + dicto = defaultdict(dict) + dicto['api_key'] = api_key + params = wolframalpha_api.request(query, dicto) + + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('wolframalpha.com', params['url']) + + self.assertIn('api_key', params) + self.assertIn(api_key, params['api_key']) + + def test_response(self): + self.assertRaises(AttributeError, wolframalpha_api.response, None) + self.assertRaises(AttributeError, wolframalpha_api.response, []) + self.assertRaises(AttributeError, wolframalpha_api.response, '') + self.assertRaises(AttributeError, wolframalpha_api.response, '[]') + + xml = '''<?xml version='1.0' encoding='UTF-8'?> + <queryresult success='false' error='false' /> + ''' + # test failure + response = mock.Mock(content=xml) + self.assertEqual(wolframalpha_api.response(response), []) + + xml = """<?xml version='1.0' encoding='UTF-8'?> + <queryresult success='true' + error='false' + numpods='6' + datatypes='' + timedout='' + timedoutpods='' + timing='0.684' + parsetiming='0.138' + parsetimedout='false' + recalculate='' + id='MSPa416020a7966dachc463600000f9c66cc21444cfg' + host='http://www3.wolframalpha.com' + server='6' + related='http://www3.wolframalpha.com/api/v2/relatedQueries.jsp?...' 
+ version='2.6'> + <pod title='Input' + scanner='Identity' + id='Input' + position='100' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext>sqrt(-1)</plaintext> + </subpod> + </pod> + <pod title='Result' + scanner='Simplification' + id='Result' + position='200' + error='false' + numsubpods='1' + primary='true'> + <subpod title=''> + <plaintext></plaintext> + </subpod> + <states count='1'> + <state name='Step-by-step solution' + input='Result__Step-by-step solution' /> + </states> + </pod> + <pod title='Polar coordinates' + scanner='Numeric' + id='PolarCoordinates' + position='300' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext>r1 (radius), θ90° (angle)</plaintext> + </subpod> + </pod> + <pod title='Position in the complex plane' + scanner='Numeric' + id='PositionInTheComplexPlane' + position='400' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext></plaintext> + </subpod> + </pod> + <pod title='All 2nd roots of -1' + scanner='RootsOfUnity' + id='' + position='500' + error='false' + numsubpods='2'> + <subpod title=''> + <plaintext> (principal root)</plaintext> + </subpod> + <subpod title=''> + <plaintext>-</plaintext> + </subpod> + </pod> + <pod title='Plot of all roots in the complex plane' + scanner='RootsOfUnity' + id='PlotOfAllRootsInTheComplexPlane' + position='600' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext></plaintext> + </subpod> + </pod> + </queryresult> + """ + # test private user area char in response + response = mock.Mock(content=xml) + results = wolframalpha_api.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('i', results[0]['answer']) + self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=sqrt%28-1%29', results[1]['url']) + + xml = """<?xml version='1.0' encoding='UTF-8'?> + <queryresult success='true' + error='false' + numpods='2' + 
datatypes='' + timedout='' + timedoutpods='' + timing='1.286' + parsetiming='0.255' + parsetimedout='false' + recalculate='' + id='MSPa195222ad740ede5214h30000480ca61h003d3gd6' + host='http://www3.wolframalpha.com' + server='20' + related='http://www3.wolframalpha.com/api/v2/relatedQueries.jsp?id=...' + version='2.6'> + <pod title='Indefinite integral' + scanner='Integral' + id='IndefiniteIntegral' + position='100' + error='false' + numsubpods='1' + primary='true'> + <subpod title=''> + <plaintext>∫1/xxlog(x)+constant</plaintext> + </subpod> + <states count='1'> + <state name='Step-by-step solution' + input='IndefiniteIntegral__Step-by-step solution' /> + </states> + <infos count='1'> + <info text='log(x) is the natural logarithm'> + <link url='http://reference.wolfram.com/mathematica/ref/Log.html' + text='Documentation' + title='Mathematica' /> + <link url='http://functions.wolfram.com/ElementaryFunctions/Log' + text='Properties' + title='Wolfram Functions Site' /> + <link url='http://mathworld.wolfram.com/NaturalLogarithm.html' + text='Definition' + title='MathWorld' /> + </info> + </infos> + </pod> + <pod title='Plots of the integral' + scanner='Integral' + id='Plot' + position='200' + error='false' + numsubpods='2'> + <subpod title=''> + <plaintext></plaintext> + <states count='1'> + <statelist count='2' + value='Complex-valued plot' + delimiters=''> + <state name='Complex-valued plot' + input='Plot__1_Complex-valued plot' /> + <state name='Real-valued plot' + input='Plot__1_Real-valued plot' /> + </statelist> + </states> + </subpod> + <subpod title=''> + <plaintext></plaintext> + <states count='1'> + <statelist count='2' + value='Complex-valued plot' + delimiters=''> + <state name='Complex-valued plot' + input='Plot__2_Complex-valued plot' /> + <state name='Real-valued plot' + input='Plot__2_Real-valued plot' /> + </statelist> + </states> + </subpod> + </pod> + <assumptions count='1'> + <assumption type='Clash' + word='integral' + template='Assuming 
&quot;${word}&quot; is ${desc1}. Use as ${desc2} instead' + count='2'> + <value name='IntegralsWord' + desc='an integral' + input='*C.integral-_*IntegralsWord-' /> + <value name='MathematicalFunctionIdentityPropertyClass' + desc='a function property' + input='*C.integral-_*MathematicalFunctionIdentityPropertyClass-' /> + </assumption> + </assumptions> + </queryresult> + """ + # test integral + response = mock.Mock(content=xml) + results = wolframalpha_api.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('log(x)+c', results[0]['answer']) + self.assertIn('∫1/xx - Wolfram|Alpha'.decode('utf-8'), results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=%E2%88%AB1%2Fx%EF%9D%8Cx', results[1]['url']) + + xml = """<?xml version='1.0' encoding='UTF-8'?> + <queryresult success='true' + error='false' + numpods='4' + datatypes='Solve' + timedout='' + timedoutpods='' + timing='0.79' + parsetiming='0.338' + parsetimedout='false' + recalculate='' + id='MSPa7481f7i06d25h3deh2900004810i3a78d9b4fdc' + host='http://www5b.wolframalpha.com' + server='23' + related='http://www5b.wolframalpha.com/api/v2/relatedQueries.jsp?id=...' 
+ version='2.6'> + <pod title='Input interpretation' + scanner='Identity' + id='Input' + position='100' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext>solve x^2+x0</plaintext> + </subpod> + </pod> + <pod title='Results' + scanner='Solve' + id='Result' + position='200' + error='false' + numsubpods='2' + primary='true'> + <subpod title=''> + <plaintext>x-1</plaintext> + </subpod> + <subpod title=''> + <plaintext>x0</plaintext> + </subpod> + <states count='1'> + <state name='Step-by-step solution' + input='Result__Step-by-step solution' /> + </states> + </pod> + <pod title='Root plot' + scanner='Solve' + id='RootPlot' + position='300' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext></plaintext> + </subpod> + </pod> + <pod title='Number line' + scanner='Solve' + id='NumberLine' + position='400' + error='false' + numsubpods='1'> + <subpod title=''> + <plaintext></plaintext> + </subpod> + </pod> + </queryresult> + """ + # test ecuation with multiple answers + response = mock.Mock(content=xml) + results = wolframalpha_api.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 3) + self.assertIn('x=-1', results[0]['answer']) + self.assertIn('x=0', results[1]['answer']) + self.assertIn('solve x^2+x0 - Wolfram|Alpha'.decode('utf-8'), results[2]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=solve+x%5E2%2Bx%EF%9F%990', results[2]['url']) diff --git a/tests/unit/engines/test_wolframalpha_noapi.py b/tests/unit/engines/test_wolframalpha_noapi.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import wolframalpha_noapi +from searx.testing import SearxTestCase + + +class TestWolframAlphaNoAPIEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 1 + params = wolframalpha_noapi.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, 
params['url']) + self.assertIn('wolframalpha.com', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, wolframalpha_noapi.response, None) + self.assertRaises(AttributeError, wolframalpha_noapi.response, []) + self.assertRaises(AttributeError, wolframalpha_noapi.response, '') + self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]') + + html = """ + <!DOCTYPE html> + <title> Parangaricutirimícuaro - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <div id="closest"> + <p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p> + <div id="dtips"> + <div class="tip"> + <span class="tip-title">Tip:&nbsp;</span> + Check your spelling, and use English + <span class="tip-extra"></span> + </div> + </div> + </div> + </body> + </html> + """ + # test failed query + response = mock.Mock(text=html) + self.assertEqual(wolframalpha_noapi.response(response), []) + + html = """ + <!DOCTYPE html> + <title> sqrt(-1) - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <script type="text/javascript"> + try { + if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { + context.jsonArray.popups.pod_0100 = []; + } + context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""}); + } catch(e) { } + + try { + if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { + context.jsonArray.popups.pod_0200 = []; + } + context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""}); + } catch(e) { } + </script> + </body> + </html> + """ + # test plaintext + response = mock.Mock(text=html) + results = wolframalpha_noapi.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEquals('i', results[0]['answer']) + self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url']) + + html = """ + <!DOCTYPE html> + 
<title> integral 1/x - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <script type="text/javascript"> + try { + if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { + context.jsonArray.popups.pod_0100 = []; + } + context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"}); + } catch(e) { } + </script> + </body> + </html> + """ + # test integral + response = mock.Mock(text=html) + results = wolframalpha_noapi.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('log(x)+c', results[0]['answer']) + self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url']) + + html = """ + <!DOCTYPE html> + <title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <script type="text/javascript"> + try { + if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { + context.jsonArray.popups.pod_0100 = []; + } + context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"}); + } catch(e) { } + </script> + </body> + </html> + """ + # test input in mathematical notation + response = mock.Mock(text=html) + results = wolframalpha_noapi.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('log(x)+c', results[0]['answer']) + self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url']) + + html = """ + <!DOCTYPE html> + <title> 1 euro to yen - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <script type="text/javascript"> + try { + if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { + context.jsonArray.popups.pod_0100 = []; + } + context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1 
(euro) to Japanese yen"}); + } catch(e) { } + + try { + if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { + context.jsonArray.popups.pod_0200 = []; + } + context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5 (Japanese yen)"}); + } catch(e) { } + </script> + </body> + </html> + """ + # test output with htmlentity + response = mock.Mock(text=html) + results = wolframalpha_noapi.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('¥'.decode('utf-8'), results[0]['answer']) + self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url']) + + html = """ + <!DOCTYPE html> + <title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title> + <meta charset="utf-8" /> + <body> + <script type="text/javascript"> + try { + if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { + context.jsonArray.popups.pod_0100 = []; + } +[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"}); + } catch(e) { } + + try { + if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { + context.jsonArray.popups.pod_0200 = []; + } +pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"}); + + } catch(e) { } + </script> + </body> + </html> + """ + # test output with utf-8 character + response = mock.Mock(text=html) + results = wolframalpha_noapi.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer']) + self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title']) + self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches', + results[1]['url'])