logo

searx

My custom branch(es) of searx, a meta-search engine
commit: d06178139f141ae5c6e1908ca70de37371d3572d
parent: 4e5af8d87bc3602fcdb263ad2e1595be91df95c9
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Wed, 17 Feb 2016 17:07:19 +0100

[fix] wolframalpha page changes

related issues: #508 #509

Diffstat:

M searx/engines/wolframalpha_noapi.py | 118 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
M tests/unit/engines/test_wolframalpha_noapi.py | 172 +------------------------------------------------------------------------------
2 files changed, 63 insertions(+), 227 deletions(-)

diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py @@ -8,79 +8,85 @@ # @stable no # @parse answer -from re import search, sub +from cgi import escape from json import loads +from time import time from urllib import urlencode -from lxml import html -import HTMLParser + +from searx.poolrequests import get as http_get # search-url -url = 'http://www.wolframalpha.com/' +url = 'https://www.wolframalpha.com/' search_url = url + 'input/?{query}' +search_url = url + 'input/json.jsp'\ + '?async=true'\ + '&banners=raw'\ + '&debuggingdata=false'\ + '&format=image,plaintext,imagemap,minput,moutput'\ + '&formattimeout=2'\ + '&{query}'\ + '&output=JSON'\ + '&parsetimeout=2'\ + '&proxycode={token}'\ + '&scantimeout=0.5'\ + '&sponsorcategories=true'\ + '&statemethod=deploybutton' + # xpath variables scripts_xpath = '//script' title_xpath = '//title' failure_xpath = '//p[attribute::class="pfail"]' +token = {'value': '', + 'last_updated': None} + + +# seems, wolframalpha resets its token in every hour +def obtain_token(): + update_time = time() - (time() % 3600) + token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) + token['value'] = loads(token_response.text)['code'] + token['last_updated'] = update_time + return token + + +obtain_token() # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'i': query})) + # obtain token if last update was more than an hour + if time() - token['last_updated'] > 3600: + obtain_token() + params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) + params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query return params # get response from search-request def response(resp): - results = [] - line = None - - dom = html.fromstring(resp.text) - scripts = dom.xpath(scripts_xpath) - - # the answer is inside a js function - # answer can be located in different 
'pods', although by default it should be in pod_0200 - possible_locations = ['pod_0200\.push\((.*)', - 'pod_0100\.push\((.*)'] - - # failed result - if dom.xpath(failure_xpath): - return results - - # get line that matches the pattern - for pattern in possible_locations: - for script in scripts: - try: - line = search(pattern, script.text_content()).group(1) - break - except AttributeError: - continue - if line: - break - - if line: - # extract answer from json - answer = line[line.find('{'):line.rfind('}') + 1] - try: - answer = loads(answer) - except Exception: - answer = loads(answer.encode('unicode-escape')) - answer = answer['stringified'] - - # clean plaintext answer - h = HTMLParser.HTMLParser() - answer = h.unescape(answer.decode('unicode-escape')) - answer = sub(r'\\', '', answer) - - results.append({'answer': answer}) - - # user input is in first part of title - title = dom.xpath(title_xpath)[0].text.encode('utf-8') - result_url = request(title[:-16], {})['url'] - - # append result - results.append({'url': result_url, - 'title': title.decode('utf-8')}) - - return results + resp_json = loads(resp.text) + + if not resp_json['queryresult']['success']: + return [] + + # TODO handle resp_json['queryresult']['assumptions'] + result_chunks = [] + for pod in resp_json['queryresult']['pods']: + pod_title = pod.get('title', '') + if 'subpods' not in pod: + continue + for subpod in pod['subpods']: + if 'img' in subpod: + result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>' + .format(escape(pod_title or subpod['img']['alt']), + escape(subpod['img']['src']), + escape(subpod['img']['alt']))) + + if not result_chunks: + return [] + + return [{'url': resp.request.headers['Referer'], + 'title': 'Wolframalpha', + 'content': ''.join(result_chunks)}] diff --git a/tests/unit/engines/test_wolframalpha_noapi.py b/tests/unit/engines/test_wolframalpha_noapi.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from collections import defaultdict -import mock from 
searx.engines import wolframalpha_noapi from searx.testing import SearxTestCase @@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase): self.assertRaises(AttributeError, wolframalpha_noapi.response, []) self.assertRaises(AttributeError, wolframalpha_noapi.response, '') self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]') - - html = """ - <!DOCTYPE html> - <title> Parangaricutirimícuaro - Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <div id="closest"> - <p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p> - <div id="dtips"> - <div class="tip"> - <span class="tip-title">Tip:&nbsp;</span> - Check your spelling, and use English - <span class="tip-extra"></span> - </div> - </div> - </div> - </body> - </html> - """ - # test failed query - response = mock.Mock(text=html) - self.assertEqual(wolframalpha_noapi.response(response), []) - - html = """ - <!DOCTYPE html> - <title> sqrt(-1) - Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <script type="text/javascript"> - try { - if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { - context.jsonArray.popups.pod_0100 = []; - } - context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""}); - } catch(e) { } - - try { - if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { - context.jsonArray.popups.pod_0200 = []; - } - context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""}); - } catch(e) { } - </script> - </body> - </html> - """ - # test plaintext - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertEquals('i', results[0]['answer']) - self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url']) - - html = """ - <!DOCTYPE html> - <title> integral 1/x - 
Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <script type="text/javascript"> - try { - if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { - context.jsonArray.popups.pod_0100 = []; - } - context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"}); - } catch(e) { } - </script> - </body> - </html> - """ - # test integral - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('log(x)+c', results[0]['answer']) - self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url']) - - html = """ - <!DOCTYPE html> - <title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <script type="text/javascript"> - try { - if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { - context.jsonArray.popups.pod_0100 = []; - } - context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"}); - } catch(e) { } - </script> - </body> - </html> - """ - # test input in mathematical notation - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('log(x)+c', results[0]['answer']) - self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url']) - - html = """ - <!DOCTYPE html> - <title> 1 euro to yen - Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <script type="text/javascript"> - try { - if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { - context.jsonArray.popups.pod_0100 = []; - } - context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1 (euro) to Japanese yen"}); - 
} catch(e) { } - - try { - if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { - context.jsonArray.popups.pod_0200 = []; - } - context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5 (Japanese yen)"}); - } catch(e) { } - </script> - </body> - </html> - """ - # test output with htmlentity - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('¥'.decode('utf-8'), results[0]['answer']) - self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url']) - - html = """ - <!DOCTYPE html> - <title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title> - <meta charset="utf-8" /> - <body> - <script type="text/javascript"> - try { - if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) { - context.jsonArray.popups.pod_0100 = []; - } -[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"}); - } catch(e) { } - - try { - if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) { - context.jsonArray.popups.pod_0200 = []; - } -pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"}); - - } catch(e) { } - </script> - </body> - </html> - """ - # test output with utf-8 character - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer']) - self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches', - results[1]['url']) + # TODO