commit: d06178139f141ae5c6e1908ca70de37371d3572d
parent: 4e5af8d87bc3602fcdb263ad2e1595be91df95c9
Author: Adam Tauber <asciimoo@gmail.com>
Date: Wed, 17 Feb 2016 17:07:19 +0100
[fix] wolframalpha page changes
related issues: #508 #509
Diffstat:
2 files changed, 63 insertions(+), 227 deletions(-)
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
@@ -8,79 +8,85 @@
# @stable no
# @parse answer
-from re import search, sub
+from cgi import escape
from json import loads
+from time import time
from urllib import urlencode
-from lxml import html
-import HTMLParser
+
+from searx.poolrequests import get as http_get
# search-url
-url = 'http://www.wolframalpha.com/'
+url = 'https://www.wolframalpha.com/'
search_url = url + 'input/?{query}'
+search_url = url + 'input/json.jsp'\
+ '?async=true'\
+ '&banners=raw'\
+ '&debuggingdata=false'\
+ '&format=image,plaintext,imagemap,minput,moutput'\
+ '&formattimeout=2'\
+ '&{query}'\
+ '&output=JSON'\
+ '&parsetimeout=2'\
+ '&proxycode={token}'\
+ '&scantimeout=0.5'\
+ '&sponsorcategories=true'\
+ '&statemethod=deploybutton'
+
# xpath variables
scripts_xpath = '//script'
title_xpath = '//title'
failure_xpath = '//p[attribute::class="pfail"]'
+token = {'value': '',
+ 'last_updated': None}
+
+
+# it seems that wolframalpha resets its token every hour
+def obtain_token():
+    """Fetch a fresh WolframAlpha proxy code and cache it in ``token``.
+
+    Stores the ``code`` field of the endpoint's JSON reply in
+    ``token['value']`` and the start of the current clock hour (epoch
+    seconds, rounded down) in ``token['last_updated']``.
+
+    Returns the module-level ``token`` dict.  Errors raised by the HTTP call
+    or by JSON decoding are not caught here.
+    """
+    # read the clock once so both terms describe the same instant; two
+    # separate time() calls could straddle an hour boundary
+    now = time()
+    update_time = now - (now % 3600)
+    token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+    token['value'] = loads(token_response.text)['code']
+    token['last_updated'] = update_time
+    return token
+
+
+obtain_token()
# do search-request
def request(query, params):
- params['url'] = search_url.format(query=urlencode({'i': query}))
+    """Fill ``params`` with the WolframAlpha JSON request for ``query``.
+
+    Sets ``params['url']`` to the formatted ``search_url`` (query plus the
+    cached token) and ``params['headers']['Referer']`` to the matching
+    human-facing input page, then returns ``params``.
+    """
+    # obtain token if it was never fetched or the last update was more than an hour ago
+    if token['last_updated'] is None or time() - token['last_updated'] > 3600:
+        obtain_token()
+    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
+    # url-encode the query: raw spaces, '&', '#' or non-ASCII text would
+    # otherwise yield a malformed Referer (also reused as the result link)
+    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?' + urlencode({'i': query})
return params
# get response from search-request
def response(resp):
- results = []
- line = None
-
- dom = html.fromstring(resp.text)
- scripts = dom.xpath(scripts_xpath)
-
- # the answer is inside a js function
- # answer can be located in different 'pods', although by default it should be in pod_0200
- possible_locations = ['pod_0200\.push\((.*)',
- 'pod_0100\.push\((.*)']
-
- # failed result
- if dom.xpath(failure_xpath):
- return results
-
- # get line that matches the pattern
- for pattern in possible_locations:
- for script in scripts:
- try:
- line = search(pattern, script.text_content()).group(1)
- break
- except AttributeError:
- continue
- if line:
- break
-
- if line:
- # extract answer from json
- answer = line[line.find('{'):line.rfind('}') + 1]
- try:
- answer = loads(answer)
- except Exception:
- answer = loads(answer.encode('unicode-escape'))
- answer = answer['stringified']
-
- # clean plaintext answer
- h = HTMLParser.HTMLParser()
- answer = h.unescape(answer.decode('unicode-escape'))
- answer = sub(r'\\', '', answer)
-
- results.append({'answer': answer})
-
- # user input is in first part of title
- title = dom.xpath(title_xpath)[0].text.encode('utf-8')
- result_url = request(title[:-16], {})['url']
-
- # append result
- results.append({'url': result_url,
- 'title': title.decode('utf-8')})
-
- return results
+    """Turn a WolframAlpha JSON reply into at most one HTML result.
+
+    Every subpod carrying an ``img`` entry becomes one ``<p>`` chunk (pod
+    title, falling back to the image alt text, followed by the image).
+
+    Returns an empty list when the query was unsuccessful or produced no
+    image subpods; otherwise a one-element list whose ``url`` is the Referer
+    header of the originating request and whose ``content`` is the
+    concatenated chunks.
+    """
+    resp_json = loads(resp.text)
+
+    if not resp_json['queryresult']['success']:
+        return []
+
+    # TODO handle resp_json['queryresult']['assumptions']
+    result_chunks = []
+    # a successful reply without a 'pods' key yields no chunks instead of a KeyError
+    for pod in resp_json['queryresult'].get('pods', []):
+        pod_title = pod.get('title', '')
+        if 'subpods' not in pod:
+            continue
+        for subpod in pod['subpods']:
+            if 'img' in subpod:
+                # src/alt land inside double-quoted attributes, so quotes must
+                # be escaped too (escape() leaves them alone by default)
+                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
+                                     .format(escape(pod_title or subpod['img']['alt']),
+                                             escape(subpod['img']['src'], True),
+                                             escape(subpod['img']['alt'], True)))
+
+    if not result_chunks:
+        return []
+
+    return [{'url': resp.request.headers['Referer'],
+             'title': 'Wolframalpha',
+             'content': ''.join(result_chunks)}]
diff --git a/tests/unit/engines/test_wolframalpha_noapi.py b/tests/unit/engines/test_wolframalpha_noapi.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
-import mock
from searx.engines import wolframalpha_noapi
from searx.testing import SearxTestCase
@@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
self.assertRaises(AttributeError, wolframalpha_noapi.response, [])
self.assertRaises(AttributeError, wolframalpha_noapi.response, '')
self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]')
-
- html = """
- <!DOCTYPE html>
- <title> Parangaricutirimícuaro - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <div id="closest">
- <p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p>
- <div id="dtips">
- <div class="tip">
- <span class="tip-title">Tip: </span>
- Check your spelling, and use English
- <span class="tip-extra"></span>
- </div>
- </div>
- </div>
- </body>
- </html>
- """
- # test failed query
- response = mock.Mock(text=html)
- self.assertEqual(wolframalpha_noapi.response(response), [])
-
- html = """
- <!DOCTYPE html>
- <title> sqrt(-1) - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <script type="text/javascript">
- try {
- if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
- context.jsonArray.popups.pod_0100 = [];
- }
- context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""});
- } catch(e) { }
-
- try {
- if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
- context.jsonArray.popups.pod_0200 = [];
- }
- context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""});
- } catch(e) { }
- </script>
- </body>
- </html>
- """
- # test plaintext
- response = mock.Mock(text=html)
- results = wolframalpha_noapi.response(response)
- self.assertEqual(type(results), list)
- self.assertEqual(len(results), 2)
- self.assertEquals('i', results[0]['answer'])
- self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title'])
- self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url'])
-
- html = """
- <!DOCTYPE html>
- <title> integral 1/x - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <script type="text/javascript">
- try {
- if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
- context.jsonArray.popups.pod_0100 = [];
- }
- context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
- } catch(e) { }
- </script>
- </body>
- </html>
- """
- # test integral
- response = mock.Mock(text=html)
- results = wolframalpha_noapi.response(response)
- self.assertEqual(type(results), list)
- self.assertEqual(len(results), 2)
- self.assertIn('log(x)+c', results[0]['answer'])
- self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title'])
- self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url'])
-
- html = """
- <!DOCTYPE html>
- <title> ∫1/x x - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <script type="text/javascript">
- try {
- if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
- context.jsonArray.popups.pod_0100 = [];
- }
- context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
- } catch(e) { }
- </script>
- </body>
- </html>
- """
- # test input in mathematical notation
- response = mock.Mock(text=html)
- results = wolframalpha_noapi.response(response)
- self.assertEqual(type(results), list)
- self.assertEqual(len(results), 2)
- self.assertIn('log(x)+c', results[0]['answer'])
- self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title'])
- self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url'])
-
- html = """
- <!DOCTYPE html>
- <title> 1 euro to yen - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <script type="text/javascript">
- try {
- if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
- context.jsonArray.popups.pod_0100 = [];
- }
- context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1 (euro) to Japanese yen"});
- } catch(e) { }
-
- try {
- if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
- context.jsonArray.popups.pod_0200 = [];
- }
- context.jsonArray.popups.pod_0200.push( {"stringified": "¥130.5 (Japanese yen)"});
- } catch(e) { }
- </script>
- </body>
- </html>
- """
- # test output with htmlentity
- response = mock.Mock(text=html)
- results = wolframalpha_noapi.response(response)
- self.assertEqual(type(results), list)
- self.assertEqual(len(results), 2)
- self.assertIn('¥'.decode('utf-8'), results[0]['answer'])
- self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title'])
- self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url'])
-
- html = """
- <!DOCTYPE html>
- <title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title>
- <meta charset="utf-8" />
- <body>
- <script type="text/javascript">
- try {
- if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
- context.jsonArray.popups.pod_0100 = [];
- }
-[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"});
- } catch(e) { }
-
- try {
- if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
- context.jsonArray.popups.pod_0200 = [];
- }
-pod_0200.push({"stringified": "4.295×10^8 inches","mOutput": "Quantity[4.295×10^8,&quot;Inches&quot;]"});
-
- } catch(e) { }
- </script>
- </body>
- </html>
- """
- # test output with utf-8 character
- response = mock.Mock(text=html)
- results = wolframalpha_noapi.response(response)
- self.assertEqual(type(results), list)
- self.assertEqual(len(results), 2)
- self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer'])
- self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title'])
- self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches',
- results[1]['url'])
+ # TODO