commit: c65a409f0d2728ba5a0c3ffa1a0cb05659033a71
parent: e60e98156f9aeeb40ca7272c883704c095d9f7d4
Author: marc <a01200356@itesm.mx>
Date: Sat, 20 May 2017 22:33:08 -0500
add duckduckgo images engine
Diffstat:
4 files changed, 204 insertions(+), 30 deletions(-)
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
@@ -41,46 +41,51 @@ title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
-# do search-request
-def request(query, params):
- if params['time_range'] and params['time_range'] not in time_range_dict:
- return params
-
- offset = 30 + (params['pageno'] - 1) * 50
- dc_param = offset + 1
-
+# match query's language to a region code that duckduckgo will accept
+#
+# lang is the query language: 'all', a bare language code, or a
+# 'language-COUNTRY' pair. Returns None for 'all'; otherwise a lower-cased
+# '<country>-<language>' string, using the hard-coded exceptions below
+# first. For a bare language code the country is taken from the first
+# entry of supported_languages with that language; if no entry matches,
+# the bare lower-cased language code is returned unchanged.
+#
+# NOTE(review): the lookup reads lc[1] after splitting on '-', so it
+# presumably expects every supported_languages entry to contain a country
+# part; an entry without one would raise IndexError - confirm.
+def get_region_code(lang):
# custom fixes for languages
- if params['language'] == 'all':
- locale = None
- elif params['language'][:2] == 'ja':
- locale = 'jp-jp'
- elif params['language'][:2] == 'sl':
- locale = 'sl-sl'
- elif params['language'] == 'zh-TW':
- locale = 'tw-tzh'
- elif params['language'] == 'zh-HK':
- locale = 'hk-tzh'
- elif params['language'][-2:] == 'SA':
- locale = 'xa-' + params['language'].split('-')[0]
- elif params['language'][-2:] == 'GB':
- locale = 'uk-' + params['language'].split('-')[0]
+ if lang == 'all':
+ region_code = None
+ elif lang[:2] == 'ja':
+ region_code = 'jp-jp'
+ elif lang[:2] == 'sl':
+ region_code = 'sl-sl'
+ elif lang == 'zh-TW':
+ region_code = 'tw-tzh'
+ elif lang == 'zh-HK':
+ region_code = 'hk-tzh'
+ elif lang[-2:] == 'SA':
+ region_code = 'xa-' + lang.split('-')[0]
+ elif lang[-2:] == 'GB':
+ region_code = 'uk-' + lang.split('-')[0]
else:
- locale = params['language'].split('-')
- if len(locale) == 2:
+ region_code = lang.split('-')
+ if len(region_code) == 2:
# country code goes first
- locale = locale[1].lower() + '-' + locale[0].lower()
+ region_code = region_code[1].lower() + '-' + region_code[0].lower()
else:
# tries to get a country code from language
- locale = locale[0].lower()
+ region_code = region_code[0].lower()
for lc in supported_languages:
lc = lc.split('-')
- if locale == lc[0]:
- locale = lc[1].lower() + '-' + lc[0].lower()
+ if region_code == lc[0]:
+ region_code = lc[1].lower() + '-' + lc[0].lower()
break
+ return region_code
+
+
+# do search-request
+def request(query, params):
+ if params['time_range'] and params['time_range'] not in time_range_dict:
+ return params
+
+ offset = 30 + (params['pageno'] - 1) * 50
+ dc_param = offset + 1
- if locale:
+ region_code = get_region_code(params['language'])
+ if region_code:
params['url'] = url.format(
- query=urlencode({'q': query, 'kl': locale}), offset=offset, dc_param=dc_param)
+ query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=dc_param)
else:
params['url'] = url.format(
query=urlencode({'q': query}), offset=offset, dc_param=dc_param)
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
@@ -0,0 +1,91 @@
+"""
+ DuckDuckGo (Images)
+
+ @website https://duckduckgo.com/
+ @provide-api yes (https://duckduckgo.com/api),
+ but images are not supported
+
+ @using-api no
+ @results JSON (site requires js to get images)
+ @stable no (JSON can change)
+ @parse url, title, img_src
+
+ @todo avoid extra request
+"""
+
+from requests import get
+from json import loads
+from searx.engines.xpath import extract_text
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['images']
+paging = True
+language_support = True
+safesearch = True
+
+# search-url
+# images_url is filled in by request(): {query} is the urlencoded query
+# (plus the optional region code), {offset} the index of the first result,
+# {safesearch} the safe-search level and {vqd} the token from get_vqd().
+images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
+# site_url is the regular result page that get_vqd() fetches to read the token
+site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
+
+
+# run query in site to get vqd number needed for requesting images
+# TODO: find a way to get this number without an extra request (is it a hash of the query?)
+def get_vqd(query, timeout=3.0):
+    """Fetch the ``vqd`` token duckduckgo embeds in its result page.
+
+    :param query: search query, urlencoded into ``site_url``.
+    :param timeout: seconds to wait for the extra request; the default
+        matches the engine timeout configured in settings.yml, so a
+        stalled connection can no longer block the search indefinitely.
+    :returns: the text between ``vqd='`` and the next ``'``, or an empty
+        string when the page does not contain a complete token.
+    :raises requests.exceptions.RequestException: if the request fails
+        or times out.
+    """
+    res = get(site_url.format(query=urlencode({'q': query})), timeout=timeout)
+
+    # the token appears in the page source as vqd='<value>'
+    _, marker, tail = res.text.partition('vqd=\'')
+    if not marker:
+        # no token in the page: do not slice from an arbitrary offset
+        return ''
+
+    vqd, closing_quote, _ = tail.partition('\'')
+    # an unterminated token is as unusable as a missing one
+    return vqd if closing_quote else ''
+
+
+# do search-request
+def request(query, params):
+    """Build the image search url and store it in ``params['url']``.
+
+    Reads ``pageno``, ``safesearch`` and ``language`` from ``params`` and
+    returns the same ``params`` object with ``url`` set.
+    """
+    # to avoid running actual external requests when testing
+    vqd = '12345' if 'is_test' in params else get_vqd(query)
+
+    # 50 results per page
+    first_result = (params['pageno'] - 1) * 50
+    safesearch_level = params['safesearch'] - 1
+
+    # the region is only sent when the language maps to a region code
+    query_args = {'q': query}
+    region_code = get_region_code(params['language'])
+    if region_code:
+        query_args['l'] = region_code
+
+    params['url'] = images_url.format(
+        query=urlencode(query_args),
+        offset=first_result,
+        safesearch=safesearch_level,
+        vqd=vqd)
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Turn the JSON answer of the image search into searx results.
+
+    :param resp: response object; its ``text`` attribute holds the body.
+    :returns: a list of ``images.html`` result dicts. The list is empty
+        when the body is not JSON (e.g. a plain-text error message), is
+        not a JSON object, or has no ``results`` list.
+    :raises AttributeError: if ``resp`` has no ``text`` attribute.
+    """
+    content = resp.text
+    try:
+        res_json = loads(content)
+    except (ValueError, TypeError):
+        # only decoding failures mean "no results"; anything else propagates
+        return []
+
+    # valid JSON is not necessarily the expected object with a result list
+    if not isinstance(res_json, dict):
+        return []
+    entries = res_json.get('results')
+    if not isinstance(entries, list):
+        return []
+
+    # parse results
+    results = []
+    for result in entries:
+        try:
+            parsed = {'template': 'images.html',
+                      'title': result['title'],
+                      'content': '',
+                      'thumbnail_src': result['thumbnail'],
+                      'img_src': result['image'],
+                      'url': result['url']}
+        except (KeyError, TypeError):
+            # skip a malformed entry instead of losing the whole page
+            continue
+
+        # append result
+        results.append(parsed)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -167,6 +167,12 @@ engines:
shortcut : ddg
disabled : True
+ - name : duckduckgo images
+ engine : duckduckgo_images
+ shortcut : ddi
+ timeout: 3.0
+ disabled : True
+
- name : etymonline
engine : xpath
paging : True
diff --git a/tests/unit/engines/test_duckduckgo_images.py b/tests/unit/engines/test_duckduckgo_images.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import duckduckgo_images
+from searx.testing import SearxTestCase
+
+
+class TestDuckduckgoImagesEngine(SearxTestCase):
+    """Unit tests for the duckduckgo images engine."""
+
+    def test_request(self):
+        """The url carries the query, paging offset, safe search and vqd."""
+        query = 'test_query'
+        request_params = defaultdict(dict)
+        # 'is_test' keeps request() from doing the external vqd lookup
+        request_params.update({'is_test': True,
+                               'pageno': 1,
+                               'safesearch': 0,
+                               'language': 'all'})
+        params = duckduckgo_images.request(query, request_params)
+        self.assertIn('url', params)
+        for fragment in (query, 'duckduckgo.com', 's=0', 'p=-1', 'vqd=12345'):
+            self.assertIn(fragment, params['url'])
+
+        # test paging and safe search
+        request_params['pageno'] = 2
+        request_params['safesearch'] = 2
+        params = duckduckgo_images.request(query, request_params)
+        self.assertIn('url', params)
+        for fragment in (query, 's=50', 'p=1'):
+            self.assertIn(fragment, params['url'])
+
+    def test_response(self):
+        """Bad input raises, non-JSON gives no results, JSON is parsed."""
+        # objects without a text attribute
+        for not_a_response in (None, [], '', '[]'):
+            self.assertRaises(AttributeError, duckduckgo_images.response, not_a_response)
+
+        # a body that is not JSON yields an empty result list
+        error_page = mock.Mock(text='If this error persists, please let us know: ops@duckduckgo.com')
+        self.assertEqual(duckduckgo_images.response(error_page), [])
+
+        payload = u"""
+        {
+            "query": "test_query",
+            "results": [
+                {
+                    "title": "Result 1",
+                    "url": "https://site1.url",
+                    "thumbnail": "https://thumb1.nail",
+                    "image": "https://image1"
+                },
+                {
+                    "title": "Result 2",
+                    "url": "https://site2.url",
+                    "thumbnail": "https://thumb2.nail",
+                    "image": "https://image2"
+                }
+            ]
+        }
+        """
+        results = duckduckgo_images.response(mock.Mock(text=payload))
+        self.assertEqual(len(results), 2)
+
+        # (title, url, thumbnail_src, img_src) expected for each result, in order
+        expected_rows = (
+            ('Result 1', 'https://site1.url', 'https://thumb1.nail', 'https://image1'),
+            ('Result 2', 'https://site2.url', 'https://thumb2.nail', 'https://image2'),
+        )
+        for position, (title, url, thumbnail, image) in enumerate(expected_rows):
+            self.assertEqual(results[position]['title'], title)
+            self.assertEqual(results[position]['url'], url)
+            self.assertEqual(results[position]['thumbnail_src'], thumbnail)
+            self.assertEqual(results[position]['img_src'], image)