logo

searx

My custom branch(es) on searx, a meta-search engine
commit: c65a409f0d2728ba5a0c3ffa1a0cb05659033a71
parent: e60e98156f9aeeb40ca7272c883704c095d9f7d4
Author: marc <a01200356@itesm.mx>
Date:   Sat, 20 May 2017 22:33:08 -0500

add duckduckgo images engine

Diffstat:

M searx/engines/duckduckgo.py | 65 +++++++++++++++++++++++++++++++++++------------------------------
A searx/engines/duckduckgo_images.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 6 ++++++
A tests/unit/engines/test_duckduckgo_images.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 204 insertions(+), 30 deletions(-)

diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py @@ -41,46 +41,51 @@ title_xpath = './/a[@class="result__a"]' content_xpath = './/a[@class="result__snippet"]' -# do search-request -def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: - return params - - offset = 30 + (params['pageno'] - 1) * 50 - dc_param = offset + 1 - +# match query's language to a region code that duckduckgo will accept +def get_region_code(lang): # custom fixes for languages - if params['language'] == 'all': - locale = None - elif params['language'][:2] == 'ja': - locale = 'jp-jp' - elif params['language'][:2] == 'sl': - locale = 'sl-sl' - elif params['language'] == 'zh-TW': - locale = 'tw-tzh' - elif params['language'] == 'zh-HK': - locale = 'hk-tzh' - elif params['language'][-2:] == 'SA': - locale = 'xa-' + params['language'].split('-')[0] - elif params['language'][-2:] == 'GB': - locale = 'uk-' + params['language'].split('-')[0] + if lang == 'all': + region_code = None + elif lang[:2] == 'ja': + region_code = 'jp-jp' + elif lang[:2] == 'sl': + region_code = 'sl-sl' + elif lang == 'zh-TW': + region_code = 'tw-tzh' + elif lang == 'zh-HK': + region_code = 'hk-tzh' + elif lang[-2:] == 'SA': + region_code = 'xa-' + lang.split('-')[0] + elif lang[-2:] == 'GB': + region_code = 'uk-' + lang.split('-')[0] else: - locale = params['language'].split('-') - if len(locale) == 2: + region_code = lang.split('-') + if len(region_code) == 2: # country code goes first - locale = locale[1].lower() + '-' + locale[0].lower() + region_code = region_code[1].lower() + '-' + region_code[0].lower() else: # tries to get a country code from language - locale = locale[0].lower() + region_code = region_code[0].lower() for lc in supported_languages: lc = lc.split('-') - if locale == lc[0]: - locale = lc[1].lower() + '-' + lc[0].lower() + if region_code == lc[0]: + region_code = lc[1].lower() + '-' + lc[0].lower() break + return region_code + + +# do 
search-request +def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + + offset = 30 + (params['pageno'] - 1) * 50 + dc_param = offset + 1 - if locale: + region_code = get_region_code(params['language']) + if region_code: params['url'] = url.format( - query=urlencode({'q': query, 'kl': locale}), offset=offset, dc_param=dc_param) + query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=dc_param) else: params['url'] = url.format( query=urlencode({'q': query}), offset=offset, dc_param=dc_param) diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py @@ -0,0 +1,91 @@ +""" + DuckDuckGo (Images) + + @website https://duckduckgo.com/ + @provide-api yes (https://duckduckgo.com/api), + but images are not supported + + @using-api no + @results JSON (site requires js to get images) + @stable no (JSON can change) + @parse url, title, img_src + + @todo avoid extra request +""" + +from requests import get +from json import loads +from searx.engines.xpath import extract_text +from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code +from searx.url_utils import urlencode + +# engine dependent config +categories = ['images'] +paging = True +language_support = True +safesearch = True + +# search-url +images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' +site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' + + +# run query in site to get vqd number needed for requesting images +# TODO: find a way to get this number without an extra request (is it a hash of the query?) 
+def get_vqd(query): + res = get(site_url.format(query=urlencode({'q': query}))) + content = res.text + vqd = content[content.find('vqd=\'') + 5:] + vqd = vqd[:vqd.find('\'')] + return vqd + + +# do search-request +def request(query, params): + # to avoid running actual external requests when testing + if 'is_test' not in params: + vqd = get_vqd(query) + else: + vqd = '12345' + + offset = (params['pageno'] - 1) * 50 + + safesearch = params['safesearch'] - 1 + + region_code = get_region_code(params['language']) + if region_code: + params['url'] = images_url.format( + query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd) + else: + params['url'] = images_url.format( + query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) + + return params + + +# get response from search-request +def response(resp): + results = [] + + content = resp.text + try: + res_json = loads(content) + except: + return [] + + # parse results + for result in res_json['results']: + title = result['title'] + url = result['url'] + thumbnail = result['thumbnail'] + image = result['image'] + + # append result + results.append({'template': 'images.html', + 'title': title, + 'content': '', + 'thumbnail_src': thumbnail, + 'img_src': image, + 'url': url}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -167,6 +167,12 @@ engines: shortcut : ddg disabled : True + - name : duckduckgo images + engine : duckduckgo_images + shortcut : ddi + timeout: 3.0 + disabled : True + - name : etymonline engine : xpath paging : True diff --git a/tests/unit/engines/test_duckduckgo_images.py b/tests/unit/engines/test_duckduckgo_images.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import duckduckgo_images +from searx.testing import SearxTestCase + + +class TestDuckduckgoImagesEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = 
defaultdict(dict) + dicto['is_test'] = True + dicto['pageno'] = 1 + dicto['safesearch'] = 0 + dicto['language'] = 'all' + params = duckduckgo_images.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('duckduckgo.com', params['url']) + self.assertIn('s=0', params['url']) + self.assertIn('p=-1', params['url']) + self.assertIn('vqd=12345', params['url']) + + # test paging and safe search + dicto['pageno'] = 2 + dicto['safesearch'] = 2 + params = duckduckgo_images.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('s=50', params['url']) + self.assertIn('p=1', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, duckduckgo_images.response, None) + self.assertRaises(AttributeError, duckduckgo_images.response, []) + self.assertRaises(AttributeError, duckduckgo_images.response, '') + self.assertRaises(AttributeError, duckduckgo_images.response, '[]') + + response = mock.Mock(text='If this error persists, please let us know: ops@duckduckgo.com') + self.assertEqual(duckduckgo_images.response(response), []) + + json = u""" + { + "query": "test_query", + "results": [ + { + "title": "Result 1", + "url": "https://site1.url", + "thumbnail": "https://thumb1.nail", + "image": "https://image1" + }, + { + "title": "Result 2", + "url": "https://site2.url", + "thumbnail": "https://thumb2.nail", + "image": "https://image2" + } + ] + } + """ + response = mock.Mock(text=json) + results = duckduckgo_images.response(response) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], 'Result 1') + self.assertEqual(results[0]['url'], 'https://site1.url') + self.assertEqual(results[0]['thumbnail_src'], 'https://thumb1.nail') + self.assertEqual(results[0]['img_src'], 'https://image1') + self.assertEqual(results[1]['title'], 'Result 2') + self.assertEqual(results[1]['url'], 'https://site2.url') + self.assertEqual(results[1]['thumbnail_src'], 
'https://thumb2.nail') + self.assertEqual(results[1]['img_src'], 'https://image2')