logo

searx

My custom branch(es) of searx, a meta-search engine
commit: 98e6b4d830ef8101bfdf06e304b98f0125b8ce35
parent: c26041e951d57c44cb5115a78c8fbf6541e48e13
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Tue,  8 Aug 2017 15:45:17 +0200

Merge pull request #988 from a01200356/bing

New engine: Bing videos

Diffstat:

A searx/engines/bing_videos.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 4 ++++
A tests/unit/engines/test_bing_videos.py | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 231 insertions(+), 0 deletions(-)

diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
@@ -0,0 +1,96 @@
+"""
+ Bing (Videos)
+
+ @website     https://www.bing.com/videos
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content, thumbnail
+"""
+
+from json import loads
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+
+categories = ['videos']
+paging = True
+safesearch = True
+time_range_support = True
+number_of_results = 10
+
+search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
+    'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
+time_range_string = '&qft=+filterui:videoage-lt{interval}'
+time_range_dict = {'day': '1440',
+                   'week': '10080',
+                   'month': '43200',
+                   'year': '525600'}
+
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+                    1: 'DEMOTE',
+                    0: 'OFF'}
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # safesearch cookie
+    params['cookies']['SRCHHPGUSR'] = \
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    # language cookie
+    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
+
+    # query and paging
+    params['url'] = search_url.format(query=urlencode({'q': query}),
+                                      offset=offset,
+                                      number_of_results=number_of_results)
+
+    # time range
+    if params['time_range'] in time_range_dict:
+        params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath('//div[@class="dg_u"]'):
+
+        # try to extract the url
+        url_container = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
+        if len(url_container) > 0:
+            url = loads(url_container[0])['purl']
+        else:
+            url = result.xpath('./a/@href')[0]
+
+            # discard results that do not return an external url
+            # very recent results sometimes don't return the video's url
+            if url.startswith('/videos/search?'):
+                continue
+
+        title = extract_text(result.xpath('./a//div[@class="tl"]'))
+        content = extract_text(result.xpath('.//div[@class="pubInfo"]'))
+        thumbnail = result.xpath('.//div[@class="vthumb"]/img/@src')[0]
+
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail,
+                        'template': 'videos.html'})
+
+        # first page ignores requested number of results
+        if len(results) >= number_of_results:
+            break
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -81,6 +81,10 @@ engines:
     engine : bing_news
     shortcut : bin
 
+  - name : bing videos
+    engine : bing_videos
+    shortcut : biv
+
   - name : bitbucket
     engine : xpath
     paging : True
diff --git a/tests/unit/engines/test_bing_videos.py b/tests/unit/engines/test_bing_videos.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import bing_videos
+from searx.testing import SearxTestCase
+
+
+class TestBingVideosEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        dicto['language'] = 'fr-FR'
+        dicto['safesearch'] = 0
+        dicto['time_range'] = ''
+        params = bing_videos.request(query, dicto)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('bing.com' in params['url'])
+        self.assertTrue('SRCHHPGUSR' in params['cookies'])
+        self.assertTrue('OFF' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
+
+        dicto['pageno'] = 2
+        dicto['time_range'] = 'day'
+        dicto['safesearch'] = 2
+        params = bing_videos.request(query, dicto)
+        self.assertTrue('first=11' in params['url'])
+        self.assertTrue('1440' in params['url'])
+        self.assertIn('SRCHHPGUSR', params['cookies'])
+        self.assertTrue('STRICT' in params['cookies']['SRCHHPGUSR'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, bing_videos.response, None)
+        self.assertRaises(AttributeError, bing_videos.response, [])
+        self.assertRaises(AttributeError, bing_videos.response, '')
+        self.assertRaises(AttributeError, bing_videos.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(bing_videos.response(response), [])
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(bing_videos.response(response), [])
+
+        html = """
+        <div>
+            <div class="dg_u">
+                <a class="dv_i" href="/videos/search?abcde">
+                    <div class="vthblock">
+                        <div class="vthumb">
+                            <img src="thumb_1.jpg" />
+                        </div>
+                        <div>
+                            <div class="tl">
+                                Title 1
+                            </div>
+                        </div>
+                    </div>
+                    <div class="videoInfoPanel">
+                        <div class="pubInfo">
+                            <div>Content 1</div>
+                        </div>
+                    </div>
+                </a>
+                <div class="sa_wrapper"
+                    data-eventpayload="{&quot;purl&quot;: &quot;https://url.com/1&quot;}">
+                </div>
+            </div>
+        </div>
+        """
+        response = mock.Mock(text=html)
+        results = bing_videos.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['title'], 'Title 1')
+        self.assertEqual(results[0]['url'], 'https://url.com/1')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')
+
+        html = """
+        <div>
+            <div class="dg_u">
+                <a class="dv_i" href="https://url.com/1">
+                    <div class="vthblock">
+                        <div class="vthumb">
+                            <img src="thumb_1.jpg" />
+                        </div>
+                        <div>
+                            <div class="tl">
+                                Title 1
+                            </div>
+                        </div>
+                    </div>
+                    <div class="videoInfoPanel">
+                        <div class="pubInfo">
+                            <div>Content 1</div>
+                        </div>
+                    </div>
+                </a>
+            </div>
+            <div class="dg_u">
+                <a class="dv_i" href="/videos/search?abcde">
+                    <div class="vthblock">
+                        <div class="vthumb">
+                            <img src="thumb_2.jpg" />
+                        </div>
+                        <div>
+                            <div class="tl">
+                                Title 2
+                            </div>
+                        </div>
+                    </div>
+                    <div class="videoInfoPanel">
+                        <div class="pubInfo">
+                            <div>Content 2</div>
+                        </div>
+                    </div>
+                </a>
+            </div>
+        </div>
+        """
+        response = mock.Mock(text=html)
+        results = bing_videos.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['title'], 'Title 1')
+        self.assertEqual(results[0]['url'], 'https://url.com/1')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')