logo

searx

My custom branch(es) of searx, a meta-search engine
commit: 3126660be5e85a18ee386f49104d3bbb158a6386
parent: 379feb61f9a6de3604a91e46762b8f0f9717b08e
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Sun, 19 Aug 2018 13:29:06 +0200

Merge pull request #1374 from dadosch/master

[WIP] [engine] Duden.de (German dictionary) 

Diffstat:

A searx/engines/duden.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 5 +++++
A tests/unit/engines/test_duden.py | 41 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 122 insertions(+), 0 deletions(-)

# diff --git a/searx/engines/duden.py b/searx/engines/duden.py (new file in this commit)
"""
 Duden
 @website     https://www.duden.de
 @provide-api no
 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content
"""

from lxml import html, etree
import re
from searx.engines.xpath import extract_text
from searx.url_utils import quote
from searx import logger

categories = ['general']
paging = True
language_support = False

# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?page={offset}'


def request(query, params):
    '''pre-request callback

    Fill in the URL of the Duden search page for ``query``.

    params<dict>:
      method  : POST/GET
      headers : {}
      data    : {} # if method == POST
      url     : ''
      category: 'search category'
      pageno  : 1 # number of the requested page

    Returns the same ``params`` dict with ``params['url']`` set.
    '''

    # pageno starts at 1 (see above); the URL's page parameter receives pageno - 1
    offset = params['pageno'] - 1
    params['url'] = search_url.format(offset=offset, query=quote(query))
    return params


def response(resp):
    '''post-response callback

    Parse the HTML of a Duden search result page.

    resp: requests response object; only ``resp.text`` is read

    Returns a list of dicts. If the number of hits could be read from the
    active search tab, the first entry is ``{'number_of_results': <int>}``.
    Every other entry describes one hit with the keys ``url``, ``title``
    and ``content``. Hits that cannot be parsed are logged and skipped.
    '''
    results = []

    dom = html.fromstring(resp.text)

    # The hit counter is optional: a missing tab (IndexError) or a label
    # without any digit (ValueError) just means no count is reported.
    try:
        counter_label = dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        number_of_results_string = re.sub('[^0-9]', '', counter_label)
        results.append({'number_of_results': int(number_of_results_string)})
    except (IndexError, ValueError):
        logger.debug("Couldn't read number of results.")

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        # Parsing stays best-effort per hit (the HTML can change), but only
        # ordinary errors are swallowed: KeyboardInterrupt / SystemExit now
        # propagate instead of being silently eaten by a bare except.
        try:
            logger.debug("running for %s", result)
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
        except Exception:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    return results


# diff --git a/searx/settings.yml b/searx/settings.yml @@ -714,6 +714,11 (hunk follows)
# searx/settings.yml hunk of this commit (@@ engines:), kept here for reference:
#
#        shortcut : 1337x
#        disabled : True
#
#  +  - name : Duden
#  +    engine : duden
#  +    shortcut : du
#  +    disabled : True
#  +
#   #  - name : yacy
#   #    engine : yacy
#   #    shortcut : ya
#
# diff --git a/tests/unit/engines/test_duden.py b/tests/unit/engines/test_duden.py (new file in this commit)
from collections import defaultdict
import mock
from searx.engines import duden
from searx.testing import SearxTestCase
from datetime import datetime  # NOTE(review): not used by the tests below


class TestDudenEngine(SearxTestCase):
    """Unit tests for the request/response callbacks of the Duden engine."""

    def test_request(self):
        """The built URL points at duden.de and carries query and page offset."""
        query = 'Haus'
        dic = defaultdict(dict)
        dic['pageno'] = 1
        params = duden.request(query, dic)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('duden.de', params['url'])
        # the engine sends pageno - 1 as the page parameter
        self.assertIn('page=0', params['url'])

        dic['pageno'] = 3
        params = duden.request(query, dic)
        self.assertIn('page=2', params['url'])

    def test_response(self):
        """An empty page yields no results; one section yields one result."""
        resp = mock.Mock(text='<html></html>')
        self.assertEqual(duden.response(resp), [])

        page = """
        <section class="wide">
        <h2><a href="https://this.is.the.url/" class="hidden-link"><strong>This is the title</strong> also here</a></h2>
        <p>This is the <strong>content</strong></p>
        <a href="https://this.is.the.url/">Zum vollst&auml;ndigen Artikel</a>
        </section>
        """

        resp = mock.Mock(text=page)
        results = duden.response(resp)

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)

        # testing result (dictionary entry)
        r = results[0]
        self.assertEqual(r['url'], 'https://this.is.the.url/')
        self.assertEqual(r['title'], 'This is the title also here')
        self.assertEqual(r['content'], 'This is the content')

    def test_response_number_of_results(self):
        """The digits of the active search tab label are reported first."""
        page = """
        <div>
        <a class="active" href="/suchen/dudenonline/Haus"><span>Suchtreffer (123)</span></a>
        <section class="wide">
        <h2><a href="https://this.is.the.url/">Title</a></h2>
        <p>Content</p>
        </section>
        </div>
        """

        results = duden.response(mock.Mock(text=page))

        self.assertEqual(len(results), 2)
        self.assertEqual(results[0], {'number_of_results': 123})
        self.assertEqual(results[1]['url'], 'https://this.is.the.url/')