logo

searx

Unnamed repository; edit this file 'description' to name the repository.
commit: 596c6b6c93a50d8d797e90ad5dd9b608599cd653
parent: 55abf07a4f80f74fbcfbeddaee4f8591216802cd
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Tue,  6 Sep 2016 17:01:50 +0200

Merge pull request #678 from potato/master

[engine] dictzone + mymemory.translated engine

Diffstat:

searx/engines/dictzone.py | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
searx/engines/translated.py | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
searx/settings.yml | 13+++++++++++++
searx/utils.py | 15+++++++++++++++
4 files changed, 167 insertions(+), 0 deletions(-)

diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
@@ -0,0 +1,70 @@
+"""
+ Dictzone
+
+ @website https://dictzone.com/
+ @provide-api no
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+import re
+from urlparse import urljoin
+from lxml import html
+from cgi import escape
+from searx.engines.xpath import extract_text
+from searx.utils import is_valid_lang
+
+categories = ['general']
+url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
+weight = 100
+
+parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
+results_xpath = './/table[@id="r"]/tr'
+
+
+def request(query, params):
+    m = parser_re.match(unicode(query, 'utf8'))
+    if not m:
+        return params
+
+    from_lang, to_lang, query = m.groups()
+
+    from_lang = is_valid_lang(from_lang)
+    to_lang = is_valid_lang(to_lang)
+
+    if not from_lang or not to_lang:
+        return params
+
+    params['url'] = url.format(from_lang=from_lang[2],
+                               to_lang=to_lang[2],
+                               query=query)
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for k, result in enumerate(dom.xpath(results_xpath)[1:]):
+        try:
+            from_result, to_results_raw = result.xpath('./td')
+        except:
+            continue
+
+        to_results = []
+        for to_result in to_results_raw.xpath('./p/a'):
+            t = to_result.text_content()
+            if t.strip():
+                to_results.append(to_result.text_content())
+
+        results.append({
+            'url': urljoin(resp.url, '?%d' % k),
+            'title': escape(from_result.text_content()),
+            'content': escape('; '.join(to_results))
+        })
+
+    return results
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
@@ -0,0 +1,69 @@
+"""
+ MyMemory Translated
+
+ @website https://mymemory.translated.net/
+ @provide-api yes (https://mymemory.translated.net/doc/spec.php)
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+import re
+from urlparse import urljoin
+from lxml import html
+from cgi import escape
+from searx.engines.xpath import extract_text
+from searx.utils import is_valid_lang
+
+categories = ['general']
+url = 'http://api.mymemory.translated.net/get?q={query}' \
+      '&langpair={from_lang}|{to_lang}{key}'
+web_url = 'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'
+weight = 100
+
+parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
+api_key = ''
+
+
+def request(query, params):
+    m = parser_re.match(unicode(query, 'utf8'))
+    if not m:
+        return params
+
+    from_lang, to_lang, query = m.groups()
+
+    from_lang = is_valid_lang(from_lang)
+    to_lang = is_valid_lang(to_lang)
+
+    if not from_lang or not to_lang:
+        return params
+
+    if api_key:
+        key_form = '&key=' + api_key
+    else:
+        key_form = ''
+    params['url'] = url.format(from_lang=from_lang[1],
+                               to_lang=to_lang[1],
+                               query=query,
+                               key=key_form)
+    params['query'] = query
+    params['from_lang'] = from_lang
+    params['to_lang'] = to_lang
+
+    return params
+
+
+def response(resp):
+    results = []
+    results.append({
+        'url': escape(web_url.format(
+            from_lang=resp.search_params['from_lang'][2],
+            to_lang=resp.search_params['to_lang'][2],
+            query=resp.search_params['query'])),
+        'title': escape('[{0}-{1}] {2}'.format(
+            resp.search_params['from_lang'][1],
+            resp.search_params['to_lang'][1],
+            resp.search_params['query'])),
+        'content': escape(resp.json()['responseData']['translatedText'])
+    })
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -495,6 +495,19 @@ engines:
     timeout: 6.0
     categories : science
 
+  - name : dictzone
+    engine : dictzone
+    shortcut : dc
+
+  - name : mymemory translated
+    engine : translated
+    shortcut : tl
+    timeout : 5.0
+    disabled : True
+    # You can use without an API key, but you are limited to 1000 words/day
+    # See : http://mymemory.translated.net/doc/usagelimits.php
+    # api_key : ''
+
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/
 #  - name : blekko images
 #    engine : blekko_images
diff --git a/searx/utils.py b/searx/utils.py
@@ -9,6 +9,7 @@ from HTMLParser import HTMLParser
 from random import choice
 
 from searx.version import VERSION_STRING
+from searx.languages import language_codes
 from searx import settings
 from searx import logger
 
@@ -255,3 +256,17 @@ def get_torrent_size(filesize, filesize_multiplier):
         filesize = None
 
     return filesize
+
+
+def is_valid_lang(lang):
+    is_abbr = (len(lang) == 2)
+    if is_abbr:
+        for l in language_codes:
+            if l[0][:2] == lang.lower():
+                return (True, l[0][:2], l[1].lower())
+        return False
+    else:
+        for l in language_codes:
+            if l[1].lower() == lang.lower():
+                return (True, l[0][:2], l[1].lower())
+        return False