logo

searx

Unnamed repository; edit this file 'description' to name the repository.
commit: 51eafdd471fe989accc060f3ebd8efba0d9b6a71
parent: 142cd870950e25fe4bb8f4b995b861162af9777c
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Fri,  4 Nov 2016 19:38:21 +0100

Merge pull request #733 from dalf/searchpy

Search architecture

Diffstat:

searx/plugins/doai_rewrite.py | 8 ++++----
searx/plugins/https_rewrite.py | 3 +--
searx/plugins/self_info.py | 16 ++++++++--------
searx/plugins/tracker_url_remover.py | 10 +++++-----
searx/query.py | 20 ++++++++++++++++++--
searx/results.py | 17 ++++++++++++++---
searx/search.py | 278 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
searx/webapp.py | 96 +++++++++++++++++++++++++++++++++++++++----------------------------------------
tests/unit/test_plugins.py | 53 ++++++++++++++++++++++++++---------------------------
9 files changed, 274 insertions(+), 227 deletions(-)

diff --git a/searx/plugins/doai_rewrite.py b/searx/plugins/doai_rewrite.py @@ -20,12 +20,12 @@ def extract_doi(url): return None -def on_result(request, ctx): - doi = extract_doi(ctx['result']['parsed_url']) +def on_result(request, search, result): + doi = extract_doi(result['parsed_url']) if doi and len(doi) < 50: for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'): if doi.endswith(suffix): doi = doi[:-len(suffix)] - ctx['result']['url'] = 'http://doai.io/' + doi - ctx['result']['parsed_url'] = urlparse(ctx['result']['url']) + result['url'] = 'http://doai.io/' + doi + result['parsed_url'] = urlparse(ctx['result']['url']) return True diff --git a/searx/plugins/https_rewrite.py b/searx/plugins/https_rewrite.py @@ -220,8 +220,7 @@ def https_url_rewrite(result): return result -def on_result(request, ctx): - result = ctx['result'] +def on_result(request, search, result): if result['parsed_url'].scheme == 'http': https_url_rewrite(result) return True diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py @@ -28,19 +28,19 @@ p = re.compile('.*user[ -]agent.*', re.IGNORECASE) # attach callback to the post search hook # request: flask request object # ctx: the whole local context of the pre search hook -def post_search(request, ctx): - if ctx['search'].pageno > 1: +def post_search(request, search): + if search.search_query.pageno > 1: return True - if ctx['search'].query == 'ip': + if search.search_query.query == 'ip': x_forwarded_for = request.headers.getlist("X-Forwarded-For") if x_forwarded_for: ip = x_forwarded_for[0] else: ip = request.remote_addr - ctx['search'].result_container.answers.clear() - ctx['search'].result_container.answers.add(ip) - elif p.match(ctx['search'].query): + search.result_container.answers.clear() + search.result_container.answers.add(ip) + elif p.match(search.search_query.query): ua = request.user_agent - ctx['search'].result_container.answers.clear() - ctx['search'].result_container.answers.add(ua) + 
search.result_container.answers.clear() + search.result_container.answers.add(ua) return True diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py @@ -28,8 +28,8 @@ description = gettext('Remove trackers arguments from the returned URL') default_on = True -def on_result(request, ctx): - query = ctx['result']['parsed_url'].query +def on_result(request, search, result): + query = result['parsed_url'].query if query == "": return True @@ -37,8 +37,8 @@ def on_result(request, ctx): for reg in regexes: query = reg.sub('', query) - if query != ctx['result']['parsed_url'].query: - ctx['result']['parsed_url'] = ctx['result']['parsed_url']._replace(query=query) - ctx['result']['url'] = urlunparse(ctx['result']['parsed_url']) + if query != result['parsed_url'].query: + result['parsed_url'] = result['parsed_url']._replace(query=query) + result['url'] = urlunparse(result['parsed_url']) return True diff --git a/searx/query.py b/searx/query.py @@ -25,8 +25,8 @@ import string import re -class Query(object): - """parse query""" +class RawTextQuery(object): + """parse raw text query (the value from the html input)""" def __init__(self, query, disabled_engines): self.query = query @@ -130,3 +130,19 @@ class Query(object): def getFullQuery(self): # get full querry including whitespaces return string.join(self.query_parts, '') + + +class SearchQuery(object): + """container for all the search parameters (query, language, etc...)""" + + def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range): + self.query = query + self.engines = engines + self.categories = categories + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + + def __str__(self): + return str(self.query) + ";" + str(self.engines) diff --git a/searx/results.py b/searx/results.py @@ -128,6 +128,8 @@ class ResultContainer(object): self.suggestions = set() self.answers = set() self._number_of_results = [] + 
self._ordered = False + self.paging = False def extend(self, engine_name, results): for result in list(results): @@ -153,6 +155,9 @@ class ResultContainer(object): self.results[engine_name].extend(results) + if not self.paging and engines[engine_name].paging: + self.paging = True + for i, result in enumerate(results): try: result['url'] = result['url'].decode('utf-8') @@ -219,7 +224,7 @@ class ResultContainer(object): with RLock(): self._merged_results.append(result) - def get_ordered_results(self): + def order_results(self): for result in self._merged_results: score = result_score(result) result['score'] = score @@ -269,8 +274,14 @@ class ResultContainer(object): # update categoryIndex categoryPositions[category] = {'index': len(gresults), 'count': 8} - # return gresults - return gresults + # update _merged_results + self._ordered = True + self._merged_results = gresults + + def get_ordered_results(self): + if not self._ordered: + self.order_results() + return self._merged_results def results_length(self): return len(self._merged_results) diff --git a/searx/search.py b/searx/search.py @@ -25,9 +25,10 @@ from searx.engines import ( categories, engines ) from searx.utils import gen_useragent -from searx.query import Query +from searx.query import RawTextQuery, SearchQuery from searx.results import ResultContainer from searx import logger +from searx.plugins import plugins logger = logger.getChild('search') @@ -127,135 +128,130 @@ def make_callback(engine_name, callback, params, result_container): return process_callback -class Search(object): - - """Search information container""" - - def __init__(self, request): - # init vars - super(Search, self).__init__() - self.query = None - self.engines = [] - self.categories = [] - self.paging = False - self.pageno = 1 - self.lang = 'all' - self.time_range = None - self.is_advanced = None - - # set blocked engines - self.disabled_engines = request.preferences.engines.get_disabled() - - self.result_container = 
ResultContainer() - self.request_data = {} - - # set specific language if set - self.lang = request.preferences.get_value('language') - - # set request method - if request.method == 'POST': - self.request_data = request.form - else: - self.request_data = request.args - - # TODO better exceptions - if not self.request_data.get('q'): - raise Exception('noquery') - - # set pagenumber - pageno_param = self.request_data.get('pageno', '1') - if not pageno_param.isdigit() or int(pageno_param) < 1: - pageno_param = 1 - - self.pageno = int(pageno_param) - - # parse query, if tags are set, which change - # the serch engine or search-language - query_obj = Query(self.request_data['q'], self.disabled_engines) - query_obj.parse_query() - - # set query - self.query = query_obj.getSearchQuery() - - # get last selected language in query, if possible - # TODO support search with multible languages - if len(query_obj.languages): - self.lang = query_obj.languages[-1] - - self.time_range = self.request_data.get('time_range') - self.is_advanced = self.request_data.get('advanced_search') - - self.engines = query_obj.engines - - # if engines are calculated from query, - # set categories by using that informations - if self.engines and query_obj.specific: - self.categories = list(set(engine['category'] - for engine in self.engines)) - - # otherwise, using defined categories to - # calculate which engines should be used +def get_search_query_from_webapp(preferences, form): + query = None + query_engines = [] + query_categories = [] + query_paging = False + query_pageno = 1 + query_lang = 'all' + query_time_range = None + + # set blocked engines + disabled_engines = preferences.engines.get_disabled() + + # set specific language if set + query_lang = preferences.get_value('language') + + # safesearch + query_safesearch = preferences.get_value('safesearch') + + # TODO better exceptions + if not form.get('q'): + raise Exception('noquery') + + # set pagenumber + pageno_param = 
form.get('pageno', '1') + if not pageno_param.isdigit() or int(pageno_param) < 1: + pageno_param = 1 + + query_pageno = int(pageno_param) + + # parse query, if tags are set, which change + # the serch engine or search-language + raw_text_query = RawTextQuery(form['q'], disabled_engines) + raw_text_query.parse_query() + + # set query + query = raw_text_query.getSearchQuery() + + # get last selected language in query, if possible + # TODO support search with multible languages + if len(raw_text_query.languages): + query_lang = raw_text_query.languages[-1] + + query_time_range = form.get('time_range') + + query_engines = raw_text_query.engines + + # if engines are calculated from query, + # set categories by using that informations + if query_engines and raw_text_query.specific: + query_categories = list(set(engine['category'] + for engine in query_engines)) + + # otherwise, using defined categories to + # calculate which engines should be used + else: + # set categories/engines + load_default_categories = True + for pd_name, pd in form.items(): + if pd_name == 'categories': + query_categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories) + elif pd_name == 'engines': + pd_engines = [{'category': engines[engine].categories[0], + 'name': engine} + for engine in map(unicode.strip, pd.split(',')) if engine in engines] + if pd_engines: + query_engines.extend(pd_engines) + load_default_categories = False + elif pd_name.startswith('category_'): + category = pd_name[9:] + + # if category is not found in list, skip + if category not in categories: + continue + + if pd != 'off': + # add category to list + query_categories.append(category) + elif category in query_categories: + # remove category from list if property is set to 'off' + query_categories.remove(category) + + if not load_default_categories: + if not query_categories: + query_categories = list(set(engine['category'] + for engine in engines)) else: - # set categories/engines - 
load_default_categories = True - for pd_name, pd in self.request_data.items(): - if pd_name == 'categories': - self.categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories) - elif pd_name == 'engines': - pd_engines = [{'category': engines[engine].categories[0], - 'name': engine} - for engine in map(unicode.strip, pd.split(',')) if engine in engines] - if pd_engines: - self.engines.extend(pd_engines) - load_default_categories = False - elif pd_name.startswith('category_'): - category = pd_name[9:] - - # if category is not found in list, skip - if category not in categories: - continue - - if pd != 'off': - # add category to list - self.categories.append(category) - elif category in self.categories: - # remove category from list if property is set to 'off' - self.categories.remove(category) - - if not load_default_categories: - if not self.categories: - self.categories = list(set(engine['category'] - for engine in self.engines)) - return - # if no category is specified for this search, # using user-defined default-configuration which # (is stored in cookie) - if not self.categories: - cookie_categories = request.preferences.get_value('categories') + if not query_categories: + cookie_categories = preferences.get_value('categories') for ccateg in cookie_categories: if ccateg in categories: - self.categories.append(ccateg) + query_categories.append(ccateg) # if still no category is specified, using general # as default-category - if not self.categories: - self.categories = ['general'] + if not query_categories: + query_categories = ['general'] # using all engines for that search, which are # declared under the specific categories - for categ in self.categories: - self.engines.extend({'category': categ, - 'name': engine.name} - for engine in categories[categ] - if (engine.name, categ) not in self.disabled_engines) + for categ in query_categories: + query_engines.extend({'category': categ, + 'name': engine.name} + for engine in 
categories[categ] + if (engine.name, categ) not in disabled_engines) - # remove suspended engines - self.engines = [e for e in self.engines - if engines[e['name']].suspend_end_time <= time()] + return SearchQuery(query, query_engines, query_categories, + query_lang, query_safesearch, query_pageno, query_time_range) + + +class Search(object): + + """Search information container""" + + def __init__(self, search_query): + # init vars + super(Search, self).__init__() + self.search_query = search_query + self.result_container = ResultContainer() # do search-request - def search(self, request): + def search(self): global number_of_searches # init vars @@ -268,23 +264,30 @@ class Search(object): # user_agent = request.headers.get('User-Agent', '') user_agent = gen_useragent() + search_query = self.search_query + # start search-reqest for all selected engines - for selected_engine in self.engines: + for selected_engine in search_query.engines: if selected_engine['name'] not in engines: continue engine = engines[selected_engine['name']] + # skip suspended engines + if engine.suspend_end_time and engine.suspend_end_time <= time(): + continue + # if paging is not supported, skip - if self.pageno > 1 and not engine.paging: + if search_query.pageno > 1 and not engine.paging: continue # if search-language is set and engine does not # provide language-support, skip - if self.lang != 'all' and not engine.language_support: + if search_query.lang != 'all' and not engine.language_support: continue - if self.time_range and not engine.time_range_support: + # if time_range is not supported, skip + if search_query.time_range and not engine.time_range_support: continue # set default request parameters @@ -292,21 +295,20 @@ class Search(object): request_params['headers']['User-Agent'] = user_agent request_params['category'] = selected_engine['category'] request_params['started'] = time() - request_params['pageno'] = self.pageno + request_params['pageno'] = search_query.pageno if 
hasattr(engine, 'language') and engine.language: request_params['language'] = engine.language else: - request_params['language'] = self.lang + request_params['language'] = search_query.lang # 0 = None, 1 = Moderate, 2 = Strict - request_params['safesearch'] = request.preferences.get_value('safesearch') - request_params['time_range'] = self.time_range - request_params['advanced_search'] = self.is_advanced + request_params['safesearch'] = search_query.safesearch + request_params['time_range'] = search_query.time_range # update request parameters dependent on # search-engine (contained in engines folder) - engine.request(self.query.encode('utf-8'), request_params) + engine.request(search_query.query.encode('utf-8'), request_params) if request_params['url'] is None: # TODO add support of offline engines @@ -346,10 +348,32 @@ class Search(object): selected_engine['name'])) if not requests: - return self + return self.result_container # send all search-request threaded_requests(requests) start_new_thread(gc.collect, tuple()) # return results, suggestions, answers and infoboxes - return self + return self.result_container + + +class SearchWithPlugins(Search): + + """Similar to the Search class but call the plugins.""" + + def __init__(self, search_query, request): + super(SearchWithPlugins, self).__init__(search_query) + self.request = request + + def search(self): + if plugins.call('pre_search', self.request, self): + super(SearchWithPlugins, self).search() + + plugins.call('post_search', self.request, self) + + results = self.result_container.get_ordered_results() + + for result in results: + plugins.call('on_result', self.request, self, result) + + return self.result_container diff --git a/searx/webapp.py b/searx/webapp.py @@ -62,8 +62,8 @@ from searx.utils import ( ) from searx.version import VERSION_STRING from searx.languages import language_codes -from searx.search import Search -from searx.query import Query +from searx.search import Search, SearchWithPlugins, 
get_search_query_from_webapp +from searx.query import RawTextQuery, SearchQuery from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.plugins import plugins from searx.preferences import Preferences, ValidationException @@ -373,11 +373,13 @@ def pre_request(): logger.warning('Invalid config') request.preferences = preferences + # request.form request.form = dict(request.form.items()) for k, v in request.args.items(): if k not in request.form: request.form[k] = v + # request.user_plugins request.user_plugins = [] allowed_plugins = preferences.plugins.get_enabled() disabled_plugins = preferences.plugins.get_disabled() @@ -400,30 +402,33 @@ def index(): 'index.html', ) + # search + search_query = None + result_container = None try: - search = Search(request) + search_query = get_search_query_from_webapp(request.preferences, request.form) + # search = Search(search_query) # without plugins + search = SearchWithPlugins(search_query, request) + result_container = search.search() except: return render( 'index.html', ) - if plugins.call('pre_search', request, locals()): - search.search(request) + results = result_container.get_ordered_results() - plugins.call('post_search', request, locals()) - - results = search.result_container.get_ordered_results() + # UI + advanced_search = request.form.get('advanced_search', None) + output_format = request.form.get('format', 'html') + if output_format not in ['html', 'csv', 'json', 'rss']: + output_format = 'html' + # output for result in results: - - plugins.call('on_result', request, locals()) - if not search.paging and engines[result['engine']].paging: - search.paging = True - - if search.request_data.get('format', 'html') == 'html': + if output_format == 'html': if 'content' in result and result['content']: - result['content'] = highlight_content(result['content'][:1024], search.query.encode('utf-8')) - result['title'] = highlight_content(result['title'], search.query.encode('utf-8')) + 
result['content'] = highlight_content(result['content'][:1024], search_query.query.encode('utf-8')) + result['title'] = highlight_content(result['title'], search_query.query.encode('utf-8')) else: if result.get('content'): result['content'] = html_to_text(result['content']).strip() @@ -450,16 +455,16 @@ def index(): else: result['publishedDate'] = format_date(result['publishedDate']) - number_of_results = search.result_container.results_number() - if number_of_results < search.result_container.results_length(): + number_of_results = result_container.results_number() + if number_of_results < result_container.results_length(): number_of_results = 0 - if search.request_data.get('format') == 'json': - return Response(json.dumps({'query': search.query, + if output_format == 'json': + return Response(json.dumps({'query': search_query.query, 'number_of_results': number_of_results, 'results': results}), mimetype='application/json') - elif search.request_data.get('format') == 'csv': + elif output_format == 'csv': csv = UnicodeWriter(cStringIO.StringIO()) keys = ('title', 'url', 'content', 'host', 'engine', 'score') csv.writerow(keys) @@ -468,14 +473,14 @@ def index(): csv.writerow([row.get(key, '') for key in keys]) csv.stream.seek(0) response = Response(csv.stream.read(), mimetype='application/csv') - cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query.encode('utf-8')) + cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query.encode('utf-8')) response.headers.add('Content-Disposition', cont_disp) return response - elif search.request_data.get('format') == 'rss': + elif output_format == 'rss': response_rss = render( 'opensearch_response_rss.xml', results=results, - q=search.request_data['q'], + q=request.form['q'], number_of_results=number_of_results, base_url=get_base_url() ) @@ -484,17 +489,17 @@ def index(): return render( 'results.html', results=results, - q=search.request_data['q'], - selected_categories=search.categories, - 
paging=search.paging, + q=request.form['q'], + selected_categories=search_query.categories, + pageno=search_query.pageno, + time_range=search_query.time_range, number_of_results=format_decimal(number_of_results), - pageno=search.pageno, - advanced_search=search.is_advanced, - time_range=search.time_range, + advanced_search=advanced_search, + suggestions=result_container.suggestions, + answers=result_container.answers, + infoboxes=result_container.infoboxes, + paging=result_container.paging, base_url=get_base_url(), - suggestions=search.result_container.suggestions, - answers=search.result_container.answers, - infoboxes=search.result_container.infoboxes, theme=get_current_theme_name(), favicons=global_favicons[themes.index(get_current_theme_name())] ) @@ -511,30 +516,23 @@ def about(): @app.route('/autocompleter', methods=['GET', 'POST']) def autocompleter(): """Return autocompleter results""" - request_data = {} - - # select request method - if request.method == 'POST': - request_data = request.form - else: - request_data = request.args # set blocked engines disabled_engines = request.preferences.engines.get_disabled() # parse query - query = Query(request_data.get('q', '').encode('utf-8'), disabled_engines) - query.parse_query() + raw_text_query = RawTextQuery(request.form.get('q', '').encode('utf-8'), disabled_engines) + raw_text_query.parse_query() # check if search query is set - if not query.getSearchQuery(): + if not raw_text_query.getSearchQuery(): return '', 400 # run autocompleter completer = autocomplete_backends.get(request.preferences.get_value('autocomplete')) # parse searx specific autocompleter results like !bang - raw_results = searx_bang(query) + raw_results = searx_bang(raw_text_query) # normal autocompletion results only appear if max 3 inner results returned if len(raw_results) <= 3 and completer: @@ -545,19 +543,19 @@ def autocompleter(): else: language = language.split('_')[0] # run autocompletion - 
raw_results.extend(completer(query.getSearchQuery(), language)) + raw_results.extend(completer(raw_text_query.getSearchQuery(), language)) # parse results (write :language and !engine back to result string) results = [] for result in raw_results: - query.changeSearchQuery(result) + raw_text_query.changeSearchQuery(result) # add parsed result - results.append(query.getFullQuery()) + results.append(raw_text_query.getFullQuery()) # return autocompleter results - if request_data.get('format') == 'x-suggestions': - return Response(json.dumps([query.query, results]), + if request.form.get('format') == 'x-suggestions': + return Response(json.dumps([raw_text_query.query, results]), mimetype='application/json') return Response(json.dumps(results), diff --git a/tests/unit/test_plugins.py b/tests/unit/test_plugins.py @@ -6,9 +6,8 @@ from mock import Mock def get_search_mock(query, **kwargs): - return {'search': Mock(query=query, - result_container=Mock(answers=set()), - **kwargs)} + return Mock(search_query=Mock(query=query, **kwargs), + result_container=Mock(answers=set())) class PluginStoreTest(SearxTestCase): @@ -52,39 +51,39 @@ class SelfIPTest(SearxTestCase): request = Mock(user_plugins=store.plugins, remote_addr='127.0.0.1') request.headers.getlist.return_value = [] - ctx = get_search_mock(query='ip', pageno=1) - store.call('post_search', request, ctx) - self.assertTrue('127.0.0.1' in ctx['search'].result_container.answers) + search = get_search_mock(query='ip', pageno=1) + store.call('post_search', request, search) + self.assertTrue('127.0.0.1' in search.result_container.answers) - ctx = get_search_mock(query='ip', pageno=2) - store.call('post_search', request, ctx) - self.assertFalse('127.0.0.1' in ctx['search'].result_container.answers) + search = get_search_mock(query='ip', pageno=2) + store.call('post_search', request, search) + self.assertFalse('127.0.0.1' in search.result_container.answers) # User agent test request = Mock(user_plugins=store.plugins, 
user_agent='Mock') request.headers.getlist.return_value = [] - ctx = get_search_mock(query='user-agent', pageno=1) - store.call('post_search', request, ctx) - self.assertTrue('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='user-agent', pageno=1) + store.call('post_search', request, search) + self.assertTrue('Mock' in search.result_container.answers) - ctx = get_search_mock(query='user-agent', pageno=2) - store.call('post_search', request, ctx) - self.assertFalse('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='user-agent', pageno=2) + store.call('post_search', request, search) + self.assertFalse('Mock' in search.result_container.answers) - ctx = get_search_mock(query='user-agent', pageno=1) - store.call('post_search', request, ctx) - self.assertTrue('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='user-agent', pageno=1) + store.call('post_search', request, search) + self.assertTrue('Mock' in search.result_container.answers) - ctx = get_search_mock(query='user-agent', pageno=2) - store.call('post_search', request, ctx) - self.assertFalse('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='user-agent', pageno=2) + store.call('post_search', request, search) + self.assertFalse('Mock' in search.result_container.answers) - ctx = get_search_mock(query='What is my User-Agent?', pageno=1) - store.call('post_search', request, ctx) - self.assertTrue('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='What is my User-Agent?', pageno=1) + store.call('post_search', request, search) + self.assertTrue('Mock' in search.result_container.answers) - ctx = get_search_mock(query='What is my User-Agent?', pageno=2) - store.call('post_search', request, ctx) - self.assertFalse('Mock' in ctx['search'].result_container.answers) + search = get_search_mock(query='What is my User-Agent?', pageno=2) + store.call('post_search', 
request, search) + self.assertFalse('Mock' in search.result_container.answers)