logo

searx

My custom branch(es) of searx, a meta-search engine
commit: bd2db71fa6921a757ff5df559535092f45010652
parent: 090254feca463dcd2243c67971c7e24e4907739c
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Mon, 22 Sep 2014 21:40:40 +0200

Merge branch 'comments' of https://github.com/pointhi/searx

Conflicts:
	searx/search.py

Diffstat:

M searx/__init__.py | 20 ++++++++++++++++++++
M searx/autocomplete.py | 20 +++++++++++++++++++-
M searx/languages.py | 18 ++++++++++++++++++
M searx/search.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 174 insertions(+), 2 deletions(-)

diff --git a/searx/__init__.py b/searx/__init__.py @@ -1,3 +1,20 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + from os import environ from os.path import realpath, dirname, join, abspath try: @@ -10,11 +27,14 @@ except: searx_dir = abspath(dirname(__file__)) engine_dir = dirname(realpath(__file__)) +# if possible set path to settings using the enviroment variable SEARX_SETTINGS_PATH if 'SEARX_SETTINGS_PATH' in environ: settings_path = environ['SEARX_SETTINGS_PATH'] +# otherwise using default path else: settings_path = join(searx_dir, 'settings.yml') +# load settings with open(settings_path) as settings_yaml: settings = load(settings_yaml) diff --git a/searx/autocomplete.py b/searx/autocomplete.py @@ -1,3 +1,21 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. 
+ +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + + from lxml import etree from requests import get from json import loads @@ -22,7 +40,7 @@ def dbpedia(query): def duckduckgo(query): - # wikipedia autocompleter + # duckduckgo autocompleter url = 'https://ac.duckduckgo.com/ac/?{0}&type=list' resp = loads(get(url.format(urlencode(dict(q=query)))).text) diff --git a/searx/languages.py b/searx/languages.py @@ -1,3 +1,21 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + +# list of language codes language_codes = ( ("ar_XA", "Arabic", "Arabia"), ("bg_BG", "Bulgarian", "Bulgaria"), diff --git a/searx/search.py b/searx/search.py @@ -1,3 +1,20 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. 
+ +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + import grequests from itertools import izip_longest, chain from datetime import datetime @@ -9,45 +26,65 @@ from searx.engines import ( from searx.languages import language_codes from searx.utils import gen_useragent + number_of_searches = 0 +# get default reqest parameter def default_request_params(): return { 'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}} +# create a callback wrapper for the search engine results def make_callback(engine_name, results, suggestions, callback, params): + # creating a callback wrapper for the search engine results def process_callback(response, **kwargs): cb_res = [] response.search_params = params + + # update stats with current page-load-time engines[engine_name].stats['page_load_time'] += \ (datetime.now() - params['started']).total_seconds() + try: search_results = callback(response) except Exception, e: + # increase errors stats engines[engine_name].stats['errors'] += 1 results[engine_name] = cb_res + + # print engine name and specific error message print '[E] Error with engine "{0}":\n\t{1}'.format( engine_name, str(e)) return + for result in search_results: result['engine'] = engine_name + + # if it is a suggestion, add it to list of suggestions if 'suggestion' in result: # TODO type checks suggestions.add(result['suggestion']) continue + + # append result cb_res.append(result) + results[engine_name] = cb_res + return process_callback +# score results and remove duplications def score_results(results): + # calculate scoring parameters flat_res = filter( None, chain.from_iterable(izip_longest(*results.values()))) flat_len = len(flat_res) engines_len = len(results) + results = [] # pass 1: deduplication + scoring @@ -63,34 +100,53 @@ def score_results(results): res['engines'] = [res['engine']] weight = 1.0 + # get weight of this engine if possible if hasattr(engines[res['engine']], 'weight'): weight = float(engines[res['engine']].weight) + # 
calculate score for that engine score = int((flat_len - i) / engines_len) * weight + 1 + duplicated = False + # check for duplicates for new_res in results: + # remove / from the end of the url if required p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa + + # check if that result is a duplicate if res['host'] == new_res['host'] and\ unquote(p1) == unquote(p2) and\ res['parsed_url'].query == new_res['parsed_url'].query and\ res.get('template') == new_res.get('template'): duplicated = new_res break + + # merge duplicates together if duplicated: + # using content with more text if res.get('content') > duplicated.get('content'): duplicated['content'] = res['content'] + + # increase result-score duplicated['score'] += score + + # add engine to list of result-engines duplicated['engines'].append(res['engine']) + + # using https if possible if duplicated['parsed_url'].scheme == 'https': continue elif res['parsed_url'].scheme == 'https': duplicated['url'] = res['parsed_url'].geturl() duplicated['parsed_url'] = res['parsed_url'] + + # if there is no duplicate found, append result else: res['score'] = score results.append(res) + results = sorted(results, key=itemgetter('score'), reverse=True) # pass 2 : group results by category and template @@ -99,7 +155,7 @@ def score_results(results): for i, res in enumerate(results): # FIXME : handle more than one category per engine - category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template'] + category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template'] current = None if category not in categoryPositions else categoryPositions[category] @@ -134,6 +190,7 @@ class Search(object): """Search information container""" def __init__(self, request): + # init vars super(Search, 
self).__init__() self.query = None self.engines = [] @@ -141,18 +198,23 @@ class Search(object): self.paging = False self.pageno = 1 self.lang = 'all' + + # set blocked engines if request.cookies.get('blocked_engines'): self.blocked_engines = request.cookies['blocked_engines'].split(',') # noqa else: self.blocked_engines = [] + self.results = [] self.suggestions = [] self.request_data = {} + # set specific language if set if request.cookies.get('language')\ and request.cookies['language'] in (x[0] for x in language_codes): self.lang = request.cookies['language'] + # set request method if request.method == 'POST': self.request_data = request.form else: @@ -162,51 +224,72 @@ class Search(object): if not self.request_data.get('q'): raise Exception('noquery') + # set query self.query = self.request_data['q'] + # set pagenumber pageno_param = self.request_data.get('pageno', '1') if not pageno_param.isdigit() or int(pageno_param) < 1: raise Exception('wrong pagenumber') self.pageno = int(pageno_param) + # parse query, if tags are set, which change the serch engine or search-language self.parse_query() self.categories = [] + # if engines are calculated from query, set categories by using that informations if self.engines: self.categories = list(set(engine['category'] for engine in self.engines)) + + # otherwise, using defined categories to calculate which engines should be used else: + # set used categories for pd_name, pd in self.request_data.items(): if pd_name.startswith('category_'): category = pd_name[9:] + # if category is not found in list, skip if not category in categories: continue + + # add category to list self.categories.append(category) + + # if no category is specified for this search, using user-defined default-configuration which (is stored in cookie) if not self.categories: cookie_categories = request.cookies.get('categories', '') cookie_categories = cookie_categories.split(',') for ccateg in cookie_categories: if ccateg in categories: 
self.categories.append(ccateg) + + # if still no category is specified, using general as default-category if not self.categories: self.categories = ['general'] + # using all engines for that search, which are declared under the specific categories for categ in self.categories: self.engines.extend({'category': categ, 'name': x.name} for x in categories[categ] if not x.name in self.blocked_engines) + # parse query, if tags are set, which change the serch engine or search-language def parse_query(self): query_parts = self.query.split() modified = False + + # check if language-prefix is set if query_parts[0].startswith(':'): lang = query_parts[0][1:].lower() + # check if any language-code is equal with declared language-codes for lc in language_codes: lang_id, lang_name, country = map(str.lower, lc) + + # if correct language-code is found, set it as new search-language if lang == lang_id\ or lang_id.startswith(lang)\ or lang == lang_name\ @@ -215,56 +298,78 @@ class Search(object): modified = True break + # check if category/engine prefix is set elif query_parts[0].startswith('!'): prefix = query_parts[0][1:].replace('_', ' ') + # check if prefix is equal with engine shortcut if prefix in engine_shortcuts\ and not engine_shortcuts[prefix] in self.blocked_engines: modified = True self.engines.append({'category': 'none', 'name': engine_shortcuts[prefix]}) + + # check if prefix is equal with engine name elif prefix in engines\ and not prefix in self.blocked_engines: modified = True self.engines.append({'category': 'none', 'name': prefix}) + + # check if prefix is equal with categorie name elif prefix in categories: modified = True + # using all engines for that search, which are declared under that categorie name self.engines.extend({'category': prefix, 'name': engine.name} for engine in categories[prefix] if not engine in self.blocked_engines) + + # if language, category or engine were specificed in this query, search for more tags which does the same if modified: 
self.query = self.query.replace(query_parts[0], '', 1).strip() self.parse_query() + # do search-request def search(self, request): global number_of_searches + + # init vars requests = [] results = {} suggestions = set() + + # increase number of searches number_of_searches += 1 + + # set default useragent #user_agent = request.headers.get('User-Agent', '') user_agent = gen_useragent() + # start search-reqest for all selected engines for selected_engine in self.engines: if selected_engine['name'] not in engines: continue engine = engines[selected_engine['name']] + # if paging is not supported, skip if self.pageno > 1 and not engine.paging: continue + # if search-language is set and engine does not provide language-support, skip if self.lang != 'all' and not engine.language_support: continue + # set default request parameters request_params = default_request_params() request_params['headers']['User-Agent'] = user_agent request_params['category'] = selected_engine['category'] request_params['started'] = datetime.now() request_params['pageno'] = self.pageno request_params['language'] = self.lang + + # update request parameters dependent on search-engine (contained in engines folder) request_params = engine.request(self.query.encode('utf-8'), request_params) @@ -272,6 +377,7 @@ class Search(object): # TODO add support of offline engines pass + # create a callback wrapper for the search engine results callback = make_callback( selected_engine['name'], results, @@ -280,6 +386,7 @@ class Search(object): request_params ) + # create dictionary which contain all informations about the request request_args = dict( headers=request_params['headers'], hooks=dict(response=callback), @@ -287,6 +394,7 @@ class Search(object): timeout=engine.timeout ) + # specific type of request (GET or POST) if request_params['method'] == 'GET': req = grequests.get else: @@ -297,17 +405,25 @@ class Search(object): if not request_params['url']: continue + # append request to list 
requests.append(req(request_params['url'], **request_args)) + + # send all search-request grequests.map(requests) + + # update engine-specific stats for engine_name, engine_results in results.items(): engines[engine_name].stats['search_count'] += 1 engines[engine_name].stats['result_count'] += len(engine_results) + # score results and remove duplications results = score_results(results) + # update engine stats, using calculated score for result in results: for res_engine in result['engines']: engines[result['engine']]\ .stats['score_count'] += result['score'] + # return results and suggestions return results, suggestions