commit: ceb8ae6439f092dad6e26406cd178c2195eb0898
parent: d80fb2c8e8995facb3a25c152c47a93eecf1fee4
Author: Adam Tauber <asciimoo@gmail.com>
Date: Sat, 10 Dec 2016 00:06:39 +0100
Merge pull request #751 from dalf/searchpy2
Simplify search.py
Diffstat:
3 files changed, 154 insertions(+), 131 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
@@ -90,6 +90,9 @@ def load_engine(engine_data):
'result_count': 0,
'search_count': 0,
'page_load_time': 0,
+ 'page_load_count': 0,
+ 'engine_time': 0,
+ 'engine_time_count': 0,
'score_count': 0,
'errors': 0
}
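The three new counters let the stats page separate pure HTTP time (page_load_time / page_load_count) from total engine time including parsing (engine_time / engine_time_count). A minimal sketch of how these averages are derived, with made-up values:

    # illustrative values only; the real dict lives in engine.stats
    stats = {'page_load_time': 2.4, 'page_load_count': 3,
             'engine_time': 3.1, 'engine_time_count': 3}
    avg_http = stats['page_load_time'] / float(stats['page_load_count']) if stats['page_load_count'] else 0
    avg_total = stats['engine_time'] / float(stats['engine_time_count']) if stats['engine_time_count'] else 0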
@@ -106,32 +109,56 @@ def load_engine(engine_data):
return engine
+def to_percentage(stats, maxvalue):
+ for engine_stat in stats:
+ if maxvalue:
+ engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100)
+ else:
+ engine_stat['percentage'] = 0
+ return stats
+
+
def get_engines_stats():
# TODO refactor
pageloads = []
+ engine_times = []
results = []
scores = []
errors = []
scores_per_result = []
- max_pageload = max_results = max_score = max_errors = max_score_per_result = 0 # noqa
+ max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa
for engine in engines.values():
if engine.stats['search_count'] == 0:
continue
results_num = \
engine.stats['result_count'] / float(engine.stats['search_count'])
- load_times = engine.stats['page_load_time'] / float(engine.stats['search_count']) # noqa
+
+ if engine.stats['page_load_count'] != 0:
+ load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
+ else:
+ load_times = 0
+
+ if engine.stats['engine_time_count'] != 0:
+ this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa
+ else:
+ this_engine_time = 0
+
if results_num:
score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa
score_per_result = score / results_num
else:
score = score_per_result = 0.0
- max_results = max(results_num, max_results)
+
max_pageload = max(load_times, max_pageload)
+ max_engine_times = max(this_engine_time, max_engine_times)
+ max_results = max(results_num, max_results)
max_score = max(score, max_score)
max_score_per_result = max(score_per_result, max_score_per_result)
max_errors = max(max_errors, engine.stats['errors'])
+
pageloads.append({'avg': load_times, 'name': engine.name})
+ engine_times.append({'avg': this_engine_time, 'name': engine.name})
results.append({'avg': results_num, 'name': engine.name})
scores.append({'avg': score, 'name': engine.name})
errors.append({'avg': engine.stats['errors'], 'name': engine.name})
@@ -140,39 +167,19 @@ def get_engines_stats():
'name': engine.name
})
- for engine in pageloads:
- if max_pageload:
- engine['percentage'] = int(engine['avg'] / max_pageload * 100)
- else:
- engine['percentage'] = 0
-
- for engine in results:
- if max_results:
- engine['percentage'] = int(engine['avg'] / max_results * 100)
- else:
- engine['percentage'] = 0
-
- for engine in scores:
- if max_score:
- engine['percentage'] = int(engine['avg'] / max_score * 100)
- else:
- engine['percentage'] = 0
-
- for engine in scores_per_result:
- if max_score_per_result:
- engine['percentage'] = int(engine['avg']
- / max_score_per_result * 100)
- else:
- engine['percentage'] = 0
-
- for engine in errors:
- if max_errors:
- engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
- else:
- engine['percentage'] = 0
+ pageloads = to_percentage(pageloads, max_pageload)
+ engine_times = to_percentage(engine_times, max_engine_times)
+ results = to_percentage(results, max_results)
+ scores = to_percentage(scores, max_score)
+ scores_per_result = to_percentage(scores_per_result, max_score_per_result)
+ errors = to_percentage(errors, max_errors)
return [
(
+ gettext('Engine time (sec)'),
+ sorted(engine_times, key=itemgetter('avg'))
+ ),
+ (
gettext('Page loads (sec)'),
sorted(pageloads, key=itemgetter('avg'))
),
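The five copy-pasted percentage loops collapse into to_percentage(), which sets a 'percentage' key on each stats dict in place (and returns the same list). A rough usage sketch with invented numbers:

    pageloads = [{'avg': 0.8, 'name': 'google'}, {'avg': 1.6, 'name': 'wikipedia'}]
    pageloads = to_percentage(pageloads, max(e['avg'] for e in pageloads))
    # -> 'percentage' becomes 50 for google and 100 for wikipedia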
diff --git a/searx/search.py b/searx/search.py
@@ -36,14 +36,53 @@ logger = logger.getChild('search')
number_of_searches = 0
-def search_request_wrapper(fn, url, engine_name, **kwargs):
- ret = None
- engine = engines[engine_name]
+def send_http_request(engine, request_params, timeout_limit):
+ response = None
try:
- ret = fn(url, **kwargs)
+ # create a dictionary containing all
+ # information about the request
+ request_args = dict(
+ headers=request_params['headers'],
+ cookies=request_params['cookies'],
+ timeout=timeout_limit,
+ verify=request_params['verify']
+ )
+ # specific type of request (GET or POST)
+ if request_params['method'] == 'GET':
+ req = requests_lib.get
+ else:
+ req = requests_lib.post
+ request_args['data'] = request_params['data']
+
+ # for page_load_time stats
+ time_before_request = time()
+
+ # send the request
+ response = req(request_params['url'], **request_args)
+
with threading.RLock():
+ # no error : reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
+ # update stats with the current page-load time
+ # (this covers only the HTTP request, not parsing)
+ engine.stats['page_load_time'] += time() - time_before_request
+ engine.stats['page_load_count'] += 1
+
+ # did the request exceed the timeout? (skip parsing in that case)
+ timeout_overhead = 0.2 # seconds
+ search_duration = time() - request_params['started']
+ if search_duration > timeout_limit + timeout_overhead:
+ logger.error('engine timeout on HTTP request: '
+ '{0} (search duration: {1} s, timeout: {2} s)'
+ .format(engine.name, search_duration, timeout_limit))
+ with threading.RLock():
+ engine.stats['errors'] += 1
+ return False
+
+ # everything is ok : return the response
+ return response
+
except:
# increase errors stats
with threading.RLock():
@@ -52,20 +91,62 @@ def search_request_wrapper(fn, url, engine_name, **kwargs):
engine.suspend_end_time = time() + min(60, engine.continuous_errors)
# print engine name and specific error message
- logger.exception('engine crash: {0}'.format(engine_name))
- return ret
+ logger.exception('engine crash: {0}'.format(engine.name))
+ return False
+
+
+def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
+ engine = engines[engine_name]
+
+ # update request parameters depending on the
+ # search engine (defined in the engines folder)
+ engine.request(query, request_params)
+
+ # TODO add support of offline engines
+ if request_params['url'] is None:
+ return False
+
+ # ignoring empty urls
+ if not request_params['url']:
+ return False
+
+ # send request
+ response = send_http_request(engine, request_params, timeout_limit)
+
+ # parse response
+ success = None
+ if response:
+ # parse the response
+ response.search_params = request_params
+ search_results = engine.response(response)
+
+ # add results
+ for result in search_results:
+ result['engine'] = engine.name
+
+ result_container.extend(engine.name, search_results)
+
+ success = True
+ else:
+ success = False
+
+ with threading.RLock():
+ # update stats: total engine time (measured from the start of the search)
+ engine.stats['engine_time'] += time() - request_params['started']
+ engine.stats['engine_time_count'] += 1
+ #
+ return success
-def threaded_requests(requests):
- timeout_limit = max(r[2]['timeout'] for r in requests)
- search_start = time()
+
+def search_multiple_requests(requests, result_container, timeout_limit):
+ start_time = time()
search_id = uuid4().__str__()
- for fn, url, request_args, engine_name in requests:
- request_args['timeout'] = timeout_limit
+
+ for engine_name, query, request_params in requests:
th = threading.Thread(
- target=search_request_wrapper,
- args=(fn, url, engine_name),
- kwargs=request_args,
+ target=search_one_request,
+ args=(engine_name, query, request_params, result_container, timeout_limit),
name=search_id,
)
th._engine_name = engine_name
@@ -73,7 +154,7 @@ def threaded_requests(requests):
for th in threading.enumerate():
if th.name == search_id:
- remaining_time = max(0.0, timeout_limit - (time() - search_start))
+ remaining_time = max(0.0, timeout_limit - (time() - start_time))
th.join(remaining_time)
if th.isAlive():
logger.warning('engine timeout: {0}'.format(th._engine_name))
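search_multiple_requests keeps the old threading model: one thread per engine, all named with the same search_id, and the main thread joins each one with whatever is left of the shared timeout budget. A stripped-down sketch of that pattern, not the actual searx code:

    import threading
    from time import time
    from uuid import uuid4

    def run_with_budget(jobs, timeout_limit):
        start_time = time()
        search_id = str(uuid4())
        for job in jobs:
            threading.Thread(target=job, name=search_id).start()
        for th in threading.enumerate():
            if th.name == search_id:
                # each join consumes part of the remaining shared budget
                th.join(max(0.0, timeout_limit - (time() - start_time)))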
@@ -91,44 +172,6 @@ def default_request_params():
}
-# create a callback wrapper for the search engine results
-def make_callback(engine_name, callback, params, result_container):
-
- # creating a callback wrapper for the search engine results
- def process_callback(response, **kwargs):
- # check if redirect comparing to the True value,
- # because resp can be a Mock object, and any attribut name returns something.
- if response.is_redirect is True:
- logger.debug('{0} redirect on: {1}'.format(engine_name, response))
- return
-
- response.search_params = params
-
- search_duration = time() - params['started']
- # update stats with current page-load-time
- with threading.RLock():
- engines[engine_name].stats['page_load_time'] += search_duration
-
- timeout_overhead = 0.2 # seconds
- timeout_limit = engines[engine_name].timeout + timeout_overhead
-
- if search_duration > timeout_limit:
- with threading.RLock():
- engines[engine_name].stats['errors'] += 1
- return
-
- # callback
- search_results = callback(response)
-
- # add results
- for result in search_results:
- result['engine'] = engine_name
-
- result_container.extend(engine_name, search_results)
-
- return process_callback
-
-
def get_search_query_from_webapp(preferences, form):
query = None
query_engines = []
@@ -255,6 +298,10 @@ class Search(object):
def search(self):
global number_of_searches
+ # start time
+ start_time = time()
+
+ # answers?
answerers_results = ask(self.search_query)
if answerers_results:
@@ -274,6 +321,9 @@ class Search(object):
search_query = self.search_query
+ # maximum timeout among all selected engines
+ timeout_limit = 0
+
# start search-request for all selected engines
for selected_engine in search_query.engines:
if selected_engine['name'] not in engines:
@@ -303,7 +353,7 @@ class Search(object):
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
- request_params['started'] = time()
+ request_params['started'] = start_time
request_params['pageno'] = search_query.pageno
if hasattr(engine, 'language') and engine.language:
@@ -315,52 +365,16 @@ class Search(object):
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
- # update request parameters dependent on
- # search-engine (contained in engines folder)
- engine.request(search_query.query.encode('utf-8'), request_params)
-
- if request_params['url'] is None:
- # TODO add support of offline engines
- pass
-
- # create a callback wrapper for the search engine results
- callback = make_callback(
- selected_engine['name'],
- engine.response,
- request_params,
- self.result_container)
-
- # create dictionary which contain all
- # informations about the request
- request_args = dict(
- headers=request_params['headers'],
- hooks=dict(response=callback),
- cookies=request_params['cookies'],
- timeout=engine.timeout,
- verify=request_params['verify']
- )
-
- # specific type of request (GET or POST)
- if request_params['method'] == 'GET':
- req = requests_lib.get
- else:
- req = requests_lib.post
- request_args['data'] = request_params['data']
-
- # ignoring empty urls
- if not request_params['url']:
- continue
-
# append request to list
- requests.append((req, request_params['url'],
- request_args,
- selected_engine['name']))
+ requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))
- if not requests:
- return self.result_container
- # send all search-request
- threaded_requests(requests)
- start_new_thread(gc.collect, tuple())
+ # update timeout_limit
+ timeout_limit = max(timeout_limit, engine.timeout)
+
+ if requests:
+ # send all search requests
+ search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
+ start_new_thread(gc.collect, tuple())
# return results, suggestions, answers and infoboxes
return self.result_container
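The per-engine request tuples shrink to (engine_name, query, request_params); the URL, HTTP arguments and response handling are now built inside search_one_request / send_http_request. Roughly, the call made at the end of Search.search() looks like this (names taken from the patch, values illustrative):

    requests = [('wikipedia', search_query.query.encode('utf-8'), request_params)]
    timeout_limit = max(engines[name].timeout for name, _, _ in requests)
    # the budget passed on is reduced by the time already spent on setup and answerers
    search_multiple_requests(requests, self.result_container,
                             timeout_limit - (time() - start_time))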
diff --git a/searx/webapp.py b/searx/webapp.py
@@ -601,6 +601,8 @@ def preferences():
if e.timeout > settings['outgoing']['request_timeout']:
stats[e.name]['warn_timeout'] = True
+ # get_engines_stats() returns (label, stats) tuples; take the first one
+ # (engine time) and its second element, the stats list ([0] is the label)
for engine_stat in get_engines_stats()[0][1]:
stats[engine_stat.get('name')]['time'] = round(engine_stat.get('avg'), 3)
if engine_stat.get('avg') > settings['outgoing']['request_timeout']:
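Since the engine-time entry is now the first tuple returned by get_engines_stats(), the preferences view indexes [0][1] to reach its stats list. The return value has roughly this shape (a sketch, values invented):

    # [(label, stats_sorted_by_avg), ...], e.g.
    # [(u'Engine time (sec)', [{'name': 'wikipedia', 'avg': 1.2, 'percentage': 80}, ...]),
    #  (u'Page loads (sec)',  [{'name': 'wikipedia', 'avg': 0.9, 'percentage': 75}, ...]),
    #  ...]
    label, engine_times = get_engines_stats()[0]
    for engine_stat in engine_times:
        print(engine_stat['name'], round(engine_stat['avg'], 3))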