
searx

My custom branch(es) on searx, a meta-search engine
commit: 090254feca463dcd2243c67971c7e24e4907739c
parent: d23dd7e276f37a37106d7b6a925d7760c4f0e9b4
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Sun, 14 Sep 2014 18:57:08 +0200

Merge pull request #99 from dalf/master

[enh] stick results from the same category and template and [fix] rewrite the google engine

Diffstat:

  M searx/engines/google.py | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
  M searx/search.py         |  40 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 120 insertions(+), 26 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
@@ -1,15 +1,17 @@
 ## Google (Web)
 #
 # @website     https://www.google.com
-# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+# @provide-api yes (https://developers.google.com/custom-search/)
 #
-# @using-api   yes
-# @results     JSON
-# @stable      yes (but deprecated)
-# @parse       url, title, content
+# @using-api   no
+# @results     HTML
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
-from json import loads
+from urlparse import unquote,urlparse,parse_qsl
+from lxml import html
+from searx.engines.xpath import extract_text, extract_url
 
 # engine dependent config
 categories = ['general']
@@ -17,21 +19,45 @@ paging = True
 language_support = True
 
 # search-url
-url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
+google_hostname = 'www.google.com'
+search_path = '/search'
+redirect_path = '/url'
+images_path = '/images'
+search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1'
+
+# specific xpath variables
+results_xpath = '//li[@class="g"]'
+url_xpath = './/h3/a/@href'
+title_xpath = './/h3'
+content_xpath = './/span[@class="st"]'
+suggestion_xpath = '//p[@class="_Bmc"]'
+
+images_xpath = './/div/a'
+image_url_xpath = './@href'
+image_img_src_xpath = './img/@src'
+
+
+# remove google-specific tracking-url
+def parse_url(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc in [google_hostname, ''] and parsed_url.path == redirect_path:
+        query = dict(parse_qsl(parsed_url.query))
+        return query['q']
+    else:
+        return url_string
 
 
 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 8
+    offset = (params['pageno'] - 1) * 10
 
-    language = 'en-US'
-    if params['language'] != 'all':
-        language = params['language'].replace('_', '-')
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].replace('_', '-').lower()
 
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'q': query}),
-                                      language=language)
+                                      query=urlencode({'q': query}))
+
+    params['headers']['Accept-Language'] = language
 
     return params
 
@@ -40,18 +66,50 @@ def request(query, params):
 def response(resp):
     results = []
 
-    search_res = loads(resp.text)
-
-    # return empty array if there are no results
-    if not search_res.get('responseData', {}).get('results'):
-        return []
+    dom = html.fromstring(resp.text)
 
     # parse results
-    for result in search_res['responseData']['results']:
-        # append result
-        results.append({'url': result['unescapedUrl'],
-                        'title': result['titleNoFormatting'],
-                        'content': result['content']})
+    for result in dom.xpath(results_xpath):
+        title = extract_text(result.xpath(title_xpath)[0])
+        try:
+            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
+            parsed_url = urlparse(url)
+            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
+                # remove the link to google news
+                continue
+
+            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
+                # images result
+                results = results + parse_images(result)
+            else:
+                # normal result
+                content = extract_text(result.xpath(content_xpath)[0])
+                # append result
+                results.append({'url': url,
+                                'title': title,
+                                'content': content})
+        except:
+            continue
+
+    # parse suggestion
+    for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
 
     # return results
     return results
+
+
+def parse_images(result):
+    results = []
+    for image in result.xpath(images_xpath):
+        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
+        img_src = extract_text(image.xpath(image_img_src_xpath)[0])
+
+        # append result
+        results.append({'url': url,
+                        'title': '',
+                        'content': '',
+                        'img_src': img_src,
+                        'template': 'images.html'})
+
+    return results
diff --git a/searx/search.py b/searx/search.py
@@ -49,7 +49,8 @@ def score_results(results):
     flat_len = len(flat_res)
     engines_len = len(results)
     results = []
-    # deduplication + scoring
+
+    # pass 1: deduplication + scoring
     for i, res in enumerate(flat_res):
 
         res['parsed_url'] = urlparse(res['url'])
@@ -90,7 +91,42 @@ def score_results(results):
         else:
             res['score'] = score
             results.append(res)
-    return sorted(results, key=itemgetter('score'), reverse=True)
+    results = sorted(results, key=itemgetter('score'), reverse=True)
+
+    # pass 2 : group results by category and template
+    gresults = []
+    categoryPositions = {}
+
+    for i, res in enumerate(results):
+        # FIXME : handle more than one category per engine
+        category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']
+
+        current = None if category not in categoryPositions else categoryPositions[category]
+
+        # group with previous results using the same category
+        # if the group can accept more result and is not too far
+        # from the current position
+        if current != None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
+            # group with the previous results using the same category with this one
+            index = current['index']
+            gresults.insert(index, res)
+
+            # update every index after the current one
+            # (including the current one)
+            for k in categoryPositions:
+                v = categoryPositions[k]['index']
+                if v >= index:
+                    categoryPositions[k]['index'] = v+1
+
+            # update this category
+            current['count'] -= 1
+
+        else:
+            # same category
+            gresults.append(res)
+
+            # update categoryIndex
+            categoryPositions[category] = { 'index' : len(gresults), 'count' : 8 }
+
+    # return gresults
+    return gresults
 
 
 class Search(object):
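
For illustration only (not part of the commit): the tracking-URL removal that parse_url performs above can be exercised in isolation. Below is a minimal sketch of the same unwrapping logic, ported to Python 3's urllib.parse since the engine itself targets Python 2; the sample URLs are made up.

from urllib.parse import urlparse, parse_qsl

google_hostname = 'www.google.com'
redirect_path = '/url'

def parse_url(url_string):
    # Google wraps outbound result links as /url?q=<target>&...;
    # unwrap those and leave every other URL untouched.
    parsed_url = urlparse(url_string)
    if parsed_url.netloc in [google_hostname, ''] and parsed_url.path == redirect_path:
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    return url_string

print(parse_url('/url?q=https://example.com/page&sa=U'))  # -> https://example.com/page
print(parse_url('https://example.com/page'))              # -> unchanged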
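Likewise, the pass-2 grouping added to score_results can be read on its own. Here is a standalone sketch under assumed inputs: each result is a dict whose 'category' key already holds the combined category/template string that the real code derives from the engine, and max_group and max_distance are hypothetical names for the hard-coded 8 and 20.

def group_results(results, max_group=8, max_distance=20):
    gresults = []
    positions = {}  # category -> {'index': next insert position, 'count': slots left}

    for res in results:
        category = res['category']
        current = positions.get(category)

        # join an earlier group for this category if it still has room
        # and its position is not too far behind the end of the list
        if current is not None and current['count'] > 0 \
                and len(gresults) - current['index'] < max_distance:
            index = current['index']
            gresults.insert(index, res)
            # shift every recorded position at or after the insert point,
            # including this category's own, so order within a group is kept
            for pos in positions.values():
                if pos['index'] >= index:
                    pos['index'] += 1
            current['count'] -= 1
        else:
            # start (or restart) a group at the end of the list
            gresults.append(res)
            positions[category] = {'index': len(gresults), 'count': max_group}

    return gresults

With categories as letters, an input ordered [A, B, A, B, A] comes back as [A, A, A, B, B]: each new result is pulled up next to the last result of its category, as long as that group has absorbed fewer than 8 extra results after its first and has not drifted more than 20 slots behind the current end of the list; otherwise a fresh group is opened at the end.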