logo

searx

My custom branche(s) on searx, a meta-search engine git clone https://hacktivis.me/git/searx.git
commit: d04e471ce53c5efd224a4ed0e7b5d88fb0d3a093
parent b9d4c0523e8d6eab81658d77f8213b39f9b28f17
Author: Noémi Ványi <sitbackandwait@gmail.com>
Date:   Wed, 21 Mar 2018 22:30:29 +0100

add findx engine for general, images and videos

Diffstat:

Asearx/engines/findx.py119+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msearx/settings.yml18++++++++++++++++++
2 files changed, 137 insertions(+), 0 deletions(-)

diff --git a/searx/engines/findx.py b/searx/engines/findx.py @@ -0,0 +1,119 @@ +""" +FindX (General, Images, Videos) + +@website https://www.findx.com +@provide-api no +@using-api no +@results HTML +@stable no +@parse url, title, content, embedded, img_src, thumbnail_src +""" + +from dateutil import parser +from json import loads +import re + +from lxml import html + +from searx import logger +from searx.engines.xpath import extract_text +from searx.engines.youtube_noapi import base_youtube_url, embedded_url +from searx.url_utils import urlencode + + +paging = True +results_xpath = '//script[@id="initial-state"]' +search_url = 'https://www.findx.com/{category}?{q}' +type_map = { + 'none': 'web', + 'general': 'web', + 'images': 'images', + 'videos': 'videos', +} + + +def request(query, params): + category = 'general' + if 'category' in params and len(params['category']) == 1: + category = params['category'][0] + + params['url'] = search_url.format( + category=type_map[category], + q=urlencode({ + 'q': query, + 'page': params['pageno'] + }) + ) + return params + + +def response(resp): + dom = html.fromstring(resp.text) + results_raw_json = dom.xpath(results_xpath) + results_json = loads(extract_text(results_raw_json)) + + if len(results_json['web']['results']) > 0: + return _general_results(results_json['web']['results']) + + if len(results_json['images']['results']) > 0: + return _images_results(results_json['images']['results']) + + if len(results_json['video']['results']) > 0: + return _videos_results(results_json['video']['results']) + + return [] + + +def _general_results(general_results): + results = [] + for result in general_results: + results.append({ + 'url': result['url'], + 'title': result['title'], + 'content': result['sum'], + }) + return results + + +def _images_results(image_results): + results = [] + for result in image_results: + results.append({ + 'url': result['sourceURL'], + 'title': result['title'], + 'content': result['source'], + 'thumbnail_src': _extract_url(result['assets']['thumb']['url']), + 'img_src': _extract_url(result['assets']['file']['url']), + 'template': 'images.html', + }) + return results + + +def _videos_results(video_results): + results = [] + for result in video_results: + if not result['kind'].startswith('youtube'): + logger.warn('Unknown video kind in findx: {}'.format(result['kind'])) + continue + + description = result['snippet']['description'] + if len(description) > 300: + description = description[:300] + '...' + + results.append({ + 'url': base_youtube_url + result['id'], + 'title': result['snippet']['title'], + 'content': description, + 'thumbnail': _extract_url(result['snippet']['thumbnails']['default']['url']), + 'publishedDate': parser.parse(result['snippet']['publishedAt']), + 'embedded': embedded_url.format(videoid=result['id']), + 'template': 'videos.html', + }) + return results + + +def _extract_url(url): + matching = re.search('(/https?://[^)]+)', url) + if matching: + return matching.group(0)[1:] + return '' diff --git a/searx/settings.yml b/searx/settings.yml @@ -218,6 +218,24 @@ engines: shortcut : fd disabled : True + - name : findx + engine : findx + shortcut : fx + categories : general + disabled : True + + - name : findx images + engine : findx + shortcut : fxi + categories : images + disabled : True + + - name : findx videos + engine : findx + shortcut : fxv + categories : videos + disabled : True + - name : flickr categories : images shortcut : fl