searx

My custom branch(es) on searx, a meta-search engine
commit: 85c0351dca086c5f652c34048fef290b09e088d9
parent: 5544fdb75610bb66d05392289e0f0ad48c13ccf6
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Thu, 14 Apr 2016 10:59:31 +0200

Merge pull request #526 from ukwt/anime

Add a few search engines

Diffstat:

A searx/engines/fdroid.py                  |  53 +++++++++++++++++++++++++++++
M searx/engines/google.py                  |  12 ++++++------
A searx/engines/nyaa.py                    | 119 ++++++++++++++++++++++++++++
A searx/engines/reddit.py                  |  77 +++++++++++++++++++++++++++++
A searx/engines/tokyotoshokan.py           | 102 ++++++++++++++++++++++++++++
A searx/engines/torrentz.py                |  93 ++++++++++++++++++++++++++++
M searx/engines/xpath.py                   |  16 +++++++++++++++-
M searx/settings.yml                       |  88 ++++++++++++++++++++++++++++
A tests/unit/engines/test_fdroid.py        |  49 +++++++++++++++++++++++++++
A tests/unit/engines/test_nyaa.py          |  66 ++++++++++++++++++++++++++++
A tests/unit/engines/test_reddit.py        |  67 ++++++++++++++++++++++++++++
A tests/unit/engines/test_tokyotoshokan.py | 110 ++++++++++++++++++++++++++++
A tests/unit/engines/test_torrentz.py      |  91 ++++++++++++++++++++++++++++
13 files changed, 936 insertions(+), 7 deletions(-)

diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
@@ -0,0 +1,53 @@
+"""
+ F-Droid (a repository of FOSS applications for Android)
+
+ @website      https://f-droid.org/
+ @provide-api  no
+ @using-api    no
+ @results      HTML
+ @stable       no (HTML can change)
+ @parse        url, title, content
+"""
+
+from cgi import escape
+from urllib import urlencode
+from searx.engines.xpath import extract_text
+from lxml import html
+
+# engine dependent config
+categories = ['files']
+paging = True
+
+# search-url
+base_url = 'https://f-droid.org/'
+search_url = base_url + 'repository/browse/?{query}'
+
+
+# do search-request
+def request(query, params):
+    query = urlencode({'fdfilter': query,
+                       'fdpage': params['pageno']})
+    params['url'] = search_url.format(query=query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for app in dom.xpath('//div[@id="appheader"]'):
+        url = app.xpath('./ancestor::a/@href')[0]
+        title = app.xpath('./p/span/text()')[0]
+        img_src = app.xpath('.//img/@src')[0]
+
+        content = extract_text(app.xpath('./p')[0])
+        content = escape(content.replace(title, '', 1).strip())
+
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'img_src': img_src})
+
+    return results
diff --git a/searx/engines/google.py b/searx/engines/google.py
@@ -46,11 +46,11 @@ country_to_hostname = {
     'NZ': 'www.google.co.nz',  # New Zealand
     'PH': 'www.google.com.ph',  # Philippines
     'SG': 'www.google.com.sg',  # Singapore
-    # 'US': 'www.google.us',  # United State, redirect to .com
+    # 'US': 'www.google.us',  # United States, redirect to .com
     'ZA': 'www.google.co.za',  # South Africa
     'AR': 'www.google.com.ar',  # Argentina
     'CL': 'www.google.cl',  # Chile
-    'ES': 'www.google.es',  # Span
+    'ES': 'www.google.es',  # Spain
     'MX': 'www.google.com.mx',  # Mexico
     'EE': 'www.google.ee',  # Estonia
     'FI': 'www.google.fi',  # Finland
@@ -61,7 +61,7 @@ country_to_hostname = {
     'HU': 'www.google.hu',  # Hungary
     'IT': 'www.google.it',  # Italy
     'JP': 'www.google.co.jp',  # Japan
-    'KR': 'www.google.co.kr',  # South Korean
+    'KR': 'www.google.co.kr',  # South Korea
     'LT': 'www.google.lt',  # Lithuania
     'LV': 'www.google.lv',  # Latvia
     'NO': 'www.google.no',  # Norway
@@ -76,9 +76,9 @@ country_to_hostname = {
     'SE': 'www.google.se',  # Sweden
     'TH': 'www.google.co.th',  # Thailand
     'TR': 'www.google.com.tr',  # Turkey
-    'UA': 'www.google.com.ua',  # Ikraine
-    # 'CN': 'www.google.cn',  # China, only from china ?
-    'HK': 'www.google.com.hk',  # Hong kong
+    'UA': 'www.google.com.ua',  # Ukraine
+    # 'CN': 'www.google.cn',  # China, only from China ?
+    'HK': 'www.google.com.hk',  # Hong Kong
     'TW': 'www.google.com.tw'  # Taiwan
 }
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
@@ -0,0 +1,119 @@
+"""
+ Nyaa.se (Anime Bittorrent tracker)
+
+ @website      http://www.nyaa.se/
+ @provide-api  no
+ @using-api    no
+ @results      HTML
+ @stable       no (HTML can change)
+ @parse        url, title, content, seed, leech, torrentfile
+"""
+
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+
+# engine dependent config
+categories = ['files', 'images', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'http://www.nyaa.se/'
+search_url = base_url + '?page=search&{query}&offset={offset}'
+
+# xpath queries
+xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]'
+xpath_category = './/td[@class="tlisticon"]/a'
+xpath_title = './/td[@class="tlistname"]/a'
+xpath_torrent_file = './/td[@class="tlistdownload"]/a'
+xpath_filesize = './/td[@class="tlistsize"]/text()'
+xpath_seeds = './/td[@class="tlistsn"]/text()'
+xpath_leeches = './/td[@class="tlistln"]/text()'
+xpath_downloads = './/td[@class="tlistdn"]/text()'
+
+
+# convert a variable to integer or return 0 if it's not a number
+def int_or_zero(num):
+    if isinstance(num, list):
+        if len(num) < 1:
+            return 0
+        num = num[0]
+    if num.isdigit():
+        return int(num)
+    return 0
+
+
+# get multiplier to convert torrent size to bytes
+def get_filesize_mul(suffix):
+    return {
+        'KB': 1024,
+        'MB': 1024 ** 2,
+        'GB': 1024 ** 3,
+        'TB': 1024 ** 4,
+
+        'KIB': 1024,
+        'MIB': 1024 ** 2,
+        'GIB': 1024 ** 3,
+        'TIB': 1024 ** 4
+    }[str(suffix).upper()]
+
+
+# do search-request
+def request(query, params):
+    query = urlencode({'term': query})
+    params['url'] = search_url.format(query=query, offset=params['pageno'])
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath(xpath_results):
+        # category in which our torrent belongs
+        category = result.xpath(xpath_category)[0].attrib.get('title')
+
+        # torrent title
+        page_a = result.xpath(xpath_title)[0]
+        title = escape(extract_text(page_a))
+
+        # link to the page
+        href = page_a.attrib.get('href')
+
+        # link to the torrent file
+        torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href')
+
+        # torrent size
+        try:
+            file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
+            file_size = int(float(file_size) * get_filesize_mul(suffix))
+        except Exception as e:
+            file_size = None
+
+        # seed count
+        seed = int_or_zero(result.xpath(xpath_seeds))
+
+        # leech count
+        leech = int_or_zero(result.xpath(xpath_leeches))
+
+        # torrent downloads count
+        downloads = int_or_zero(result.xpath(xpath_downloads))
+
+        # content string contains all information not included into template
+        content = 'Category: "{category}". Downloaded {downloads} times.'
+        content = content.format(category=category, downloads=downloads)
+        content = escape(content)
+
+        results.append({'url': href,
+                        'title': title,
+                        'content': content,
+                        'seed': seed,
+                        'leech': leech,
+                        'filesize': file_size,
+                        'torrentfile': torrent_link,
+                        'template': 'torrent.html'})
+
+    return results
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
@@ -0,0 +1,77 @@
+"""
+ Reddit
+
+ @website      https://www.reddit.com/
+ @provide-api  yes (https://www.reddit.com/dev/api)
+
+ @using-api    yes
+ @results      JSON
+ @stable       yes
+ @parse        url, title, content, thumbnail, publishedDate
+"""
+
+import json
+from cgi import escape
+from urllib import urlencode
+from urlparse import urlparse
+from datetime import datetime
+
+# engine dependent config
+categories = ['general', 'images', 'news', 'social media']
+page_size = 25
+
+# search-url
+search_url = 'https://www.reddit.com/search.json?{query}'
+
+
+# do search-request
+def request(query, params):
+    query = urlencode({'q': query,
+                       'limit': page_size})
+    params['url'] = search_url.format(query=query)
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    img_results = []
+    text_results = []
+
+    search_results = json.loads(resp.text)
+
+    # return empty array if there are no results
+    if 'data' not in search_results:
+        return []
+
+    posts = search_results.get('data', {}).get('children', [])
+
+    # process results
+    for post in posts:
+        data = post['data']
+
+        # extract post information
+        params = {
+            'url': data['url'],
+            'title': data['title']
+        }
+
+        # if thumbnail field contains a valid URL, we need to change template
+        thumbnail = data['thumbnail']
+        url_info = urlparse(thumbnail)
+        # netloc & path
+        if url_info[1] != '' and url_info[2] != '':
+            params['thumbnail_src'] = thumbnail
+            params['template'] = 'images.html'
+            img_results.append(params)
+        else:
+            created = datetime.fromtimestamp(data['created_utc'])
+            content = escape(data['selftext'])
+            if len(content) > 500:
+                content = content[:500] + '...'
+            params['content'] = content
+            params['publishedDate'] = created
+            text_results.append(params)
+
+    # show images first and text results second
+    return img_results + text_results
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
@@ -0,0 +1,102 @@
+"""
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
+
+ @website      https://www.tokyotosho.info/
+ @provide-api  no
+ @using-api    no
+ @results      HTML
+ @stable       no (HTML can change)
+ @parse        url, title, publishedDate, seed, leech,
+               filesize, magnetlink, content
+"""
+
+import re
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+from datetime import datetime
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
+
+# engine dependent config
+categories = ['files', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'https://www.tokyotosho.info/'
+search_url = base_url + 'search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    query = urlencode({'page': params['pageno'],
+                       'terms': query})
+    params['url'] = search_url.format(query=query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')
+
+    # check if there are no results or page layout was changed so we cannot parse it
+    # currently there are two rows for each result, so total count must be even
+    if len(rows) == 0 or len(rows) % 2 != 0:
+        return []
+
+    # regular expression for parsing torrent size strings
+    size_re = re.compile('Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)
+
+    # processing the results, two rows at a time
+    for i in xrange(0, len(rows), 2):
+        # parse the first row
+        name_row = rows[i]
+
+        links = name_row.xpath('./td[@class="desc-top"]/a')
+        params = {
+            'template': 'torrent.html',
+            'url': links[-1].attrib.get('href'),
+            'title': extract_text(links[-1])
+        }
+        # I have not yet seen any torrents without magnet links, but
+        # it's better to be prepared to stumble upon one some day
+        if len(links) == 2:
+            magnet = links[0].attrib.get('href')
+            if magnet.startswith('magnet'):
+                # okay, we have a valid magnet link, let's add it to the result
+                params['magnetlink'] = magnet
+
+        # no more info in the first row, start parsing the second one
+        info_row = rows[i + 1]
+        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
+        for item in desc.split('|'):
+            item = item.strip()
+            if item.startswith('Size:'):
+                try:
+                    # ('1.228', 'GB')
+                    groups = size_re.match(item).groups()
+                    multiplier = get_filesize_mul(groups[1])
+                    params['filesize'] = int(multiplier * float(groups[0]))
+                except Exception as e:
+                    pass
+            elif item.startswith('Date:'):
+                try:
+                    # Date: 2016-02-21 21:44 UTC
+                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
+                    params['publishedDate'] = date
+                except Exception as e:
+                    pass
+            elif item.startswith('Comment:'):
+                params['content'] = item
+
+        stats = info_row.xpath('./td[@class="stats"]/span')
+        # has the layout not changed yet?
+        if len(stats) == 3:
+            params['seed'] = int_or_zero(extract_text(stats[0]))
+            params['leech'] = int_or_zero(extract_text(stats[1]))
+
+        results.append(params)
+
+    return results
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
@@ -0,0 +1,93 @@
+"""
+ Torrentz.eu (BitTorrent meta-search engine)
+
+ @website      https://torrentz.eu/
+ @provide-api  no
+
+ @using-api    no
+ @results      HTML
+ @stable       no (HTML can change, although unlikely,
+                   see https://torrentz.eu/torrentz.btsearch)
+ @parse        url, title, publishedDate, seed, leech, filesize, magnetlink
+"""
+
+import re
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+from datetime import datetime
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
+
+# engine dependent config
+categories = ['files', 'videos', 'music']
+paging = True
+
+# search-url
+# https://torrentz.eu/search?f=EXAMPLE&p=6
+base_url = 'https://torrentz.eu/'
+search_url = base_url + 'search?{query}'
+
+
+# do search-request
+def request(query, params):
+    page = params['pageno'] - 1
+    query = urlencode({'q': query, 'p': page})
+    params['url'] = search_url.format(query=query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath('//div[@class="results"]/dl'):
+        name_cell = result.xpath('./dt')[0]
+        title = extract_text(name_cell)
+
+        # skip rows that do not contain a link to a torrent
+        links = name_cell.xpath('./a')
+        if len(links) != 1:
+            continue
+
+        # extract url and remove a slash in the beginning
+        link = links[0].attrib.get('href').lstrip('/')
+
+        seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '')
+        leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '')
+
+        params = {
+            'url': base_url + link,
+            'title': title,
+            'seed': int_or_zero(seed),
+            'leech': int_or_zero(leech),
+            'template': 'torrent.html'
+        }
+
+        # let's try to calculate the torrent size
+        try:
+            size_str = result.xpath('./dd/span[@class="s"]/text()')[0]
+            size, suffix = size_str.split()
+            params['filesize'] = int(size) * get_filesize_mul(suffix)
+        except Exception as e:
+            pass
+
+        # does our link contain a valid SHA1 sum?
+        if re.compile('[0-9a-fA-F]{40}').match(link):
+            # add a magnet link to the result
+            params['magnetlink'] = 'magnet:?xt=urn:btih:' + link
+
+        # extract and convert creation date
+        try:
+            date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title')
+            # Fri, 25 Mar 2016 16:29:01
+            date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
+            params['publishedDate'] = date
+        except Exception as e:
+            pass
+
+        results.append(params)
+
+    return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
@@ -11,6 +11,14 @@ title_xpath = None
 suggestion_xpath = ''
 results_xpath = ''
 
+# parameters for engines with paging support
+#
+# number of results on each page
+# (only needed if the site requires not a page number, but an offset)
+page_size = 1
+# number of the first page (usually 0 or 1)
+first_page_num = 1
+
 '''
 if xpath_results is list, extract the text from each result and
 concat the list
@@ -76,8 +84,14 @@ def normalize_url(url):
 
 
 def request(query, params):
     query = urlencode({'q': query})[2:]
-    params['url'] = search_url.format(query=query)
+
+    fp = {'query': query}
+    if paging and search_url.find('{pageno}') >= 0:
+        fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size
+
+    params['url'] = search_url.format(**fp)
     params['query'] = query
+
     return params
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -60,6 +60,18 @@ engines:
     engine : bing_news
     shortcut : bin
 
+  - name : bitbucket
+    engine : xpath
+    paging : True
+    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
+    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
+    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
+    content_xpath : //article[@class="repo-summary"]/p
+    categories : it
+    timeout : 4.0
+    disabled : True
+    shortcut : bb
+
   - name : btdigg
     engine : btdigg
     shortcut : bt
@@ -86,6 +98,19 @@ engines:
     engine : digg
     shortcut : dg
 
+  - name : erowid
+    engine : xpath
+    paging : True
+    first_page_num : 0
+    page_size : 30
+    search_url : https://www.erowid.org/search.php?q={query}&s={pageno}
+    url_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/@href
+    title_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/text()
+    content_xpath : //dl[@class="results-list"]/dd[@class="result-details"]
+    categories : general
+    shortcut : ew
+    disabled : True
+
   - name : wikidata
     engine : wikidata
     shortcut : wd
@@ -109,6 +134,11 @@ engines:
     shortcut : 1x
     disabled : True
 
+  - name : fdroid
+    engine : fdroid
+    shortcut : fd
+    disabled : True
+
   - name : flickr
     categories : images
     shortcut : fl
@@ -129,6 +159,18 @@ engines:
     shortcut : gb
     disabled: True
 
+  - name : gitlab
+    engine : xpath
+    paging : True
+    search_url : https://gitlab.com/search?page={pageno}&search={query}
+    url_xpath : //li[@class="project-row"]//a[@class="project"]/@href
+    title_xpath : //li[@class="project-row"]//span[contains(@class, "project-full-name")]
+    content_xpath : //li[@class="project-row"]//div[@class="description"]/p
+    categories : it
+    shortcut : gl
+    timeout : 5.0
+    disabled : True
+
   - name : github
     engine : github
     shortcut : gh
@@ -175,10 +217,38 @@ engines:
     shortcut : gps
     disabled : True
 
+  - name : geektimes
+    engine : xpath
+    paging : True
+    search_url : https://geektimes.ru/search/page{pageno}/?q={query}
+    url_xpath : //div[@class="search_results"]//a[@class="post_title"]/@href
+    title_xpath : //div[@class="search_results"]//a[@class="post_title"]
+    content_xpath : //div[@class="search_results"]//div[contains(@class, "content")]
+    categories : it
+    timeout : 4.0
+    disabled : True
+    shortcut : gt
+
+  - name : habrahabr
+    engine : xpath
+    paging : True
+    search_url : https://habrahabr.ru/search/page{pageno}/?q={query}
+    url_xpath : //div[@class="search_results"]//a[@class="post_title"]/@href
+    title_xpath : //div[@class="search_results"]//a[@class="post_title"]
+    content_xpath : //div[@class="search_results"]//div[contains(@class, "content")]
+    categories : it
+    timeout : 4.0
+    disabled : True
+    shortcut : habr
+
   - name : mixcloud
     engine : mixcloud
     shortcut : mc
 
+  - name : nyaa
+    engine : nyaa
+    shortcut : nt
+
   - name : openstreetmap
     engine : openstreetmap
     shortcut : osm
@@ -213,6 +283,13 @@ engines:
     shortcut : qws
     categories : social media
 
+  - name : reddit
+    engine : reddit
+    shortcut : re
+    page_size : 25
+    timeout : 10.0
+    disabled : True
+
   - name : kickass
     engine : kickass
     shortcut : ka
@@ -264,6 +341,17 @@ engines:
     shortcut : sw
     disabled : True
 
+  - name : tokyotoshokan
+    engine : tokyotoshokan
+    shortcut : tt
+    timeout : 6.0
+    disabled : True
+
+  - name : torrentz
+    engine : torrentz
+    timeout : 5.0
+    shortcut : to
+
   - name : twitter
     engine : twitter
     shortcut : tw
diff --git a/tests/unit/engines/test_fdroid.py b/tests/unit/engines/test_fdroid.py
@@ -0,0 +1,49 @@
+import mock
+from collections import defaultdict
+from searx.engines import fdroid
+from searx.testing import SearxTestCase
+
+
+class TestFdroidEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = fdroid.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('f-droid.org' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(fdroid.response(resp), [])
+
+        html = """
+        <a href="https://google.com/qwerty">
+          <div id="appheader">
+            <div style="float:left;padding-right:10px;">
+              <img src="http://example.com/image.png"
+                   style="width:48px;border:none;">
+            </div>
+            <div style="float:right;">
+              <p>Details...</p>
+            </div>
+            <p style="color:#000000;">
+              <span style="font-size:20px;">Sample title</span>
+              <br>
+              Sample content
+            </p>
+          </div>
+        </a>
+        """
+
+        resp = mock.Mock(text=html)
+        results = fdroid.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['url'], 'https://google.com/qwerty')
+        self.assertEqual(results[0]['title'], 'Sample title')
+        self.assertEqual(results[0]['content'], 'Sample content')
+        self.assertEqual(results[0]['img_src'], 'http://example.com/image.png')
diff --git a/tests/unit/engines/test_nyaa.py b/tests/unit/engines/test_nyaa.py
@@ -0,0 +1,66 @@
+from collections import defaultdict
+import mock
+from searx.engines import nyaa
+from searx.testing import SearxTestCase
+
+
+class TestNyaaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = nyaa.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('nyaa.se' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(nyaa.response(resp), [])
+
+        html = """
+        <table class="tlist">
+          <tbody>
+            <tr class="trusted tlistrow">
+              <td class="tlisticon">
+                <a href="//www.nyaa.se" title="English-translated Anime">
+                  <img src="//files.nyaa.se" alt="English-translated Anime">
+                </a>
+              </td>
+              <td class="tlistname">
+                <a href="//www.nyaa.se/?page3">
+                  Sample torrent title
+                </a>
+              </td>
+              <td class="tlistdownload">
+                <a href="//www.nyaa.se/?page_dl" title="Download">
+                  <img src="//files.nyaa.se/www-dl.png" alt="DL">
+                </a>
+              </td>
+              <td class="tlistsize">10 MiB</td>
+              <td class="tlistsn">1</td>
+              <td class="tlistln">3</td>
+              <td class="tlistdn">666</td>
+              <td class="tlistmn">0</td>
+            </tr>
+          </tbody>
+        </table>
+        """
+
+        resp = mock.Mock(text=html)
+        results = nyaa.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+
+        r = results[0]
+        self.assertTrue(r['url'].find('www.nyaa.se/?page3') >= 0)
+        self.assertTrue(r['torrentfile'].find('www.nyaa.se/?page_dl') >= 0)
+        self.assertTrue(r['content'].find('English-translated Anime') >= 0)
+        self.assertTrue(r['content'].find('Downloaded 666 times.') >= 0)
+
+        self.assertEqual(r['title'], 'Sample torrent title')
+        self.assertEqual(r['seed'], 1)
+        self.assertEqual(r['leech'], 3)
+        self.assertEqual(r['filesize'], 10 * 1024 * 1024)
diff --git a/tests/unit/engines/test_reddit.py b/tests/unit/engines/test_reddit.py
@@ -0,0 +1,67 @@
+from collections import defaultdict
+import mock
+from searx.engines import reddit
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestRedditEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        params = reddit.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('reddit.com' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='{}')
+        self.assertEqual(reddit.response(resp), [])
+
+        json = """
+        {
+            "kind": "Listing",
+            "data": {
+                "children": [{
+                    "data": {
+                        "url": "http://google.com/",
+                        "title": "Title number one",
+                        "selftext": "Sample",
+                        "created_utc": 1401219957.0,
+                        "thumbnail": "http://image.com/picture.jpg"
+                    }
+                }, {
+                    "data": {
+                        "url": "https://reddit.com/",
+                        "title": "Title number two",
+                        "selftext": "Dominus vobiscum",
+                        "created_utc": 1438792533.0,
+                        "thumbnail": "self"
+                    }
+                }]
+            }
+        }
+        """
+
+        resp = mock.Mock(text=json)
+        results = reddit.response(resp)
+
+        self.assertEqual(len(results), 2)
+        self.assertEqual(type(results), list)
+
+        # testing first result (picture)
+        r = results[0]
+        self.assertEqual(r['url'], 'http://google.com/')
+        self.assertEqual(r['title'], 'Title number one')
+        self.assertEqual(r['template'], 'images.html')
+        self.assertEqual(r['thumbnail_src'], 'http://image.com/picture.jpg')
+
+        # testing second result (self-post)
+        r = results[1]
+        self.assertEqual(r['url'], 'https://reddit.com/')
+        self.assertEqual(r['title'], 'Title number two')
+        self.assertEqual(r['content'], 'Dominus vobiscum')
+        created = datetime.fromtimestamp(1438792533.0)
+        self.assertEqual(r['publishedDate'], created)
+        self.assertTrue('thumbnail_src' not in r)
diff --git a/tests/unit/engines/test_tokyotoshokan.py b/tests/unit/engines/test_tokyotoshokan.py
@@ -0,0 +1,110 @@
+import mock
+from collections import defaultdict
+from searx.engines import tokyotoshokan
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestTokyotoshokanEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = tokyotoshokan.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('tokyotosho.info' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(tokyotoshokan.response(resp), [])
+
+        html = """
+        <table class="listing">
+          <tbody>
+            <tr class="shade category_0">
+              <td rowspan="2">
+                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+              </td>
+              <td class="desc-top">
+                <a href="magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b">
+                  <span class="sprite_magnet"></span>
+                </a>
+                <a rel="nofollow" type="application/x-bittorrent" href="http://www.nyaa.se/f">
+                  Koyomimonogatari
+                </a>
+              </td>
+              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+            </tr>
+            <tr class="shade category_0">
+              <td class="desc-bot">
+                Authorized: <span class="auth_ok">Yes</span>
+                Submitter: <a href="?username=Ohys">Ohys</a> |
+                Size: 10.5MB |
+                Date: 2016-03-26 16:41 UTC |
+                Comment: sample comment
+              </td>
+              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+                S: <span style="color: red">53</span>
+                L: <span style="color: red">18</span>
+                C: <span style="color: red">0</span>
+                ID: 975700
+              </td>
+            </tr>
+
+            <tr class="category_0">
+              <td rowspan="2">
+                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+              </td>
+              <td class="desc-top">
+                <a rel="nofollow" type="application/x-bittorrent" href="http://google.com/q">
+                  Owarimonogatari
+                </a>
+              </td>
+              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+            </tr>
+            <tr class="category_0">
+              <td class="desc-bot">
+                Submitter: <a href="?username=Ohys">Ohys</a> |
+                Size: 932.84EB |
+                Date: QWERTY-03-26 16:41 UTC
+              </td>
+              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+                S: <span style="color: red">0</span>
+              </td>
+            </tr>
+          </tbody>
+        </table>
+        """
+
+        resp = mock.Mock(text=html)
+        results = tokyotoshokan.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+
+        # testing the first result, which has correct format
+        # and should have all information fields filled
+        r = results[0]
+        self.assertEqual(r['url'], 'http://www.nyaa.se/f')
+        self.assertEqual(r['title'], 'Koyomimonogatari')
+        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b')
+        self.assertEqual(r['filesize'], int(1024 * 1024 * 10.5))
+        self.assertEqual(r['publishedDate'], datetime(2016, 03, 26, 16, 41))
+        self.assertEqual(r['content'], 'Comment: sample comment')
+        self.assertEqual(r['seed'], 53)
+        self.assertEqual(r['leech'], 18)
+
+        # testing the second result, which does not include magnet link,
+        # seed & leech info, and has incorrect size & creation date
+        r = results[1]
+        self.assertEqual(r['url'], 'http://google.com/q')
+        self.assertEqual(r['title'], 'Owarimonogatari')
+
+        self.assertFalse('magnetlink' in r)
+        self.assertFalse('filesize' in r)
+        self.assertFalse('content' in r)
+        self.assertFalse('publishedDate' in r)
+        self.assertFalse('seed' in r)
+        self.assertFalse('leech' in r)
diff --git a/tests/unit/engines/test_torrentz.py b/tests/unit/engines/test_torrentz.py
@@ -0,0 +1,91 @@
+import mock
+from collections import defaultdict
+from searx.engines import torrentz
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestTorrentzEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = torrentz.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('torrentz.eu' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(torrentz.response(resp), [])
+
+        html = """
+        <div class="results">
+          <dl>
+            <dt>
+              <a href="/4362e08b1d80e1820fb2550b752f9f3126fe76d6">
+                Completely valid info
+              </a>
+              books ebooks
+            </dt>
+            <dd>
+              <span class="v">1</span>
+              <span class="a">
+                <span title="Sun, 22 Nov 2015 03:01:42">4 months</span>
+              </span>
+              <span class="s">30 MB</span>
+              <span class="u">14</span>
+              <span class="d">1</span>
+            </dd>
+          </dl>
+
+          <dl>
+            <dt>
+              <a href="/poaskdpokaspod">
+                Invalid hash and date and filesize
+              </a>
+              books ebooks
+            </dt>
+            <dd>
+              <span class="v">1</span>
+              <span class="a">
+                <span title="Sun, 2124091j0j190gm42">4 months</span>
+              </span>
+              <span class="s">30MB</span>
+              <span class="u">5,555</span>
+              <span class="d">1,234,567</span>
+            </dd>
+          </dl>
+        </div>
+        """
+
+        resp = mock.Mock(text=html)
+        results = torrentz.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+
+        # testing against the first result
+        r = results[0]
+        self.assertEqual(r['url'], 'https://torrentz.eu/4362e08b1d80e1820fb2550b752f9f3126fe76d6')
+        self.assertEqual(r['title'], 'Completely valid info books ebooks')
+        # 22 Nov 2015 03:01:42
+        self.assertEqual(r['publishedDate'], datetime(2015, 11, 22, 3, 1, 42))
+        self.assertEqual(r['seed'], 14)
+        self.assertEqual(r['leech'], 1)
+        self.assertEqual(r['filesize'], 30 * 1024 * 1024)
+        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4362e08b1d80e1820fb2550b752f9f3126fe76d6')
+
+        # testing against the second result
+        r = results[1]
+        self.assertEqual(r['url'], 'https://torrentz.eu/poaskdpokaspod')
+        self.assertEqual(r['title'], 'Invalid hash and date and filesize books ebooks')
+        self.assertEqual(r['seed'], 5555)
+        self.assertEqual(r['leech'], 1234567)
+
+        # in the second result we have invalid hash, creation date & torrent size,
+        # so these tests should fail
+        self.assertFalse('magnetlink' in r)
+        self.assertFalse('filesize' in r)
+        self.assertFalse('publishedDate' in r)
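
Note on the searx/engines/xpath.py change above: engines such as bitbucket and
gitlab substitute a plain page number into {pageno} in their search_url, while
erowid needs a result offset, which is what the new first_page_num and
page_size parameters express. A minimal sketch of the arithmetic used in
xpath.request() (mine, not part of the commit; xpath_pageno is a hypothetical
helper name):

    # reproduces the formula from searx/engines/xpath.py: request()
    def xpath_pageno(pageno, first_page_num=1, page_size=1):
        # searx page numbers are 1-based; shift to the site's first page,
        # then scale by page_size for sites that paginate by offset
        return (pageno + first_page_num - 1) * page_size

    # gitlab-style paging (defaults): searx pages 1, 2, 3 -> site pages 1, 2, 3
    assert xpath_pageno(1) == 1
    assert xpath_pageno(2) == 2

    # erowid entry (first_page_num : 0, page_size : 30) -> offsets 0, 30, 60, ...
    assert xpath_pageno(1, first_page_num=0, page_size=30) == 0
    assert xpath_pageno(2, first_page_num=0, page_size=30) == 30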
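
The three torrent engines share the nyaa.py helpers that tokyotoshokan.py and
torrentz.py import. A few sanity checks (mine, not from the commit) showing
how they behave:

    from searx.engines.nyaa import get_filesize_mul, int_or_zero

    # decimal and binary suffixes both map to powers of 1024
    assert get_filesize_mul('MB') == 1024 ** 2
    assert get_filesize_mul('MiB') == 1024 ** 2
    # the size parsed in test_tokyotoshokan.py: 10.5 MB -> 11010048 bytes
    assert int(float('10.5') * get_filesize_mul('MB')) == int(1024 * 1024 * 10.5)

    # int_or_zero accepts the list that an xpath text() query returns ...
    assert int_or_zero(['666']) == 666
    assert int_or_zero([]) == 0
    # ... and falls back to 0 for anything non-numeric
    assert int_or_zero('N/A') == 0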