logo

searx

My custom branch(es) of searx, a meta-search engine
git clone https://hacktivis.me/git/searx.git
commit: e5677ae6b6b23c943e64d8e2abcb64c13c0e8bbf
parent d748b8419ad1ef875f34783bbbcf773ebc4cfb5e
Author: Kirill Isakov <ukwt@ya.ru>
Date:   Fri, 25 Mar 2016 00:24:37 +0600

Add Nyaa.se search engine

Diffstat:

Asearx/engines/nyaa.py115+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msearx/settings.yml4++++
Atests/unit/engines/test_nyaa.py66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 185 insertions(+), 0 deletions(-)

diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
@@ -0,0 +1,115 @@
+"""
+ Nyaa.se (Anime Bittorrent tracker)
+
+ @website http://www.nyaa.se/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content, seed, leech, torrentfile
+"""
+
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+
+# engine dependent config
+categories = ['files', 'images', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'http://www.nyaa.se/'
+search_url = base_url + '?page=search&{query}&offset={offset}'
+
+# xpath queries
+xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]'
+xpath_category = './/td[@class="tlisticon"]/a'
+xpath_title = './/td[@class="tlistname"]/a'
+xpath_torrent_file = './/td[@class="tlistdownload"]/a'
+xpath_filesize = './/td[@class="tlistsize"]/text()'
+xpath_seeds = './/td[@class="tlistsn"]/text()'
+xpath_leeches = './/td[@class="tlistln"]/text()'
+xpath_downloads = './/td[@class="tlistdn"]/text()'
+
+
+# convert a variable to integer or return 0 if it's not a number
+def int_or_zero(num):
+    if isinstance(num, list):
+        if len(num) < 1:
+            return 0
+        num = num[0]
+    if num.isdigit():
+        return int(num)
+    return 0
+
+
+# do search-request
+def request(query, params):
+    query = urlencode({'term': query})
+    params['url'] = search_url.format(query=query, offset=params['pageno'])
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath(xpath_results):
+        # category in which our torrent belongs
+        category = result.xpath(xpath_category)[0].attrib.get('title')
+
+        # torrent title
+        page_a = result.xpath(xpath_title)[0]
+        title = escape(extract_text(page_a))
+
+        # link to the page
+        href = page_a.attrib.get('href')
+
+        # link to the torrent file
+        torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href')
+
+        # torrent size
+        try:
+            file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
+
+            # convert torrent size to bytes.
+            # if there is no correct index in this dictionary,
+            # the try block fails as it should
+            multiplier = {
+                'KIB': 1024,
+                'MIB': 1024 ** 2,
+                'GIB': 1024 ** 3,
+                'TIB': 1024 ** 4
+            }[suffix.upper()]
+
+            file_size = int(float(file_size) * multiplier)
+        except Exception as e:
+            file_size = None
+
+        # seed count
+        seed = int_or_zero(result.xpath(xpath_seeds))
+
+        # leech count
+        leech = int_or_zero(result.xpath(xpath_leeches))
+
+        # torrent downloads count
+        downloads = int_or_zero(result.xpath(xpath_downloads))
+
+        # content string contains all information not included into template
+        content = 'Category: "{category}". Downloaded {downloads} times.'
+        content = content.format(category=category, downloads=downloads)
+        content = escape(content)
+
+        results.append({'url': href,
+                        'title': title,
+                        'content': content,
+                        'seed': seed,
+                        'leech': leech,
+                        'filesize': file_size,
+                        'torrentfile': torrent_link,
+                        'template': 'torrent.html'})
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -175,6 +175,10 @@ engines:
     engine : mixcloud
     shortcut : mc
 
+  - name : nyaa
+    engine : nyaa
+    shortcut : nt
+
   - name : openstreetmap
     engine : openstreetmap
     shortcut : osm
diff --git a/tests/unit/engines/test_nyaa.py b/tests/unit/engines/test_nyaa.py
@@ -0,0 +1,66 @@
+from collections import defaultdict
+import mock
+from searx.engines import nyaa
+from searx.testing import SearxTestCase
+
+
+class TestNyaaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = nyaa.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('nyaa.se' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(nyaa.response(resp), [])
+
+        html = """
+        <table class="tlist">
+          <tbody>
+            <tr class="trusted tlistrow">
+              <td class="tlisticon">
+                <a href="//www.nyaa.se" title="English-translated Anime">
+                  <img src="//files.nyaa.se" alt="English-translated Anime">
+                </a>
+              </td>
+              <td class="tlistname">
+                <a href="//www.nyaa.se/?page3">
+                  Sample torrent title
+                </a>
+              </td>
+              <td class="tlistdownload">
+                <a href="//www.nyaa.se/?page_dl" title="Download">
+                  <img src="//files.nyaa.se/www-dl.png" alt="DL">
+                </a>
+              </td>
+              <td class="tlistsize">10 MiB</td>
+              <td class="tlistsn">1</td>
+              <td class="tlistln">3</td>
+              <td class="tlistdn">666</td>
+              <td class="tlistmn">0</td>
+            </tr>
+          </tbody>
+        </table>
+        """
+
+        resp = mock.Mock(text=html)
+        results = nyaa.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+
+        r = results[0]
+        self.assertTrue(r['url'].find('www.nyaa.se/?page3') >= 0)
+        self.assertTrue(r['torrentfile'].find('www.nyaa.se/?page_dl') >= 0)
+        self.assertTrue(r['content'].find('English-translated Anime') >= 0)
+        self.assertTrue(r['content'].find('Downloaded 666 times.') >= 0)
+
+        self.assertEqual(r['title'], 'Sample torrent title')
+        self.assertEqual(r['seed'], 1)
+        self.assertEqual(r['leech'], 3)
+        self.assertEqual(r['filesize'], 10 * 1024 * 1024)