logo

searx

My custom branch(es) of searx, a meta-search engine — git clone https://hacktivis.me/git/searx.git
commit: 7fbc12ee4e6aea8a8ad0098deb03054976056371
parent d026a97e42dce14bb187ea79682b9a303cd91e9e
Author: Kirill Isakov <ukwt@ya.ru>
Date:   Sat, 26 Mar 2016 05:28:58 +0600

Add Torrentz.eu search engine

Diffstat:

M searx/engines/nyaa.py | 26 ++++++++++++++------------
A searx/engines/torrentz.py | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 5 +++++
A tests/unit/engines/test_torrentz.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 203 insertions(+), 12 deletions(-)

diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py @@ -43,6 +43,19 @@ def int_or_zero(num): return int(num) return 0 +# get multiplier to convert torrent size to bytes +def get_filesize_mul(suffix): + return { + 'KB': 1024, + 'MB': 1024 ** 2, + 'GB': 1024 ** 3, + 'TB': 1024 ** 4, + + 'KIB': 1024, + 'MIB': 1024 ** 2, + 'GIB': 1024 ** 3, + 'TIB': 1024 ** 4 + }[str(suffix).upper()] # do search-request def request(query, params): @@ -74,18 +87,7 @@ def response(resp): # torrent size try: file_size, suffix = result.xpath(xpath_filesize)[0].split(' ') - - # convert torrent size to bytes. - # if there is no correct index in this dictionary, - # the try block fails as it should - multiplier = { - 'KIB': 1024, - 'MIB': 1024 ** 2, - 'GIB': 1024 ** 3, - 'TIB': 1024 ** 4 - }[suffix.upper()] - - file_size = int(float(file_size) * multiplier) + file_size = int(float(file_size) * get_filesize_mul(suffix)) except Exception as e: file_size = None diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py @@ -0,0 +1,93 @@ +""" + Torrentz.eu (BitTorrent meta-search engine) + + @website https://torrentz.eu/ + @provide-api no + + @using-api no + @results HTML + @stable no (HTML can change, although unlikely, + see https://torrentz.eu/torrentz.btsearch) + @parse url, title, publishedDate, seed, leech, filesize, magnetlink +""" + +import re +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text +from datetime import datetime +from searx.engines.nyaa import int_or_zero, get_filesize_mul + +# engine dependent config +categories = ['files', 'videos', 'music'] +paging = True + +# search-url +# https://torrentz.eu/search?f=EXAMPLE&p=6 +base_url = 'https://torrentz.eu/' +search_url = base_url + 'search?{query}' + + +# do search-request +def request(query, params): + page = params['pageno'] - 1 + query = urlencode({'q': query, 'p': page}) + params['url'] = search_url.format(query=query) + return params + + +# 
get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath('//div[@class="results"]/dl'): + name_cell = result.xpath('./dt')[0] + title = extract_text(name_cell) + + # skip rows that do not contain a link to a torrent + links = name_cell.xpath('./a') + if len(links) != 1: + continue + + # extract url and remove a slash in the beginning + link = links[0].attrib.get('href').lstrip('/') + + seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '') + leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '') + + params = { + 'url': base_url + link, + 'title': title, + 'seed': int_or_zero(seed), + 'leech': int_or_zero(leech), + 'template': 'torrent.html' + } + + # let's try to calculate the torrent size + try: + size_str = result.xpath('./dd/span[@class="s"]/text()')[0] + size, suffix = size_str.split() + params['filesize'] = int(size) * get_filesize_mul(suffix) + except Exception as e: + pass + + # does our link contain a valid SHA1 sum? 
+ if re.compile('[0-9a-fA-F]{40}').match(link): + # add a magnet link to the result + params['magnetlink'] = 'magnet:?xt=urn:btih:' + link + + # extract and convert creation date + try: + date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title') + # Fri, 25 Mar 2016 16:29:01 + date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S') + params['publishedDate'] = date + except Exception as e: + pass + + results.append(params) + + return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -271,6 +271,11 @@ engines: shortcut : sw disabled : True + - name : torrentz + engine : torrentz + timeout : 5.0 + shortcut : to + - name : twitter engine : twitter shortcut : tw diff --git a/tests/unit/engines/test_torrentz.py b/tests/unit/engines/test_torrentz.py @@ -0,0 +1,91 @@ +import mock +from collections import defaultdict +from searx.engines import torrentz +from searx.testing import SearxTestCase +from datetime import datetime + + +class TestTorrentzEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = torrentz.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('torrentz.eu' in params['url']) + + def test_response(self): + resp = mock.Mock(text='<html></html>') + self.assertEqual(torrentz.response(resp), []) + + html = """ + <div class="results"> + <dl> + <dt> + <a href="/4362e08b1d80e1820fb2550b752f9f3126fe76d6"> + Completely valid info + </a> + books ebooks + </dt> + <dd> + <span class="v">1</span> + <span class="a"> + <span title="Sun, 22 Nov 2015 03:01:42">4 months</span> + </span> + <span class="s">30 MB</span> + <span class="u">14</span> + <span class="d">1</span> + </dd> + </dl> + + <dl> + <dt> + <a href="/poaskdpokaspod"> + Invalid hash and date and filesize + </a> + books ebooks + </dt> + <dd> + <span class="v">1</span> + <span class="a"> + <span title="Sun, 2124091j0j190gm42">4 months</span> + 
</span> + <span class="s">30MB</span> + <span class="u">5,555</span> + <span class="d">1,234,567</span> + </dd> + </dl> + </div> + """ + + resp = mock.Mock(text=html) + results = torrentz.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + + # testing against the first result + r = results[0] + self.assertEqual(r['url'], 'https://torrentz.eu/4362e08b1d80e1820fb2550b752f9f3126fe76d6') + self.assertEqual(r['title'], 'Completely valid info books ebooks') + # 22 Nov 2015 03:01:42 + self.assertEqual(r['publishedDate'], datetime(2015, 11, 22, 3, 1, 42)) + self.assertEqual(r['seed'], 14) + self.assertEqual(r['leech'], 1) + self.assertEqual(r['filesize'], 30 * 1024 * 1024) + self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4362e08b1d80e1820fb2550b752f9f3126fe76d6') + + # testing against the second result + r = results[1] + self.assertEqual(r['url'], 'https://torrentz.eu/poaskdpokaspod') + self.assertEqual(r['title'], 'Invalid hash and date and filesize books ebooks') + self.assertEqual(r['seed'], 5555) + self.assertEqual(r['leech'], 1234567) + + # in the second result we have invalid hash, creation date & torrent size, + # so these tests should fail + self.assertFalse('magnetlink' in r) + self.assertFalse('filesize' in r) + self.assertFalse('publishedDate' in r)