commit: 03137eebd9fdfaa57452cb364c1bc9f31b243f67
parent: 4a20fc202e886eaf7778481c403106e6243f49b7
Author: Adam Tauber <asciimoo@gmail.com>
Date: Sun, 1 Feb 2015 14:07:34 +0100
Merge pull request #208 from pointhi/new_engines
add 1x.com engine, improve yacy-engine
Diffstat:
5 files changed, 157 insertions(+), 13 deletions(-)
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
@@ -0,0 +1,82 @@
+## 1x (Images)
+#
+# @website http://1x.com/
+# @provide-api no
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, thumbnail, img_src, content
+
+
+from urllib import urlencode
+from urlparse import urljoin
+from lxml import html
+import string
+import re
+
+# engine dependent config
+# results are filed under the 'images' category; paging is switched off
+categories = ['images']
+paging = False
+
+# search-url
+# request() fills {query} with the urlencoded 'q' parameter;
+# base_url is also used by response() to make result links absolute
+base_url = 'http://1x.com'
+search_url = base_url+'/backend/search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    """Store the 1x.com search URL for ``query`` in ``params['url']``.
+
+    The query is urlencoded as the ``q`` parameter and inserted into
+    ``search_url``. The same ``params`` mapping that was passed in is
+    returned.
+    """
+    encoded_query = urlencode({'q': query})
+    params['url'] = search_url.format(query=encoded_query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Extract image results from a 1x.com search response.
+
+    The text of ``resp`` is cut into ``<a ...>...</a>`` fragments, each
+    fragment is parsed on its own with lxml, and every link whose URL
+    contains ``/photo/`` and which wraps an ``<img>`` with a ``src``
+    becomes one result. Fragments that do not qualify are skipped.
+
+    Args:
+        resp: response object; only its ``text`` attribute is read.
+
+    Returns:
+        A list of result dicts with the keys ``url``, ``title``,
+        ``img_src``, ``content``, ``thumbnail_src`` and ``template``.
+
+    Raises:
+        AttributeError: if ``resp`` has no ``text`` attribute.
+    """
+    results = []
+
+    # get links from result-text; the capture group keeps the delimiters
+    # in the split output, and \b stops '<a' from also matching the start
+    # of other tags such as <abbr>, <area> or <article>
+    results_parts = re.split(r'(</a>|<a\b)', resp.text)
+
+    cur_element = ''
+
+    # iterate over link parts
+    for result_part in results_parts:
+        # an opening tag starts a fresh fragment
+        if result_part == '<a':
+            cur_element = result_part
+            continue
+
+        cur_element += result_part
+
+        # keep collecting until the closing tag completes the link
+        if result_part != '</a>':
+            continue
+
+        # take the finished fragment and reset the buffer, so that a stray
+        # '</a>' later on cannot re-process this link
+        fragment, cur_element = cur_element, ''
+
+        # a closing tag without a preceding opening tag is not a link
+        if not fragment.startswith('<a'):
+            continue
+
+        # fix xml-error: self-close a tag that ends right before '</a>'
+        fragment = fragment.replace('"></a>', '"/></a>')
+
+        links = html.fromstring(fragment).xpath('//a')
+        if not links:
+            continue
+        link = links[0]
+
+        url = urljoin(base_url, link.attrib.get('href'))
+
+        # check if url points to a photo; done before touching the image
+        # so that text-only navigation links are skipped instead of failing
+        if '/photo/' not in url:
+            continue
+
+        images = link.xpath('.//img')
+        src = images[0].attrib.get('src') if images else None
+        if not src:
+            # photo link without a usable thumbnail
+            continue
+
+        thumbnail_src = urljoin(base_url, src)
+        # TODO: get image with higher resolution
+        img_src = thumbnail_src
+
+        # append result
+        results.append({'url': url,
+                        'title': link.attrib.get('title', ''),
+                        'img_src': img_src,
+                        'content': '',
+                        'thumbnail_src': thumbnail_src,
+                        'template': 'images.html'})
+
+    # return results
+    return results
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
@@ -68,9 +68,18 @@ def response(resp):
search_results = raw_search_results.get('channels', {})[0].get('items', [])
- if resp.search_params['category'] == 'general':
+ for result in search_results:
+ # parse image results
+ if result.get('image'):
+ # append result
+ results.append({'url': result['url'],
+ 'title': result['title'],
+ 'content': '',
+ 'img_src': result['image'],
+ 'template': 'images.html'})
+
# parse general results
- for result in search_results:
+ else:
publishedDate = parser.parse(result['pubDate'])
# append result
@@ -79,17 +88,7 @@ def response(resp):
'content': result['description'],
'publishedDate': publishedDate})
- elif resp.search_params['category'] == 'images':
- # parse image results
- for result in search_results:
- # append result
- results.append({'url': result['url'],
- 'title': result['title'],
- 'content': '',
- 'img_src': result['image'],
- 'template': 'images.html'})
-
- #TODO parse video, audio and file results
+ #TODO parse video, audio and file results
# return results
return results
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -83,6 +83,11 @@ engines:
engine : www500px
shortcut : px
+ - name : 1x
+ engine : www1x
+ shortcut : 1x
+ disabled : True
+
- name : flickr
categories : images
shortcut : fl
diff --git a/searx/tests/engines/test_www1x.py b/searx/tests/engines/test_www1x.py
@@ -0,0 +1,57 @@
+from collections import defaultdict
+import mock
+from searx.engines import www1x
+from searx.testing import SearxTestCase
+
+
+# Tests for the request() and response() functions of the www1x engine.
+class TestWww1xEngine(SearxTestCase):
+
+ # request() must put a URL into params that contains both the query
+ # string and the 1x.com host.
+ def test_request(self):
+ query = 'test_query'
+ params = www1x.request(query, defaultdict(dict))
+ self.assertTrue('url' in params)
+ self.assertTrue(query in params['url'])
+ self.assertTrue('1x.com' in params['url'])
+
+ # response() is checked against invalid inputs, a link-free document and
+ # a fixture with one photo link and one member-profile link.
+ def test_response(self):
+ # None, a list and plain strings are expected to raise AttributeError
+ self.assertRaises(AttributeError, www1x.response, None)
+ self.assertRaises(AttributeError, www1x.response, [])
+ self.assertRaises(AttributeError, www1x.response, '')
+ self.assertRaises(AttributeError, www1x.response, '[]')
+
+ # a document without any links yields an empty result list
+ response = mock.Mock(text='<html></html>')
+ self.assertEqual(www1x.response(response), [])
+ # fixture: XML wrapper with a CDATA section holding an HTML table; it
+ # contains a link to /photo/123456 and a link to /member/sjoerdlammers
+ html = """
+ <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
+ [
+ <!ELEMENT characters (character*) >
+ <!ELEMENT character (#PCDATA ) >
+
+ <!ENTITY iexcl "¡" >
+ <!ENTITY cent "¢" >
+ <!ENTITY pound "£" >
+ ]
+ ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
+ <tr>
+ <td style="min-width: 220px;" valign="top">
+ <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
+ <div>
+ <a href="/photo/123456" class="dynamiclink">
+<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
+ </a>
+ <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
+<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
+ </a>
+ </div>
+ </td>
+ </table>
+ ]]></searchresult></root>
+ """
+ response = mock.Mock(text=html)
+ results = www1x.response(response)
+ # only the /photo/ link is expected as a result; the member link is not
+ self.assertEqual(type(results), list)
+ self.assertEqual(len(results), 1)
+ # relative href and src are expected to be joined with http://1x.com
+ self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
+ self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
+ self.assertEqual(results[0]['content'], '')
+ self.assertEqual(results[0]['template'], 'images.html')
diff --git a/searx/tests/test_engines.py b/searx/tests/test_engines.py
@@ -1,2 +1,3 @@
from searx.tests.engines.test_dummy import * # noqa
from searx.tests.engines.test_github import * # noqa
+from searx.tests.engines.test_www1x import * # noqa