commit: 469e08881ee17d8a180d0c0741c1552a29108f0e
parent: c7cbd38fcc60601dd3b41df8a3a234c079f5dc0b
Author: Adam Tauber <asciimoo@gmail.com>
Date: Thu, 1 Jan 2015 14:10:59 +0100
Merge pull request #165 from Cqoicebordel/Moar-engines
Moar engines
Diffstat: 14 files changed, 191 insertions(+), 51 deletions(-)
diff --git a/searx/engines/500px.py b/searx/engines/500px.py
@@ -35,9 +35,9 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
-
+
dom = html.fromstring(resp.text)
-
+
# parse results
for result in dom.xpath('//div[@class="photo"]'):
link = result.xpath('.//a')[0]
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
@@ -81,7 +81,7 @@ def load_engine(engine_data):
if engine_attr.startswith('_'):
continue
if getattr(engine, engine_attr) is None:
- print('[E] Engine config error: Missing attribute "{0}.{1}"'\
+ print('[E] Engine config error: Missing attribute "{0}.{1}"'
.format(engine.name, engine_attr))
sys.exit(1)
@@ -102,7 +102,7 @@ def load_engine(engine_data):
if engine.shortcut:
# TODO check duplications
if engine.shortcut in engine_shortcuts:
- print('[E] Engine config error: ambigious shortcut: {0}'\
+ print('[E] Engine config error: ambigious shortcut: {0}'
.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
@@ -0,0 +1,67 @@
+## Digg (News, Social media)
+#
+# @website https://digg.com/
+# @provide-api no
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate, thumbnail
+
+from urllib import quote_plus
+from json import loads
+from lxml import html
+from cgi import escape
+from dateutil import parser
+
+# engine dependent config
+categories = ['news', 'social media']
+paging = True
+
+# search-url
+base_url = 'https://digg.com/'
+search_url = base_url+'api/search/{query}.json?position={position}&format=html'
+
+# specific xpath variables
+results_xpath = '//article'
+link_xpath = './/small[@class="time"]//a'
+title_xpath = './/h2//a//text()'
+content_xpath = './/p//text()'
+pubdate_xpath = './/time'
+
+
+# do search-request
+def request(query, params):
+ offset = (params['pageno'] - 1) * 10
+ params['url'] = search_url.format(position=offset,
+ query=quote_plus(query))
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_result = loads(resp.text)
+
+ dom = html.fromstring(search_result['html'])
+
+ # parse results
+ for result in dom.xpath(results_xpath):
+ url = result.attrib.get('data-contenturl')
+ thumbnail = result.xpath('.//img')[0].attrib.get('src')
+ title = ''.join(result.xpath(title_xpath))
+ content = escape(''.join(result.xpath(content_xpath)))
+ pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
+ publishedDate = parser.parse(pubdate)
+
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'template': 'videos.html',
+ 'publishedDate': publishedDate,
+ 'thumbnail': thumbnail})
+
+ # return results
+ return results
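For reference, the request() helper in the new digg engine above just substitutes the page offset and the URL-quoted query into search_url. A minimal standalone sketch of that formatting (written with the Python 3 urllib spelling; the query and page values are invented):

    from urllib.parse import quote_plus  # the engine itself imports Python 2's urllib.quote_plus

    base_url = 'https://digg.com/'
    search_url = base_url + 'api/search/{query}.json?position={position}&format=html'

    def build_digg_url(query, pageno):
        # mirrors request() above: ten results per page, zero-based offset
        offset = (pageno - 1) * 10
        return search_url.format(position=offset, query=quote_plus(query))

    print(build_digg_url('open source', 2))
    # -> https://digg.com/api/search/open+source.json?position=10&format=html
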
diff --git a/searx/engines/flickr-noapi.py b/searx/engines/flickr-noapi.py
@@ -53,7 +53,8 @@ def response(resp):
for photo in photos:
- # In paged configuration, the first pages' photos are represented by a None object
+ # In paged configuration, the first pages' photos
+ # are represented by a None object
if photo is None:
continue
@@ -74,10 +75,15 @@ def response(resp):
title = photo['title']
- content = '<span class="photo-author">' + photo['owner']['username'] + '</span><br />'
+ content = '<span class="photo-author">' +\
+ photo['owner']['username'] +\
+ '</span><br />'
if 'description' in photo:
- content = content + '<span class="description">' + photo['description'] + '</span>'
+ content = content +\
+ '<span class="description">' +\
+ photo['description'] +\
+ '</span>'
# append result
results.append({'url': url,
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python
## Flickr (Images)
-#
+#
# @website https://www.flickr.com
-# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
-#
+# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
+#
# @using-api yes
# @results JSON
# @stable yes
@@ -18,16 +18,20 @@ categories = ['images']
nb_per_page = 15
paging = True
-api_key= None
+api_key = None
-url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={api_key}&{text}&sort=relevance&extras=description%2C+owner_name%2C+url_o%2C+url_z&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
+url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
+ '&api_key={api_key}&{text}&sort=relevance' +\
+ '&extras=description%2C+owner_name%2C+url_o%2C+url_z' +\
+ '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
paging = True
+
def build_flickr_url(user_id, photo_id):
- return photo_url.format(userid=user_id,photoid=photo_id)
+ return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
@@ -40,7 +44,7 @@ def request(query, params):
def response(resp):
results = []
-
+
search_results = loads(resp.text)
# return empty array if there are no results
@@ -64,11 +68,14 @@ def response(resp):
url = build_flickr_url(photo['owner'], photo['id'])
title = photo['title']
-
- content = '<span class="photo-author">'+ photo['ownername'] +'</span><br />'
-
- content = content + '<span class="description">' + photo['description']['_content'] + '</span>'
-
+
+ content = '<span class="photo-author">' +\
+ photo['ownername'] +\
+ '</span><br />' +\
+ '<span class="description">' +\
+ photo['description']['_content'] +\
+ '</span>'
+
# append result
results.append({'url': url,
'title': title,
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
@@ -24,7 +24,7 @@ search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
magnet_xpath = './/a[@title="Torrent magnet link"]'
-#content_xpath = './/font[@class="detDesc"]//text()'
+content_xpath = './/span[@class="font11px lightgrey block"]'
# do search-request
@@ -56,7 +56,8 @@ def response(resp):
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = ' '.join(link.xpath('.//text()'))
- content = escape(html.tostring(result.xpath('.//span[@class="font11px lightgrey block"]')[0], method="text"))
+ content = escape(html.tostring(result.xpath(content_xpath)[0],
+ method="text"))
seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
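The kickass change above swaps the commented-out font XPath for a named content_xpath and serializes the matched element with html.tostring(..., method="text"), which keeps only the text nodes. A small self-contained lxml illustration (the markup is invented):

    from lxml import html

    content_xpath = './/span[@class="font11px lightgrey block"]'

    # invented fragment standing in for one result row
    row = html.fromstring('<div><span class="font11px lightgrey block">'
                          'Posted by <b>alice</b> in Movies</span></div>')

    # method="text" drops the tags and returns only the text content (bytes on Python 3)
    print(html.tostring(row.xpath(content_xpath)[0], method="text"))
    # -> b'Posted by alice in Movies'
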
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
@@ -11,7 +11,6 @@
from urllib import urlencode
from json import loads
import cgi
-import re
# engine dependent config
categories = ['it']
@@ -33,7 +32,7 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
-
+
search_results = loads(resp.text)
# parse results
@@ -41,21 +40,22 @@ def response(resp):
href = result['url']
title = "" + result['name'] + " - " + result['filename']
content = result['repo'] + "<br />"
-
+
lines = dict()
for line, code in result['lines'].items():
lines[int(line)] = code
content = content + '<pre class="code-formatter"><table class="code">'
for line, code in sorted(lines.items()):
- content = content + '<tr><td class="line-number" style="padding-right:5px;">'
- content = content + str(line) + '</td><td class="code-snippet">'
- # Replace every two spaces with ' &nbps;' to keep formatting while allowing the browser to break the line if necessary
- content = content + cgi.escape(code).replace('\t', ' ').replace(' ', ' ').replace(' ', ' ')
+ content = content + '<tr><td class="line-number" style="padding-right:5px;">'
+ content = content + str(line) + '</td><td class="code-snippet">'
+ # Replace every two spaces with ' &nbps;' to keep formatting
+ # while allowing the browser to break the line if necessary
+ content = content + cgi.escape(code).replace('\t', ' ').replace(' ', ' ').replace(' ', ' ')
content = content + "</td></tr>"
-
+
content = content + "</table></pre>"
-
+
# append result
results.append({'url': href,
'title': title,
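The loop in searchcode_code.py above emits one table row per source line, escaping the code and padding whitespace with non-breaking spaces so indentation survives in HTML. A rough standalone sketch of the same idea (the exact replacement strings and the sample lines are illustrative; cgi.escape is the Python 2 spelling and was removed in Python 3.8):

    import cgi  # Python 2 stdlib; on Python 3 use html.escape instead

    lines = {12: 'def answer():', 13: '\treturn  42'}

    content = '<pre class="code-formatter"><table class="code">'
    for line, code in sorted(lines.items()):
        content += '<tr><td class="line-number" style="padding-right:5px;">'
        content += str(line) + '</td><td class="code-snippet">'
        # tabs become plain spaces and double spaces become ' &nbsp;'
        # so the browser keeps indentation but can still wrap long lines
        content += cgi.escape(code).replace('\t', '    ').replace('  ', ' &nbsp;')
        content += '</td></tr>'
    content += '</table></pre>'
    print(content)
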
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
@@ -31,15 +31,22 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
-
+
search_results = loads(resp.text)
# parse results
for result in search_results['results']:
href = result['url']
- title = "[" + result['type'] + "] " + result['namespace'] + " " + result['name']
- content = '<span class="highlight">[' + result['type'] + "] " + result['name'] + " " + result['synopsis'] + "</span><br />" + result['description']
-
+ title = "[" + result['type'] + "] " +\
+ result['namespace'] +\
+ " " + result['name']
+ content = '<span class="highlight">[' +\
+ result['type'] + "] " +\
+ result['name'] + " " +\
+ result['synopsis'] +\
+ "</span><br />" +\
+ result['description']
+
# append result
results.append({'url': href,
'title': title,
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
@@ -60,10 +60,14 @@ def response(resp):
content = result.xpath('.//div[contains(@class,"red")]//text()')[0]
content = content + " - "
- content = content + html.tostring(result.xpath('.//div[contains(@class,"grey-web")]')[0], method='text')
+ text = result.xpath('.//div[contains(@class,"grey-web")]')[0]
+ content = content + html.tostring(text, method='text')
if result.xpath(".//span") != []:
- content = content + " - (" + result.xpath(".//span//text()")[0].strip() + ")"
+ content = content +\
+ " - (" +\
+ result.xpath(".//span//text()")[0].strip() +\
+ ")"
# append result
results.append({'url': href,
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
@@ -1,6 +1,6 @@
## Twitter (Social media)
#
-# @website https://www.bing.com/news
+# @website https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api no
@@ -14,6 +14,7 @@ from urlparse import urljoin
from urllib import urlencode
from lxml import html
from cgi import escape
+from datetime import datetime
# engine dependent config
categories = ['social media']
@@ -27,7 +28,8 @@ search_url = base_url+'search?'
results_xpath = '//li[@data-item-type="tweet"]'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[@class="username js-action-profile-name"]//text()'
-content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
+content_xpath = './/p[@class="js-tweet-text tweet-text"]'
+timestamp_xpath = './/span[contains(@class,"_timestamp")]'
# do search-request
@@ -52,12 +54,21 @@ def response(resp):
link = tweet.xpath(link_xpath)[0]
url = urljoin(base_url, link.attrib.get('href'))
title = ''.join(tweet.xpath(title_xpath))
- content = escape(''.join(tweet.xpath(content_xpath)))
-
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
+ pubdate = tweet.xpath(timestamp_xpath)
+ if len(pubdate) > 0:
+ timestamp = float(pubdate[0].attrib.get('data-time'))
+ publishedDate = datetime.fromtimestamp(timestamp, None)
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': publishedDate})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results
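The new timestamp handling in the twitter engine above reads the data-time attribute (Unix epoch seconds) from the tweet markup and converts it with datetime.fromtimestamp. A minimal standalone sketch of that conversion (the HTML fragment and epoch value are invented):

    from datetime import datetime
    from lxml import html

    timestamp_xpath = './/span[contains(@class,"_timestamp")]'

    # invented fragment standing in for one tweet
    tweet = html.fromstring('<div data-item-type="tweet">'
                            '<span class="_timestamp js-short-timestamp"'
                            ' data-time="1420117200"></span></div>')

    pubdate = tweet.xpath(timestamp_xpath)
    if len(pubdate) > 0:
        timestamp = float(pubdate[0].attrib.get('data-time'))
        publishedDate = datetime.fromtimestamp(timestamp, None)
        print(publishedDate)  # local-time equivalent of 2015-01-01 13:00:00 UTC
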
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
@@ -154,7 +154,6 @@ def load_https_rules(rules_path):
print(' * {n} https-rules loaded'.format(n=len(https_rules)))
-
def https_url_rewrite(result):
skip_https_rewrite = False
# check if HTTPS rewrite is possible
diff --git a/searx/search.py b/searx/search.py
@@ -69,11 +69,16 @@ def threaded_requests(requests):
print('engine timeout: {0}'.format(th._engine_name))
-
# get default reqest parameter
def default_request_params():
return {
- 'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}, 'verify': True}
+ 'method': 'GET',
+ 'headers': {},
+ 'data': {},
+ 'url': '',
+ 'cookies': {},
+ 'verify': True
+ }
# create a callback wrapper for the search engine results
@@ -487,14 +492,15 @@ class Search(object):
continue
# append request to list
- requests.append((req, request_params['url'], request_args, selected_engine['name']))
+ requests.append((req, request_params['url'],
+ request_args,
+ selected_engine['name']))
if not requests:
return results, suggestions, answers, infoboxes
# send all search-request
threaded_requests(requests)
-
while not results_queue.empty():
engine_name, engine_results = results_queue.get_nowait()
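The default_request_params() helper reformatted earlier in this file's diff is the baseline dict that each engine's request() fills in before the request is issued. A rough sketch of how such a dict maps onto an outgoing HTTP call, using the requests library purely for illustration (the URL and header values are invented):

    import requests  # for illustration only

    def default_request_params():
        # same shape as the helper above
        return {
            'method': 'GET',
            'headers': {},
            'data': {},
            'url': '',
            'cookies': {},
            'verify': True
        }

    params = default_request_params()
    params['url'] = 'https://example.com/search?q=searx'   # invented
    params['headers']['User-Agent'] = 'searx/0.7.0'         # invented

    resp = requests.request(params['method'], params['url'],
                            headers=params['headers'],
                            data=params['data'],
                            cookies=params['cookies'],
                            verify=params['verify'],
                            timeout=5.0)
    print(resp.status_code)
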
diff --git a/searx/settings.yml b/searx/settings.yml
@@ -44,6 +44,10 @@ engines:
- name : ddg definitions
engine : duckduckgo_definitions
shortcut : ddd
+
+ - name : digg
+ engine : digg
+ shortcut : dg
- name : wikidata
engine : wikidata
@@ -99,6 +103,33 @@ engines:
engine : google_news
shortcut : gon
+ - name : google play apps
+ engine : xpath
+ search_url : https://play.google.com/store/search?q={query}&c=apps
+ url_xpath : //a[@class="title"]/@href
+ title_xpath : //a[@class="title"]
+ content_xpath : //a[@class="subtitle"]
+ categories : files
+ shortcut : gpa
+
+ - name : google play movies
+ engine : xpath
+ search_url : https://play.google.com/store/search?q={query}&c=movies
+ url_xpath : //a[@class="title"]/@href
+ title_xpath : //a[@class="title"]
+ content_xpath : //a[@class="subtitle"]
+ categories : videos
+ shortcut : gpm
+
+ - name : google play music
+ engine : xpath
+ search_url : https://play.google.com/store/search?q={query}&c=music
+ url_xpath : //a[@class="title"]/@href
+ title_xpath : //a[@class="title"]
+ content_xpath : //a[@class="subtitle"]
+ categories : music
+ shortcut : gps
+
- name : openstreetmap
engine : openstreetmap
shortcut : osm
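The three Google Play entries added above reuse searx's generic xpath engine: search_url is fetched with {query} substituted, and url_xpath, title_xpath and content_xpath are applied to the returned page. A standalone sketch of that idea with lxml (not the engine's actual code; the sample HTML is invented):

    from urllib.parse import quote_plus
    from lxml import html

    search_url = 'https://play.google.com/store/search?q={query}&c=apps'
    url_xpath = '//a[@class="title"]/@href'
    title_xpath = '//a[@class="title"]'
    content_xpath = '//a[@class="subtitle"]'

    print(search_url.format(query=quote_plus('example query')))
    # -> https://play.google.com/store/search?q=example+query&c=apps

    # invented page standing in for a search response
    page = html.fromstring(
        '<div>'
        '<a class="title" href="/store/apps/details?id=org.example.app">Example App</a>'
        '<a class="subtitle">Example Publisher</a>'
        '</div>')

    results = []
    for url, title, content in zip(page.xpath(url_xpath),
                                   page.xpath(title_xpath),
                                   page.xpath(content_xpath)):
        results.append({'url': url,
                        'title': title.text_content().strip(),
                        'content': content.text_content().strip()})
    print(results)
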
diff --git a/searx/utils.py b/searx/utils.py
@@ -30,8 +30,9 @@ def gen_useragent():
def searx_useragent():
- return 'searx/{searx_version} {suffix}'.format(searx_version=VERSION_STRING,
- suffix=settings['server'].get('useragent_suffix', ''))
+ return 'searx/{searx_version} {suffix}'.format(
+ searx_version=VERSION_STRING,
+ suffix=settings['server'].get('useragent_suffix', ''))
def highlight_content(content, query):