commit: 16bdc0baf4f2b56af000337c4a2fa1e689f1220c
parent: 28f12ef5a0917b8cefddb4d5f74c9aaeb945355f
Author: Adam Tauber <asciimoo@gmail.com>
Date: Fri, 9 Dec 2016 11:44:24 +0100
[mod] do not escape html content in engines
Diffstat:
30 files changed, 56 insertions(+), 97 deletions(-)
diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
@@ -12,7 +12,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
for result in dom.xpath(xpath_results):
link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href'))
- title = escape(extract_text(link))
+ title = extract_text(link)
results.append({'url': href,
'title': title})
diff --git a/searx/engines/base.py b/searx/engines/base.py
@@ -16,7 +16,6 @@
from lxml import etree
from urllib import urlencode
from searx.utils import searx_useragent
-from cgi import escape
from datetime import datetime
import re
@@ -94,7 +93,7 @@ def response(resp):
url = item.text
elif item.attrib["name"] == "dcdescription":
- content = escape(item.text[:300])
+ content = item.text[:300]
if len(item.text) > 300:
content += "..."
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
@@ -14,7 +14,6 @@
"""
from urllib import urlencode
-from cgi import escape
from lxml import html
from searx.engines.xpath import extract_text
@@ -61,7 +60,7 @@ def response(resp):
link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
- content = escape(extract_text(result.xpath('.//p')))
+ content = extract_text(result.xpath('.//p'))
# append result
results.append({'url': url,
@@ -73,7 +72,7 @@ def response(resp):
link = result.xpath('.//h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
- content = escape(extract_text(result.xpath('.//p')))
+ content = extract_text(result.xpath('.//p'))
# append result
results.append({'url': url,
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
@@ -11,7 +11,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
@@ -51,8 +50,8 @@ def response(resp):
for result in search_res:
link = result.xpath('.//td[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
- title = escape(extract_text(link))
- content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+ title = extract_text(link)
+ content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
content = "<br />".join(content.split("\n"))
filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
@@ -14,7 +14,6 @@
from urllib import urlencode
from json import loads
-from cgi import escape
from datetime import datetime
# engine dependent config
@@ -57,7 +56,7 @@ def response(resp):
for res in search_res['list']:
title = res['title']
url = res['url']
- content = escape(res['description'])
+ content = res['description']
thumbnail = res['thumbnail_360_url']
publishedDate = datetime.fromtimestamp(res['created_time'], None)
embedded = embedded_url.format(videoid=res['id'])
diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py
@@ -51,10 +51,11 @@ def response(resp):
if url.startswith('http://'):
url = 'https' + url[4:]
- content = result['artist']['name'] +\
- " • " +\
- result['album']['title'] +\
- " • " + result['title']
+ content = '{} - {} - {}'.format(
+ result['artist']['name'],
+ result['album']['title'],
+ result['title'])
+
embedded = embedded_url.format(audioid=result['id'])
# append result
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
@@ -12,7 +12,6 @@
import re
from urlparse import urljoin
from lxml import html
-from cgi import escape
from searx.utils import is_valid_lang
categories = ['general']
@@ -62,8 +61,8 @@ def response(resp):
results.append({
'url': urljoin(resp.url, '?%d' % k),
- 'title': escape(from_result.text_content()),
- 'content': escape('; '.join(to_results))
+ 'title': from_result.text_content(),
+ 'content': '; '.join(to_results)
})
return results
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
@@ -13,7 +13,6 @@
from urllib import quote_plus
from json import loads
from lxml import html
-from cgi import escape
from dateutil import parser
# engine dependent config
@@ -56,7 +55,7 @@ def response(resp):
url = result.attrib.get('data-contenturl')
thumbnail = result.xpath('.//img')[0].attrib.get('src')
title = ''.join(result.xpath(title_xpath))
- content = escape(''.join(result.xpath(content_xpath)))
+ content = ''.join(result.xpath(content_xpath))
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
publishedDate = parser.parse(pubdate)
diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
@@ -9,7 +9,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import urlencode
from searx.engines.xpath import extract_text
from lxml import html
@@ -43,7 +42,7 @@ def response(resp):
img_src = app.xpath('.//img/@src')[0]
content = extract_text(app.xpath('./p')[0])
- content = escape(content.replace(title, '', 1).strip())
+ content = content.replace(title, '', 1).strip()
results.append({'url': url,
'title': title,
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
@@ -77,21 +77,13 @@ def response(resp):
url = build_flickr_url(photo['owner'], photo['id'])
- title = photo['title']
-
- content = '<span class="photo-author">' +\
- photo['ownername'] +\
- '</span><br />' +\
- '<span class="description">' +\
- photo['description']['_content'] +\
- '</span>'
-
# append result
results.append({'url': url,
- 'title': title,
+ 'title': photo['title'],
'img_src': img_src,
'thumbnail_src': thumbnail_src,
- 'content': content,
+ 'content': photo['description']['_content'],
+ 'author': photo['ownername'],
'template': 'images.html'})
# return results
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
@@ -102,16 +102,15 @@ def response(resp):
title = photo.get('title', '')
- content = '<span class="photo-author">' +\
- photo['username'] +\
- '</span><br />'
+ author = photo['username']
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
- 'content': content,
+ 'content': '',
+ 'author': author,
'template': 'images.html'})
return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from json import loads
from random import randint
from time import time
@@ -78,8 +77,8 @@ def response(resp):
for result in response_json['results']:
# append result
results.append({'url': result['url'],
- 'title': escape(result['title']),
- 'content': escape(result['sum'])})
+ 'title': result['title'],
+ 'content': result['sum']})
# return results
return results
diff --git a/searx/engines/github.py b/searx/engines/github.py
@@ -12,7 +12,6 @@
from urllib import urlencode
from json import loads
-from cgi import escape
# engine dependent config
categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
url = res['html_url']
if res['description']:
- content = escape(res['description'][:500])
+ content = res['description'][:500]
else:
content = ''
diff --git a/searx/engines/google.py b/searx/engines/google.py
@@ -9,7 +9,6 @@
# @parse url, title, content, suggestion
import re
-from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html, etree
@@ -155,7 +154,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return escape(extract_text(r[0]))
+ return extract_text(r[0])
return None
@@ -264,7 +263,7 @@ def response(resp):
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': escape(extract_text(suggestion))})
+ results.append({'suggestion': extract_text(suggestion)})
# return results
return results
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
@@ -11,7 +11,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
@@ -57,7 +56,7 @@ def response(resp):
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = extract_text(link)
- content = escape(extract_text(result.xpath(content_xpath)))
+ content = extract_text(result.xpath(content_xpath))
seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
@@ -9,7 +9,6 @@
@parse url, title, content, seed, leech, torrentfile
"""
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):
# torrent title
page_a = result.xpath(xpath_title)[0]
- title = escape(extract_text(page_a))
+ title = extract_text(page_a)
# link to the page
href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
try:
file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
file_size = int(float(file_size) * get_filesize_mul(suffix))
- except Exception as e:
+ except:
file_size = None
# seed count
@@ -105,7 +104,6 @@ def response(resp):
# content string contains all information not included into template
content = 'Category: "{category}". Downloaded {downloads} times.'
content = content.format(category=category, downloads=downloads)
- content = escape(content)
results.append({'url': href,
'title': title,
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
@@ -9,7 +9,6 @@
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
link = result.xpath('.//div[@class="detName"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
- content = escape(extract_text(result.xpath(content_xpath)))
+ content = extract_text(result.xpath(content_xpath))
seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
# convert seed to int if possible
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
@@ -11,7 +11,6 @@
"""
import json
-from cgi import escape
from urllib import urlencode
from urlparse import urlparse, urljoin
from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
img_results.append(params)
else:
created = datetime.fromtimestamp(data['created_utc'])
- content = escape(data['selftext'])
+ content = data['selftext']
if len(content) > 500:
content = content[:500] + '...'
params['content'] = content
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
@@ -44,20 +44,12 @@ def response(resp):
# parse results
for result in search_results.get('results', []):
href = result['url']
- title = "[" + result['type'] + "] " +\
- result['namespace'] +\
- " " + result['name']
- content = '<span class="highlight">[' +\
- result['type'] + "] " +\
- result['name'] + " " +\
- result['synopsis'] +\
- "</span><br />" +\
- result['description']
+ title = "[{}] {} {}".format(result['type'], result['namespace'], result['name'])
# append result
results.append({'url': href,
'title': title,
- 'content': content})
+ 'content': result['description']})
# return results
return results
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
@@ -9,7 +9,6 @@
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
@@ -46,10 +46,11 @@ def response(resp):
if result['type'] == 'track':
title = result['name']
url = result['external_urls']['spotify']
- content = result['artists'][0]['name'] +\
- " • " +\
- result['album']['name'] +\
- " • " + result['name']
+ content = '{} - {} - {}'.format(
+ result['artists'][0]['name'],
+ result['album']['name'],
+ result['name'])
+
embedded = embedded_url.format(audioid=result['id'])
# append result
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
@@ -11,7 +11,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -48,8 +47,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(url, link.attrib.get('href'))
- title = escape(extract_text(link))
- content = escape(extract_text(result.xpath(content_xpath)))
+ title = extract_text(link)
+ content = extract_text(result.xpath(content_xpath))
# append result
results.append({'url': href,
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -11,7 +11,6 @@
# @todo paging
from lxml import html
-from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re
@@ -79,10 +78,10 @@ def response(resp):
if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
continue
- title = escape(extract_text(link))
+ title = extract_text(link)
if result.xpath('./p[@class="desc clk"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
+ content = extract_text(result.xpath('./p[@class="desc clk"]'))
else:
content = ''
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import quote_plus
from lxml import html
from searx.languages import language_codes
@@ -59,7 +58,7 @@ def response(resp):
elif search_lang:
href = href + search_lang + '/'
- title = escape(extract_text(link))
+ title = extract_text(link)
content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
content = content + " - "
@@ -75,7 +74,7 @@ def response(resp):
# append result
results.append({'url': href,
'title': title,
- 'content': escape(content)})
+ 'content': content})
# return results
return results
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from json import loads
from urllib import urlencode, unquote
import re
@@ -78,7 +77,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': escape(result['Title']),
+ 'title': result['Title'],
'content': '',
'img_src': img_url,
'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):
# append result
results.append({'url': result_url,
- 'title': escape(result_title),
- 'content': escape(result_content)})
+ 'title': result_title,
+ 'content': result_content})
# parse images
for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': escape(result['Title']),
+ 'title': result['Title'],
'content': '',
'img_src': img_url,
'template': 'images.html'})
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
@@ -11,7 +11,6 @@
"""
import re
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
@@ -12,7 +12,6 @@
"""
import re
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
@@ -9,7 +9,6 @@
@parse url, title, content
"""
import re
-from cgi import escape
from searx.utils import is_valid_lang
categories = ['general']
@@ -52,14 +51,14 @@ def request(query, params):
def response(resp):
results = []
results.append({
- 'url': escape(web_url.format(
+ 'url': web_url.format(
from_lang=resp.search_params['from_lang'][2],
to_lang=resp.search_params['to_lang'][2],
- query=resp.search_params['query'])),
- 'title': escape('[{0}-{1}] {2}'.format(
+ query=resp.search_params['query']),
+ 'title': '[{0}-{1}] {2}'.format(
resp.search_params['from_lang'][1],
resp.search_params['to_lang'][1],
- resp.search_params['query'])),
- 'content': escape(resp.json()['responseData']['translatedText'])
+ resp.search_params['query']),
+ 'content': resp.json()['responseData']['translatedText']
})
return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
@@ -8,7 +8,6 @@
# @stable no
# @parse url, infobox
-from cgi import escape
from json import loads
from time import time
from urllib import urlencode
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
@@ -9,7 +9,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.search import logger
@@ -52,8 +51,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
try:
res = {'url': result.xpath(url_xpath)[0],
- 'title': escape(''.join(result.xpath(title_xpath))),
- 'content': escape(''.join(result.xpath(content_xpath)))}
+ 'title': ''.join(result.xpath(title_xpath)),
+ 'content': ''.join(result.xpath(content_xpath))}
except:
logger.exception('yandex parse crash')
continue