
searx

My custom branch(es) on searx, a meta-search engine
commit: 16bdc0baf4f2b56af000337c4a2fa1e689f1220c
parent: 28f12ef5a0917b8cefddb4d5f74c9aaeb945355f
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Fri,  9 Dec 2016 11:44:24 +0100

[mod] do not escape html content in engines
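
Most hunks below apply the same change: the engine modules stop wrapping their result fields in cgi.escape() and return plain text, presumably leaving HTML escaping to the layer that renders the results. A few engines (deezer, spotify, flickr, searchcode_doc) also drop HTML markup such as "&bull;" and <span> tags from the strings they build, and the flickr engines move the owner name into a separate 'author' field. A minimal before/after sketch of the main pattern, reusing the XPath and extract_text() call from the bing.py hunk; the parse_result_* function names are illustrative only and not part of the commit:

    from cgi import escape                       # Python 2 helper the engines no longer import
    from searx.engines.xpath import extract_text

    def parse_result_before(result):
        # old behaviour: each engine HTML-escaped its own content string
        link = result.xpath('.//h3/a')[0]
        return {'url': link.attrib.get('href'),
                'title': extract_text(link),
                'content': escape(extract_text(result.xpath('.//p')))}

    def parse_result_after(result):
        # new behaviour: engines return raw text and do no escaping themselves
        link = result.xpath('.//h3/a')[0]
        return {'url': link.attrib.get('href'),
                'title': extract_text(link),
                'content': extract_text(result.xpath('.//p'))}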

Diffstat:

M searx/engines/archlinux.py           |  3 +--
M searx/engines/base.py                |  3 +--
M searx/engines/bing.py                |  5 ++---
M searx/engines/btdigg.py              |  5 ++---
M searx/engines/dailymotion.py         |  3 +--
M searx/engines/deezer.py              |  9 +++++----
M searx/engines/dictzone.py            |  5 ++---
M searx/engines/digg.py                |  3 +--
M searx/engines/fdroid.py              |  3 +--
M searx/engines/flickr.py              | 14 +++-----------
M searx/engines/flickr_noapi.py        |  7 +++----
M searx/engines/gigablast.py           |  5 ++---
M searx/engines/github.py              |  3 +--
M searx/engines/google.py              |  5 ++---
M searx/engines/kickass.py             |  3 +--
M searx/engines/nyaa.py                |  6 ++----
M searx/engines/piratebay.py           |  3 +--
M searx/engines/reddit.py              |  3 +--
M searx/engines/searchcode_doc.py      | 12 ++----------
M searx/engines/seedpeer.py            |  1 -
M searx/engines/spotify.py             |  9 +++++----
M searx/engines/stackoverflow.py       |  5 ++---
M searx/engines/startpage.py           |  5 ++---
M searx/engines/subtitleseeker.py      |  5 ++---
M searx/engines/swisscows.py           |  9 ++++-----
M searx/engines/tokyotoshokan.py       |  1 -
M searx/engines/torrentz.py            |  1 -
M searx/engines/translated.py          | 11 +++++------
M searx/engines/wolframalpha_noapi.py  |  1 -
M searx/engines/yandex.py              |  5 ++---
30 files changed, 56 insertions(+), 97 deletions(-)

diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
@@ -12,7 +12,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
     for result in dom.xpath(xpath_results):
         link = result.xpath(xpath_link)[0]
         href = urljoin(base_url, link.attrib.get('href'))
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         results.append({'url': href, 'title': title})
diff --git a/searx/engines/base.py b/searx/engines/base.py
@@ -16,7 +16,6 @@
 from lxml import etree
 from urllib import urlencode
 from searx.utils import searx_useragent
-from cgi import escape
 from datetime import datetime
 import re
@@ -94,7 +93,7 @@ def response(resp):
                 url = item.text
 
             elif item.attrib["name"] == "dcdescription":
-                content = escape(item.text[:300])
+                content = item.text[:300]
                 if len(item.text) > 300:
                     content += "..."
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
@@ -14,7 +14,6 @@
 """
 
 from urllib import urlencode
-from cgi import escape
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -61,7 +60,7 @@ def response(resp):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))
 
         # append result
         results.append({'url': url,
@@ -73,7 +72,7 @@ def response(resp):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))
 
         # append result
         results.append({'url': url,
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -51,8 +50,8 @@ def response(resp):
     for result in search_res:
         link = result.xpath('.//td[@class="torrent_name"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+        title = extract_text(link)
+        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
         content = "<br />".join(content.split("\n"))
 
         filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
@@ -14,7 +14,6 @@
 
 from urllib import urlencode
 from json import loads
-from cgi import escape
 from datetime import datetime
 
 # engine dependent config
@@ -57,7 +56,7 @@ def response(resp):
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        content = escape(res['description'])
+        content = res['description']
         thumbnail = res['thumbnail_360_url']
         publishedDate = datetime.fromtimestamp(res['created_time'], None)
         embedded = embedded_url.format(videoid=res['id'])
diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py
@@ -51,10 +51,11 @@ def response(resp):
             if url.startswith('http://'):
                 url = 'https' + url[4:]
 
-            content = result['artist']['name'] +\
-                " &bull; " +\
-                result['album']['title'] +\
-                " &bull; " + result['title']
+            content = '{} - {} - {}'.format(
+                result['artist']['name'],
+                result['album']['title'],
+                result['title'])
+
             embedded = embedded_url.format(audioid=result['id'])
 
             # append result
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
@@ -12,7 +12,6 @@
 
 import re
 from urlparse import urljoin
 from lxml import html
-from cgi import escape
 from searx.utils import is_valid_lang
 
 categories = ['general']
@@ -62,8 +61,8 @@ def response(resp):
 
         results.append({
             'url': urljoin(resp.url, '?%d' % k),
-            'title': escape(from_result.text_content()),
-            'content': escape('; '.join(to_results))
+            'title': from_result.text_content(),
+            'content': '; '.join(to_results)
         })
 
     return results
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
@@ -13,7 +13,6 @@
 from urllib import quote_plus
 from json import loads
 from lxml import html
-from cgi import escape
 from dateutil import parser
 
 # engine dependent config
@@ -56,7 +55,7 @@ def response(resp):
         url = result.attrib.get('data-contenturl')
         thumbnail = result.xpath('.//img')[0].attrib.get('src')
         title = ''.join(result.xpath(title_xpath))
-        content = escape(''.join(result.xpath(content_xpath)))
+        content = ''.join(result.xpath(content_xpath))
         pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
         publishedDate = parser.parse(pubdate)
diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
@@ -9,7 +9,6 @@
 @parse url, title, content
 """
 
-from cgi import escape
 from urllib import urlencode
 from searx.engines.xpath import extract_text
 from lxml import html
@@ -43,7 +42,7 @@ def response(resp):
         img_src = app.xpath('.//img/@src')[0]
 
         content = extract_text(app.xpath('./p')[0])
-        content = escape(content.replace(title, '', 1).strip())
+        content = content.replace(title, '', 1).strip()
 
         results.append({'url': url,
                         'title': title,
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
@@ -77,21 +77,13 @@ def response(resp):
 
         url = build_flickr_url(photo['owner'], photo['id'])
 
-        title = photo['title']
-
-        content = '<span class="photo-author">' +\
-                  photo['ownername'] +\
-                  '</span><br />' +\
-                  '<span class="description">' +\
-                  photo['description']['_content'] +\
-                  '</span>'
-
         # append result
         results.append({'url': url,
-                        'title': title,
+                        'title': photo['title'],
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': photo['description']['_content'],
+                        'author': photo['ownername'],
                         'template': 'images.html'})
 
     # return results
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
@@ -102,16 +102,15 @@ def response(resp):
 
         title = photo.get('title', '')
 
-        content = '<span class="photo-author">' +\
-                  photo['username'] +\
-                  '</span><br />'
+        author = photo['username']
 
         # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': '',
+                        'author': author,
                         'template': 'images.html'})
 
     return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
@@ -10,7 +10,6 @@
 @parse url, title, content
 """
 
-from cgi import escape
 from json import loads
 from random import randint
 from time import time
@@ -78,8 +77,8 @@ def response(resp):
     for result in response_json['results']:
         # append result
         results.append({'url': result['url'],
-                        'title': escape(result['title']),
-                        'content': escape(result['sum'])})
+                        'title': result['title'],
+                        'content': result['sum']})
 
     # return results
     return results
diff --git a/searx/engines/github.py b/searx/engines/github.py
@@ -12,7 +12,6 @@
 
 from urllib import urlencode
 from json import loads
-from cgi import escape
 
 # engine dependent config
 categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
         url = res['html_url']
 
         if res['description']:
-            content = escape(res['description'][:500])
+            content = res['description'][:500]
         else:
             content = ''
diff --git a/searx/engines/google.py b/searx/engines/google.py
@@ -9,7 +9,6 @@
 # @parse url, title, content, suggestion
 
 import re
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
@@ -155,7 +154,7 @@ def parse_url(url_string, google_hostname):
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return escape(extract_text(r[0]))
+        return extract_text(r[0])
     return None
@@ -264,7 +263,7 @@ def response(resp):
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': escape(extract_text(suggestion))})
+        results.append({'suggestion': extract_text(suggestion)})
 
     # return results
     return results
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -57,7 +56,7 @@ def response(resp):
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
         leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
         filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
@@ -9,7 +9,6 @@
 @parse url, title, content, seed, leech, torrentfile
 """
 
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):
 
         # torrent title
         page_a = result.xpath(xpath_title)[0]
-        title = escape(extract_text(page_a))
+        title = extract_text(page_a)
 
         # link to the page
         href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
         try:
             file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
             file_size = int(float(file_size) * get_filesize_mul(suffix))
-        except Exception as e:
+        except:
             file_size = None
 
         # seed count
@@ -105,7 +104,6 @@ def response(resp):
 
         # content string contains all information not included into template
         content = 'Category: "{category}". Downloaded {downloads} times.'
         content = content.format(category=category, downloads=downloads)
-        content = escape(content)
 
         results.append({'url': href,
                         'title': title,
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
@@ -9,7 +9,6 @@
 # @parse url, title, content, seed, leech, magnetlink
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
 
         # convert seed to int if possible
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
@@ -11,7 +11,6 @@
 """
 
 import json
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, urljoin
 from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
             img_results.append(params)
         else:
             created = datetime.fromtimestamp(data['created_utc'])
-            content = escape(data['selftext'])
+            content = data['selftext']
             if len(content) > 500:
                 content = content[:500] + '...'
             params['content'] = content
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
@@ -44,20 +44,12 @@ def response(resp):
     # parse results
     for result in search_results.get('results', []):
         href = result['url']
-        title = "[" + result['type'] + "] " +\
-                result['namespace'] +\
-                " " + result['name']
-        content = '<span class="highlight">[' +\
-                  result['type'] + "] " +\
-                  result['name'] + " " +\
-                  result['synopsis'] +\
-                  "</span><br />" +\
-                  result['description']
+        title = "[{}] {} {}".format(result['type'], result['namespace'], result['name'])
 
         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': content})
+                        'content': result['description']})
 
     # return results
    return results
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
@@ -9,7 +9,6 @@
 # @parse url, title, content, seed, leech, magnetlink
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
@@ -46,10 +46,11 @@ def response(resp):
         if result['type'] == 'track':
             title = result['name']
             url = result['external_urls']['spotify']
-            content = result['artists'][0]['name'] +\
-                " &bull; " +\
-                result['album']['name'] +\
-                " &bull; " + result['name']
+            content = '{} - {} - {}'.format(
+                result['artists'][0]['name'],
+                result['album']['name'],
+                result['name'])
+
             embedded = embedded_url.format(audioid=result['id'])
 
             # append result
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -48,8 +47,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
-       title = escape(extract_text(link))
-       content = escape(extract_text(result.xpath(content_xpath)))
+       title = extract_text(link)
+       content = extract_text(result.xpath(content_xpath))
 
        # append result
        results.append({'url': href,
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -11,7 +11,6 @@
 # @todo paging
 
 from lxml import html
-from cgi import escape
 from dateutil import parser
 from datetime import datetime, timedelta
 import re
@@ -79,10 +78,10 @@ def response(resp):
         if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue
 
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         if result.xpath('./p[@class="desc clk"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
+            content = extract_text(result.xpath('./p[@class="desc clk"]'))
         else:
             content = ''
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
@@ -10,7 +10,6 @@
 @parse url, title, content
 """
 
-from cgi import escape
 from urllib import quote_plus
 from lxml import html
 from searx.languages import language_codes
@@ -59,7 +58,7 @@ def response(resp):
         elif search_lang:
             href = href + search_lang + '/'
 
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
         content = content + " - "
@@ -75,7 +74,7 @@ def response(resp):
         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': escape(content)})
+                        'content': content})
 
     # return results
    return results
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
@@ -10,7 +10,6 @@
 @parse url, title, content
 """
 
-from cgi import escape
 from json import loads
 from urllib import urlencode, unquote
 import re
@@ -78,7 +77,7 @@ def response(resp):
 
            # append result
            results.append({'url': result['SourceUrl'],
-                           'title': escape(result['Title']),
+                           'title': result['Title'],
                            'content': '',
                            'img_src': img_url,
                            'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):
 
            # append result
            results.append({'url': result_url,
-                           'title': escape(result_title),
-                           'content': escape(result_content)})
+                           'title': result_title,
+                           'content': result_content})
 
    # parse images
    for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):
 
        # append result
        results.append({'url': result['SourceUrl'],
-                       'title': escape(result['Title']),
+                       'title': result['Title'],
                        'content': '',
                        'img_src': img_url,
                        'template': 'images.html'})
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
@@ -11,7 +11,6 @@
 """
 
 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
@@ -12,7 +12,6 @@
 """
 
 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
@@ -9,7 +9,6 @@
 @parse url, title, content
 """
 import re
-from cgi import escape
 from searx.utils import is_valid_lang
 
 categories = ['general']
@@ -52,14 +51,14 @@ def request(query, params):
 def response(resp):
     results = []
     results.append({
-        'url': escape(web_url.format(
+        'url': web_url.format(
             from_lang=resp.search_params['from_lang'][2],
             to_lang=resp.search_params['to_lang'][2],
-            query=resp.search_params['query'])),
-        'title': escape('[{0}-{1}] {2}'.format(
+            query=resp.search_params['query']),
+        'title': '[{0}-{1}] {2}'.format(
             resp.search_params['from_lang'][1],
             resp.search_params['to_lang'][1],
-            resp.search_params['query'])),
-        'content': escape(resp.json()['responseData']['translatedText'])
+            resp.search_params['query']),
+        'content': resp.json()['responseData']['translatedText']
     })
 
     return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
@@ -8,7 +8,6 @@
 # @stable no
 # @parse url, infobox
 
-from cgi import escape
 from json import loads
 from time import time
 from urllib import urlencode
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
@@ -9,7 +9,6 @@
 @parse url, title, content
 """
 
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.search import logger
@@ -52,8 +51,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         try:
             res = {'url': result.xpath(url_xpath)[0],
-                   'title': escape(''.join(result.xpath(title_xpath))),
-                   'content': escape(''.join(result.xpath(content_xpath)))}
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
         except:
             logger.exception('yandex parse crash')
             continue