logo

searx

searx — a privacy-respecting, hackable metasearch engine.
commit: f36d1e28fae212b8b8640324d2e787b73305e2d2
parent: 55dfb305a0057b8e94706ae152bb61d07772f334
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Mon,  1 Sep 2014 18:30:55 +0200

Merge pull request #88 from pointhi/engines

update and fix search engines

Diffstat:

searx/engines/bing.py | 45+++++++++++++++++++++++++++++++++++++++------
searx/engines/bing_images.py | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
searx/engines/bing_news.py | 70+++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
searx/engines/dailymotion.py | 54+++++++++++++++++++++++++++++++++++-------------------
searx/engines/google.py | 28++++++++++++++++++++++++----
searx/engines/google_images.py | 31++++++++++++++++++++++++++-----
searx/engines/google_news.py | 31+++++++++++++++++++++++++------
searx/engines/vimeo.py | 62++++++++++++++++++++++++++++++++++++++++----------------------
searx/engines/yahoo.py | 47+++++++++++++++++++++++++++++++++++++++++------
searx/engines/yahoo_news.py | 42+++++++++++++++++++++++++++++++-----------
searx/settings.yml | 11++++++-----
11 files changed, 401 insertions(+), 101 deletions(-)

diff --git a/searx/engines/bing.py b/searx/engines/bing.py @@ -1,48 +1,81 @@ +## Bing (Web) +# +# @website https://www.bing.com +# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month +# +# @using-api no (because of query limit) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content +# +# @todo publishedDate + from urllib import urlencode from cgi import escape from lxml import html -base_url = 'http://www.bing.com/' -search_string = 'search?{query}&first={offset}' +# engine dependent config +categories = ['general'] paging = True language_support = True +# search-url +base_url = 'https://www.bing.com/' +search_string = 'search?{query}&first={offset}' +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 + if params['language'] == 'all': language = 'en-US' else: language = params['language'].replace('_', '-') + search_path = search_string.format( query=urlencode({'q': query, 'setmkt': language}), offset=offset) params['cookies']['SRCHHPGUSR'] = \ 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] - #if params['category'] == 'images': - # params['url'] = base_url + 'images/' + search_path + params['url'] = base_url + search_path return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.content) + + # parse results for result in dom.xpath('//div[@class="sa_cc"]'): link = result.xpath('.//h3/a')[0] url = link.attrib.get('href') title = ' '.join(link.xpath('.//text()')) content = escape(' '.join(result.xpath('.//p//text()'))) - results.append({'url': url, 'title': title, 'content': content}) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # return results if something is found if results: return results + # parse results again if nothing is found yet for result in dom.xpath('//li[@class="b_algo"]'): link = result.xpath('.//h2/a')[0] url = 
link.attrib.get('href') title = ' '.join(link.xpath('.//text()')) content = escape(' '.join(result.xpath('.//p//text()'))) - results.append({'url': url, 'title': title, 'content': content}) + + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # return results return results diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py @@ -0,0 +1,81 @@ +## Bing (Images) +# +# @website https://www.bing.com/images +# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month +# +# @using-api no (because of query limit) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, img_src +# +# @todo currently there are up to 35 images receive per page, because bing does not parse count=10. limited response to 10 images + +from urllib import urlencode +from cgi import escape +from lxml import html +from yaml import load +import re + +# engine dependent config +categories = ['images'] +paging = True + +# search-url +base_url = 'https://www.bing.com/' +search_string = 'images/search?{query}&count=10&first={offset}' + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + 1 + + # required for cookie + language = 'en-US' + + search_path = search_string.format( + query=urlencode({'q': query}), + offset=offset) + + params['cookies']['SRCHHPGUSR'] = \ + 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] + + params['url'] = base_url + search_path + + print(params['url']) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.content) + + # init regex for yaml-parsing + p = re.compile( '({|,)([a-z]+):(")') + + # parse results + for result in dom.xpath('//div[@class="dg_u"]'): + link = result.xpath('./a')[0] + + # parse yaml-data (it is required to add a space, to make it parsable) + yaml_data = load(p.sub( r'\1\2: \3', link.attrib.get('m'))) + + title = 
link.attrib.get('t1') + #url = 'http://' + link.attrib.get('t3') + url = yaml_data.get('surl') + img_src = yaml_data.get('imgurl') + + # append result + results.append({'template': 'images.html', + 'url': url, + 'title': title, + 'content': '', + 'img_src': img_src}) + + # TODO stop parsing if 10 images are found + if len(results) >= 10: + break + + # return results + return results diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py @@ -1,50 +1,86 @@ +## Bing (News) +# +# @website https://www.bing.com/news +# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month +# +# @using-api no (because of query limit) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content, publishedDate + from urllib import urlencode from cgi import escape from lxml import html +from datetime import datetime, timedelta +from dateutil import parser +import re +# engine dependent config categories = ['news'] - -base_url = 'http://www.bing.com/' -search_string = 'news/search?{query}&first={offset}' paging = True language_support = True +# search-url +base_url = 'https://www.bing.com/' +search_string = 'news/search?{query}&first={offset}' +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 + if params['language'] == 'all': language = 'en-US' else: language = params['language'].replace('_', '-') + search_path = search_string.format( query=urlencode({'q': query, 'setmkt': language}), offset=offset) params['cookies']['SRCHHPGUSR'] = \ 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] - #if params['category'] == 'images': - # params['url'] = base_url + 'images/' + search_path + params['url'] = base_url + search_path return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.content) - for result in dom.xpath('//div[@class="sa_cc"]'): - link = result.xpath('.//h3/a')[0] + + # parse results + for result in 
dom.xpath('//div[@class="sn_r"]'): + link = result.xpath('.//div[@class="newstitle"]/a')[0] url = link.attrib.get('href') title = ' '.join(link.xpath('.//text()')) - content = escape(' '.join(result.xpath('.//p//text()'))) - results.append({'url': url, 'title': title, 'content': content}) + content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()'))) + + # parse publishedDate + publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()'))) - if results: - return results + if re.match("^[0-9]+ minute(s|) ago$", publishedDate): + timeNumbers = re.findall(r'\d+', publishedDate) + publishedDate = datetime.now()\ + - timedelta(minutes=int(timeNumbers[0])) + elif re.match("^[0-9]+ hour(s|) ago$", publishedDate): + timeNumbers = re.findall(r'\d+', publishedDate) + publishedDate = datetime.now()\ + - timedelta(hours=int(timeNumbers[0])) + elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): + timeNumbers = re.findall(r'\d+', publishedDate) + publishedDate = datetime.now()\ + - timedelta(hours=int(timeNumbers[0]))\ + - timedelta(minutes=int(timeNumbers[1])) + else: + publishedDate = parser.parse(publishedDate) - for result in dom.xpath('//li[@class="b_algo"]'): - link = result.xpath('.//h2/a')[0] - url = link.attrib.get('href') - title = ' '.join(link.xpath('.//text()')) - content = escape(' '.join(result.xpath('.//p//text()'))) - results.append({'url': url, 'title': title, 'content': content}) + # append result + results.append({'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content}) + + # return results return results diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py @@ -1,45 +1,61 @@ +## Dailymotion (Videos) +# +# @website https://www.dailymotion.com +# @provide-api yes (http://www.dailymotion.com/developer) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, 
thumbnail +# +# @todo set content-parameter with correct data + from urllib import urlencode from json import loads from lxml import html +# engine dependent config categories = ['videos'] locale = 'en_US' +paging = True +# search-url # see http://www.dailymotion.com/doc/api/obj-video.html -search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page={pageno}&{query}' # noqa - -# TODO use video result template -content_tpl = '<a href="{0}" title="{0}" ><img src="{1}" /></a><br />' - -paging = True +search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}' # noqa +# do search-request def request(query, params): params['url'] = search_url.format( query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno']) + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) + + # return empty array if there are no results if not 'list' in search_res: - return results + return [] + + # parse results for res in search_res['list']: title = res['title'] url = res['url'] - if res['thumbnail_360_url']: - content = content_tpl.format(url, res['thumbnail_360_url']) - else: - content = '' - if res['description']: - description = text_content_from_html(res['description']) - content += description[:500] - results.append({'url': url, 'title': title, 'content': content}) - return results + #content = res['description'] + content = '' + thumbnail = res['thumbnail_360_url'] + results.append({'template': 'videos.html', + 'url': url, + 'title': title, + 'content': content, + 'thumbnail': thumbnail}) -def text_content_from_html(html_string): - desc_html = html.fragment_fromstring(html_string, create_parent=True) - return desc_html.text_content() + # return results + return results diff --git a/searx/engines/google.py b/searx/engines/google.py @@ -1,37 
+1,57 @@ -#!/usr/bin/env python +## Google (Web) +# +# @website https://www.google.com +# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated! +# +# @using-api yes +# @results JSON +# @stable yes (but deprecated) +# @parse url, title, content from urllib import urlencode from json import loads +# engine dependent config categories = ['general'] +paging = True +language_support = True +# search-url url = 'https://ajax.googleapis.com/' search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa -paging = True -language_support = True - +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 8 + language = 'en-US' if params['language'] != 'all': language = params['language'].replace('_', '-') + params['url'] = search_url.format(offset=offset, query=urlencode({'q': query}), language=language) + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) + # return empty array if there are no results if not search_res.get('responseData', {}).get('results'): return [] + # parse results for result in search_res['responseData']['results']: + # append result results.append({'url': result['unescapedUrl'], 'title': result['titleNoFormatting'], 'content': result['content']}) + + # return results return results diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py @@ -1,37 +1,58 @@ -#!/usr/bin/env python +## Google (Images) +# +# @website https://www.google.com +# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated! 
+# +# @using-api yes +# @results JSON +# @stable yes (but deprecated) +# @parse url, title, img_src from urllib import urlencode from json import loads +# engine dependent config categories = ['images'] +paging = True +# search-url url = 'https://ajax.googleapis.com/' search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}' # noqa -paging = True +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 8 + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) - if not search_res.get('responseData'): - return [] - if not search_res['responseData'].get('results'): + + # return empty array if there are no results + if not search_res.get('responseData', {}).get('results'): return [] + + # parse results for result in search_res['responseData']['results']: href = result['originalContextUrl'] title = result['title'] if not result['url']: continue + + # append result results.append({'url': href, 'title': title, 'content': '', 'img_src': result['url'], 'template': 'images.html'}) + + # return results return results diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py @@ -1,43 +1,62 @@ -#!/usr/bin/env python +## Google (News) +# +# @website https://www.google.com +# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated! 
+# +# @using-api yes +# @results JSON +# @stable yes (but deprecated) +# @parse url, title, content, publishedDate from urllib import urlencode from json import loads from dateutil import parser +# search-url categories = ['news'] +paging = True +language_support = True +# engine dependent config url = 'https://ajax.googleapis.com/' search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa -paging = True -language_support = True - +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 8 + language = 'en-US' if params['language'] != 'all': language = params['language'].replace('_', '-') + params['url'] = search_url.format(offset=offset, query=urlencode({'q': query}), language=language) + return params +# get response from search-request def response(resp): results = [] + search_res = loads(resp.text) + # return empty array if there are no results if not search_res.get('responseData', {}).get('results'): return [] + # parse results for result in search_res['responseData']['results']: - -# Mon, 10 Mar 2014 16:26:15 -0700 + # parse publishedDate publishedDate = parser.parse(result['publishedDate']) + # append result results.append({'url': result['unescapedUrl'], 'title': result['titleNoFormatting'], 'publishedDate': publishedDate, 'content': result['content']}) + + # return results return results diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py @@ -1,43 +1,58 @@ +## Vimeo (Videos) +# +# @website https://vimeo.com/ +# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour +# +# @using-api no (TODO, rewrite to api) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, publishedDate, thumbnail +# +# @todo rewrite to api +# @todo set content-parameter with correct data + from urllib import urlencode from HTMLParser import HTMLParser from lxml import html from searx.engines.xpath import 
extract_text from dateutil import parser -base_url = 'http://vimeo.com' -search_url = base_url + '/search?{query}' -url_xpath = None -content_xpath = None -title_xpath = None -results_xpath = '' -content_tpl = '<a href="{0}"> <img src="{2}"/> </a>' -publishedDate_xpath = './/p[@class="meta"]//attribute::datetime' +# engine dependent config +categories = ['videos'] +paging = True -# the cookie set by vimeo contains all the following values, -# but only __utma seems to be requiered -cookie = { - #'vuid':'918282893.1027205400' - # 'ab_bs':'%7B%223%22%3A279%7D' - '__utma': '00000000.000#0000000.0000000000.0000000000.0000000000.0' - # '__utmb':'18302654.1.10.1388942090' - #, '__utmc':'18302654' - #, '__utmz':'18#302654.1388942090.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)' # noqa - #, '__utml':'search' -} +# search-url +base_url = 'https://vimeo.com' +search_url = base_url + '/search/page:{pageno}?{query}' + +# specific xpath variables +url_xpath = './a/@href' +content_xpath = './a/img/@src' +title_xpath = './a/div[@class="data"]/p[@class="title"]/text()' +results_xpath = '//div[@id="browse_content"]/ol/li' +publishedDate_xpath = './/p[@class="meta"]//attribute::datetime' +# do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query})) - params['cookies'] = cookie + params['url'] = search_url.format(pageno=params['pageno'] , + query=urlencode({'q': query})) + + # TODO required? 
+ params['cookies']['__utma'] = '00000000.000#0000000.0000000000.0000000000.0000000000.0' + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) p = HTMLParser() + # parse results for result in dom.xpath(results_xpath): url = base_url + result.xpath(url_xpath)[0] title = p.unescape(extract_text(result.xpath(title_xpath))) @@ -45,10 +60,13 @@ def response(resp): publishedDate = parser.parse(extract_text( result.xpath(publishedDate_xpath)[0])) + # append result results.append({'url': url, 'title': title, - 'content': content_tpl.format(url, title, thumbnail), + 'content': '', 'template': 'videos.html', 'publishedDate': publishedDate, 'thumbnail': thumbnail}) + + # return results return results diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py @@ -1,64 +1,99 @@ -#!/usr/bin/env python +## Yahoo (Web) +# +# @website https://search.yahoo.com/web +# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries +# +# @using-api no (because pricing) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content, suggestion from urllib import urlencode from urlparse import unquote from lxml import html from searx.engines.xpath import extract_text, extract_url +# engine dependent config categories = ['general'] -search_url = 'http://search.yahoo.com/search?{query}&b={offset}' +paging = True +language_support = True + +# search-url +search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' + +# specific xpath variables results_xpath = '//div[@class="res"]' url_xpath = './/h3/a/@href' title_xpath = './/h3/a' content_xpath = './/div[@class="abstr"]' suggestion_xpath = '//div[@id="satat"]//a' -paging = True - +# remove yahoo-specific tracking-url def parse_url(url_string): endings = ['/RS', '/RK'] endpositions = [] start = url_string.find('http', url_string.find('/RU=')+1) + for ending in endings: endpos = 
url_string.rfind(ending) if endpos > -1: endpositions.append(endpos) end = min(endpositions) + return unquote(url_string[start:end]) +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 + if params['language'] == 'all': language = 'en' else: language = params['language'].split('_')[0] + params['url'] = search_url.format(offset=offset, - query=urlencode({'p': query})) + query=urlencode({'p': query}), + lang=language) + + # TODO required? params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ .format(lang=language) + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) + # parse results for result in dom.xpath(results_xpath): try: url = parse_url(extract_url(result.xpath(url_xpath), search_url)) title = extract_text(result.xpath(title_xpath)[0]) except: continue + content = extract_text(result.xpath(content_xpath)[0]) - results.append({'url': url, 'title': title, 'content': content}) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # if no suggestion found, return results if not suggestion_xpath: return results + # parse suggestion for suggestion in dom.xpath(suggestion_xpath): + # append suggestion results.append({'suggestion': extract_text(suggestion)}) + # return results return results diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py @@ -1,4 +1,12 @@ -#!/usr/bin/env python +## Yahoo (News) +# +# @website https://news.yahoo.com +# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries +# +# @using-api no (because pricing) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content, publishedDate from urllib import urlencode from lxml import html @@ -8,8 +16,15 @@ from datetime import datetime, timedelta import re from dateutil import parser +# engine dependent config categories = ['news'] -search_url = 
'http://news.search.yahoo.com/search?{query}&b={offset}' +paging = True +language_support = True + +# search-url +search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' + +# specific xpath variables results_xpath = '//div[@class="res"]' url_xpath = './/h3/a/@href' title_xpath = './/h3/a' @@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]' publishedDate_xpath = './/span[@class="timestamp"]' suggestion_xpath = '//div[@id="satat"]//a' -paging = True - +# do search-request def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 + if params['language'] == 'all': language = 'en' else: language = params['language'].split('_')[0] + params['url'] = search_url.format(offset=offset, - query=urlencode({'p': query})) + query=urlencode({'p': query}), + lang=language) + + # TODO required? params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ .format(lang=language) return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) + # parse results for result in dom.xpath(results_xpath): url = parse_url(extract_url(result.xpath(url_xpath), search_url)) title = extract_text(result.xpath(title_xpath)[0]) content = extract_text(result.xpath(content_xpath)[0]) + + # parse publishedDate publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) if re.match("^[0-9]+ minute(s|) ago$", publishedDate): @@ -58,15 +82,11 @@ def response(resp): if publishedDate.year == 1900: publishedDate = publishedDate.replace(year=datetime.now().year) + # append result results.append({'url': url, 'title': title, 'content': content, 'publishedDate': publishedDate}) - if not suggestion_xpath: - return results - - for suggestion in dom.xpath(suggestion_xpath): - results.append({'suggestion': extract_text(suggestion)}) - + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -20,6 +20,11 @@ engines: locale : en-US shortcut : bi + - name : bing images 
+ engine : bing_images + locale : en-US + shortcut : bii + - name : bing news engine : bing_news locale : en-US @@ -148,11 +153,7 @@ engines: - name : vimeo engine : vimeo - categories : videos - results_xpath : //div[@id="browse_content"]/ol/li - url_xpath : ./a/@href - title_xpath : ./a/div[@class="data"]/p[@class="title"]/text() - content_xpath : ./a/img/@src + locale : en-US shortcut : vm locales: