logo

searx

searx: a free, privacy-respecting, hackable metasearch engine.
commit: fff9460238f63f05113c8dfc970b69d84b99a991
parent: 26c818193d11598550c28f8a72bf5835b2a95bf5
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Wed, 30 Mar 2016 11:09:22 +0200

Merge pull request #522 from jibe-b/master

add BASE engine in the "Science" category

Diffstat:

searx/engines/base.py | 122+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
searx/settings.yml | 4++++
searx/webapp.py | 24++++++++++++++----------
3 files changed, 140 insertions(+), 10 deletions(-)

diff --git a/searx/engines/base.py b/searx/engines/base.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +""" + BASE (Scholar publications) + + @website https://base-search.net + @provide-api yes with authorization (https://api.base-search.net/) + + @using-api yes + @results XML + @stable ? + @parse url, title, publishedDate, content + More info on api: http://base-search.net/about/download/base_interface.pdf +""" + +from lxml import etree +from urllib import urlencode +from searx.utils import searx_useragent +from cgi import escape +from datetime import datetime +import re + + +categories = ['science'] + +base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\ + + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}' + +# engine dependent config +paging = True +number_of_results = 10 + +# shortcuts for advanced search +shorcut_dict = { + # user-friendly keywords + 'format:': 'dcformat:', + 'author:': 'dccreator:', + 'collection:': 'dccollection:', + 'hdate:': 'dchdate:', + 'contributor:': 'dccontributor:', + 'coverage:': 'dccoverage:', + 'date:': 'dcdate:', + 'abstract:': 'dcdescription:', + 'urls:': 'dcidentifier:', + 'language:': 'dclanguage:', + 'publisher:': 'dcpublisher:', + 'relation:': 'dcrelation:', + 'rights:': 'dcrights:', + 'source:': 'dcsource:', + 'subject:': 'dcsubject:', + 'title:': 'dctitle:', + 'type:': 'dcdctype:' +} + + +def request(query, params): + # replace shortcuts with API advanced search keywords + for key in shorcut_dict.keys(): + query = re.sub(str(key), str(shorcut_dict[key]), query) + + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'query': query}), + offset=offset, + hits=number_of_results) + + params['url'] = base_url.format(**string_args) + + params['headers']['User-Agent'] = searx_useragent() + return params + + +def response(resp): + results = [] + + search_results = etree.XML(resp.content) + + for entry in 
search_results.xpath('./result/doc'): + content = "No description available" + + date = datetime.now() # needed in case no dcdate is available for an item + for item in entry: + if item.attrib["name"] == "dchdate": + harvestDate = item.text + + elif item.attrib["name"] == "dcdate": + date = item.text + + elif item.attrib["name"] == "dctitle": + title = item.text + + elif item.attrib["name"] == "dclink": + url = item.text + + elif item.attrib["name"] == "dcdescription": + content = escape(item.text[:300]) + if len(item.text) > 300: + content += "..." + +# dates returned by the BASE API are not several formats + publishedDate = None + for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']: + try: + publishedDate = datetime.strptime(date, date_format) + break + except: + pass + + if publishedDate is not None: + res_dict = {'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content} + else: + res_dict = {'url': url, + 'title': title, + 'content': content} + + results.append(res_dict) + + return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -38,6 +38,10 @@ engines: engine : archlinux shortcut : al + - name : base + engine : base + shortcut : bs + - name : wikipedia engine : mediawiki shortcut : wp diff --git a/searx/webapp.py b/searx/webapp.py @@ -408,17 +408,21 @@ def index(): # TODO, check if timezone is calculated right if 'publishedDate' in result: - result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') - if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1): - timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None) - minutes = int((timedifference.seconds / 60) % 60) - hours = int(timedifference.seconds / 60 / 60) - if hours == 0: - result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes) - else: - result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, 
minutes=minutes) # noqa + try: # test if publishedDate >= 1900 (datetime module bug) + result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') + except ValueError: + result['publishedDate'] = None else: - result['publishedDate'] = format_date(result['publishedDate']) + if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1): + timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None) + minutes = int((timedifference.seconds / 60) % 60) + hours = int(timedifference.seconds / 60 / 60) + if hours == 0: + result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes) + else: + result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) # noqa + else: + result['publishedDate'] = format_date(result['publishedDate']) if search.request_data.get('format') == 'json': return Response(json.dumps({'query': search.query,