logo

searx

My custom branch(es) of searx, a meta-search engine.
git clone https://hacktivis.me/git/searx.git
commit: 3e3672e0790266fc7f2482fdd854d7789a915d4d
parent 6d28e9d6945b5510b3d861e20521554435a10f63
Author: jibe-b <user701@orange.fr>
Date:   Sat, 23 Sep 2017 14:16:06 +0200

[add] arxiv engine

Diffstat:

A searx/engines/arxiv.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M searx/settings.yml | 6 ++++++
A tests/unit/engines/test_arxiv.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 137 insertions(+), 0 deletions(-)

diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +""" + ArXiV (Scientific preprints) + @website https://axiv.org + @provide-api yes (export.arxiv.org/api/query) + @using-api yes + @results XML-RSS + @stable yes + @parse url, title, publishedDate, content + More info on api: https://arxiv.org/help/api/user-manual +""" + +from lxml import html +from datetime import datetime +from searx.url_utils import urlencode + + +categories = ['science'] + +base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ + + '{query}&start={offset}&max_results={number_of_results}' + +# engine dependent config +number_of_results = 10 + + +def request(query, params): + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=query, + offset=offset, + number_of_results=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + + search_results = html.fromstring(resp.text.encode('utf-8')).xpath('//entry') + + for entry in search_results: + title = entry.xpath('.//title')[0].text + + url = entry.xpath('.//id')[0].text + + content = entry.xpath('.//summary')[0].text + + # If a doi is available, add it to the snipppet + try: + doi = entry.xpath('.//link[@title="doi"]')[0].text + content = 'DOI: ' + doi + ' Abstract: ' + content + except: + pass + + if len(content) > 300: + content = content[0:300] + "..." 
+ # TODO: center snippet on query term + + publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ') + + res_dict = {'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content} + + results.append(res_dict) + + return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -60,6 +60,12 @@ engines: disabled : True shortcut : ai + - name : arxiv + engine : arxiv + shortcut : arx + categories : science + timeout : 4.0 + - name : base engine : base shortcut : bs diff --git a/tests/unit/engines/test_arxiv.py b/tests/unit/engines/test_arxiv.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import arxiv +from searx.testing import SearxTestCase + + +class TestBaseEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 1 + params = arxiv.request(query, dicto) + self.assertIn('url', params) + self.assertIn('export.arxiv.org/api/', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, arxiv.response, None) + self.assertRaises(AttributeError, arxiv.response, []) + self.assertRaises(AttributeError, arxiv.response, '') + self.assertRaises(AttributeError, arxiv.response, '[]') + + response = mock.Mock(text='''<?xml version="1.0" encoding="UTF-8"?> +<feed xmlns="http://www.w3.org/2005/Atom"></feed>''') + self.assertEqual(arxiv.response(response), []) + + xml_mock = '''<?xml version="1.0" encoding="UTF-8"?> +<feed xmlns="http://www.w3.org/2005/Atom"> + <title type="html">ArXiv Query: search_query=all:test_query&amp;id_list=&amp;start=0&amp;max_results=1</title> + <id>http://arxiv.org/api/1</id> + <updated>2000-01-21T00:00:00-01:00</updated> + <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults> + <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex> + 
<opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage> + <entry> + <id>http://arxiv.org/1</id> + <updated>2000-01-01T00:00:01Z</updated> + <published>2000-01-01T00:00:01Z</published> + <title>Mathematical proof.</title> + <summary>Mathematical formula.</summary> + <author> + <name>A. B.</name> + </author> + <link href="http://arxiv.org/1" rel="alternate" type="text/html"/> + <link title="pdf" href="http://arxiv.org/1" rel="related" type="application/pdf"/> + <category term="math.QA" scheme="http://arxiv.org/schemas/atom"/> + <category term="1" scheme="http://arxiv.org/schemas/atom"/> + </entry> +</feed> +''' + + response = mock.Mock(text=xml_mock.encode('utf-8')) + results = arxiv.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['title'], 'Mathematical proof.') + self.assertEqual(results[0]['content'], 'Mathematical formula.')