logo

searx

Unnamed repository; edit this file 'description' to name the repository.
commit: 3a2f29344a2846f530418632b88db4e345f32249
parent: a959977ab445bca91059d98bf8ca822fffc51fdf
Author: Adam Tauber <asciimoo@gmail.com>
Date:   Mon, 26 Oct 2015 09:29:20 +0100

Merge pull request #452 from pointhi/engine_fix

[enh] fix content fetching, parse published date from description for startpage and ixquick

Diffstat:

searx/engines/startpage.py            | 43 +++++++++++++++++++++++++++++++++++++------
searx/tests/engines/test_startpage.py |  6 +++---
2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
 
 from lxml import html
 from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
 
@@ -79,15 +81,44 @@ def response(resp):
 
         title = escape(extract_text(link))
 
-        if result.xpath('./p[@class="desc"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+        if result.xpath('./p[@class="desc clk"]'):
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
         else:
             content = ''
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        published_date = None
+
+        # check if search result starts with something like: "2 Sep 2014 ... "
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+            published_date = parser.parse(date_string, dayfirst=True)
+
+            # fix content string
+            content = content[date_pos:]
+
+        # check if search result starts with something like: "5 days ago ... "
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+
+            # calculate datetime
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+            # fix content string
+            content = content[date_pos:]
+
+        if published_date:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': published_date})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
diff --git a/searx/tests/engines/test_startpage.py b/searx/tests/engines/test_startpage.py
@@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
                 </a>
                 <span id='title_stars_2' name='title_stars_2'> </span>
             </h3>
-            <p class='desc'>
+            <p class='desc clk'>
                 This should be the content.
             </p>
             <p>
@@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
                 </a>
                 <span id='title_stars_2' name='title_stars_2'> </span>
             </h3>
-            <p class='desc'>
+            <p class='desc clk'>
                 This should be the content.
             </p>
             <p>
@@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
             <h3>
                 <span id='title_stars_2' name='title_stars_2'> </span>
             </h3>
-            <p class='desc'>
+            <p class='desc clk'>
                 This should be the content.
             </p>
             <p>