commit: 4508c966677708a2926afb1d05f134f252d8f93a
parent a959977ab445bca91059d98bf8ca822fffc51fdf
Author: Thomas Pointhuber <thomas.pointhuber@gmx.at>
Date: Sat, 24 Oct 2015 16:15:30 +0200
[enh] fix content fetching, parse published date from description
Diffstat:
2 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # strip the leading date prefix ("2 Sep 2014 ... ") from the content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime: current time minus the number of days given in the prefix
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # strip the leading relative-date prefix ("5 days ago ... ") from the content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results
diff --git a/searx/tests/engines/test_startpage.py b/searx/tests/engines/test_startpage.py
@@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>
@@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>
@@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>