commit: 3a2f29344a2846f530418632b88db4e345f32249
parent: a959977ab445bca91059d98bf8ca822fffc51fdf
Author: Adam Tauber <asciimoo@gmail.com>
Date: Mon, 26 Oct 2015 09:29:20 +0100
Merge pull request #452 from pointhi/engine_fix
[enh] fix content fetching, parse published date from description for startpage and ixquick
Diffstat:
2 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # fix content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # fix content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results
diff --git a/searx/tests/engines/test_startpage.py b/searx/tests/engines/test_startpage.py
@@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>
@@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>
@@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
- <p class='desc'>
+ <p class='desc clk'>
This should be the content.
</p>
<p>