commit: 3854703d952d22257841f4f6d85921a340c72da5
parent: ce08abe2232473cf9c4e90e60b6510831a289ada
Author: Adam Tauber <asciimoo@gmail.com>
Date: Fri, 21 Mar 2014 16:33:17 +0100
[mod] ddg engine mods
Diffstat:
1 file changed, 45 insertions(+), 13 deletions(-)
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
@@ -1,29 +1,61 @@
-from json import loads
from urllib import urlencode
+from lxml.html import fromstring
from searx.utils import html_to_text
-url = 'https://duckduckgo.com/'
-search_url = url + 'd.js?{query}&p=1&s={offset}'
+url = 'https://duckduckgo.com/html?{query}&s={offset}'
locale = 'us-en'
-paging = True
-
-
def request(query, params):
offset = (params['pageno'] - 1) * 30
q = urlencode({'q': query,
'l': locale})
- params['url'] = search_url.format(query=q, offset=offset)
+ params['url'] = url.format(query=q, offset=offset)
return params
def response(resp):
+ result_xpath = '//div[@class="results_links results_links_deep web-result"]'
+ url_xpath = './/a[@class="large"]/@href'
+ title_xpath = './/a[@class="large"]//text()'
+ content_xpath = './/div[@class="snippet"]//text()'
results = []
- search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
- for r in search_res:
- if not r.get('t'):
+
+ doc = fromstring(resp.text)
+
+ for r in doc.xpath(result_xpath):
+ res_url = r.xpath(url_xpath)[-1]
+ if not res_url:
continue
- results.append({'title': r['t'],
- 'content': html_to_text(r['a']),
- 'url': r['u']})
+ title = html_to_text(''.join(r.xpath(title_xpath)))
+ content = html_to_text(''.join(r.xpath(content_xpath)))
+ results.append({'title': title,
+ 'content': content,
+ 'url': res_url})
+
return results
+
+
+#from json import loads
+#search_url = url + 'd.js?{query}&p=1&s={offset}'
+#
+#paging = True
+#
+#
+#def request(query, params):
+# offset = (params['pageno'] - 1) * 30
+# q = urlencode({'q': query,
+# 'l': locale})
+# params['url'] = search_url.format(query=q, offset=offset)
+# return params
+#
+#
+#def response(resp):
+# results = []
+# search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
+# for r in search_res:
+# if not r.get('t'):
+# continue
+# results.append({'title': r['t'],
+# 'content': html_to_text(r['a']),
+# 'url': r['u']})
+# return results