[enh] xpath engine absolute xpath support - searx - My custom branch(es) on searx, a meta-search engine

commit: 5d764f95cf44ab4c1ba83d7055297e3c4ea48c98
parent: c09d69bd2c2f0f8d37c03f94a2e6a97636fedba1
Author: asciimoo <asciimoo@gmail.com>
Date:   Sat, 26 Oct 2013 13:45:43 +0200

[enh] xpath engine absolute xpath support

Diffstat:
M searx/engines/xpath.py 19 ++++++++++++-------

1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
@@ -5,10 +5,10 @@ from cgi import escape
 from lxml.etree import _ElementStringResult
 
 search_url    = None
-results_xpath = None
 url_xpath     = None
 content_xpath = None
 title_xpath   = None
+results_xpath = ''
 
 def extract_url(xpath_results):
     url = ''
@@ -26,7 +26,7 @@ def extract_url(xpath_results):
         else:
             url = xpath_results[0].attrib.get('href')
     else:
-        raise Exception('Cannot handle xpath url resultset')
+        url = xpath_results.attrib.get('href')
     if not url.startswith('http://') or not url.startswith('https://'):
         url = 'http://'+url
     parsed_url = urlparse(url)
@@ -45,10 +45,15 @@ def response(resp):
     results = []
     dom = html.fromstring(resp.text)
     query = resp.search_params['query']
-    for result in dom.xpath(results_xpath):
-        url = extract_url(result.xpath(url_xpath))
-        title = ' '.join(result.xpath(title_xpath))
-        content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
-        results.append({'url': url, 'title': title, 'content': content})
+    if results_xpath:
+        for result in dom.xpath(results_xpath):
+            url = extract_url(result.xpath(url_xpath))
+            title = ' '.join(result.xpath(title_xpath))
+            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            results.append({'url': url, 'title': title, 'content': content})
+    else:
+        for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
+            results.append({'url': url, 'title': title, 'content': content})
+
 
     return results