logo

searx

My custom branch(es) of searx, a meta-search engine
commit: 5d764f95cf44ab4c1ba83d7055297e3c4ea48c98
parent: c09d69bd2c2f0f8d37c03f94a2e6a97636fedba1
Author: asciimoo <asciimoo@gmail.com>
Date:   Sat, 26 Oct 2013 13:45:43 +0200

[enh] xpath engine absolute xpath support

Diffstat:

M searx/engines/xpath.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
@@ -5,10 +5,10 @@ from cgi import escape
 from lxml.etree import _ElementStringResult
 search_url = None
-results_xpath = None
 url_xpath = None
 content_xpath = None
 title_xpath = None
+results_xpath = ''
 def extract_url(xpath_results):
     url = ''
@@ -26,7 +26,7 @@ def extract_url(xpath_results):
         else:
             url = xpath_results[0].attrib.get('href')
     else:
-        raise Exception('Cannot handle xpath url resultset')
+        url = xpath_results.attrib.get('href')
     if not url.startswith('http://') or not url.startswith('https://'):
         url = 'http://'+url
     parsed_url = urlparse(url)
@@ -45,10 +45,15 @@ def response(resp):
     results = []
     dom = html.fromstring(resp.text)
     query = resp.search_params['query']
-    for result in dom.xpath(results_xpath):
-        url = extract_url(result.xpath(url_xpath))
-        title = ' '.join(result.xpath(title_xpath))
-        content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
-        results.append({'url': url, 'title': title, 'content': content})
+    if results_xpath:
+        for result in dom.xpath(results_xpath):
+            url = extract_url(result.xpath(url_xpath))
+            title = ' '.join(result.xpath(title_xpath))
+            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            results.append({'url': url, 'title': title, 'content': content})
+    else:
+        for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
+            results.append({'url': url, 'title': title, 'content': content})
+
     return results