logo

searx

My custom branch(es) of searx, a meta-search engine
commit: badd9885459edf76c1f99f6e65feeb24185e88df
parent: 89b68242d3b4258b4144f9723b007ffa538d4475
Author: asciimoo <asciimoo@gmail.com>
Date:   Sat, 26 Oct 2013 02:22:20 +0200

[enh] xpath engine added

Diffstat:

A searx/engines/xpath.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+), 0 deletions(-)

# searx/engines/xpath.py -- new file added by this commit (54 lines in the
# original diff).
"""Generic search engine driven by XPath expressions.

The engine is configured through the module-level variables below
(``search_url``, ``results_xpath``, ``url_xpath``, ``content_xpath`` and
``title_xpath``).  ``request`` builds the outgoing URL from ``search_url``
and ``response`` extracts url/title/content triples from the returned HTML.
"""

from lxml import html
from urllib import urlencode
from urlparse import urlparse, urljoin
from cgi import escape
from lxml.etree import _ElementStringResult

# Engine configuration.  Every value defaults to None in this module and is
# only read by the functions below; presumably the code that loads the engine
# assigns them before use -- confirm against the loader.
search_url = None      # URL template, formatted with a ``query`` keyword
results_xpath = None   # evaluated on the document, one node per result
url_xpath = None       # evaluated on each result node
content_xpath = None   # evaluated on each result node
title_xpath = None     # evaluated on each result node


def extract_url(xpath_results):
    """Turn the result of the url XPath query into an absolute URL.

    ``xpath_results`` must be a non-empty list.  If its first item is an
    lxml string result, all items are concatenated to form the URL;
    otherwise the first item is treated as an element and its ``href``
    attribute is used.

    The raw value is then normalised:

    * ``//host/path`` gets the scheme of ``search_url`` (``http`` if that
      has none),
    * ``/path`` is resolved against ``search_url``,
    * anything that still lacks an ``http://`` or ``https://`` prefix gets
      ``http://`` prepended.

    Raises ``Exception`` when the input is not a list, is empty, carries no
    ``href``, or when the final URL has no network location.
    """
    if not isinstance(xpath_results, list):
        raise Exception('Cannot handle xpath url resultset')
    if not xpath_results:
        raise Exception('Empty url resultset')

    first = xpath_results[0]
    # NOTE(review): lxml may also return unicode "smart string" results that
    # are not _ElementStringResult instances; those would fall through to the
    # element branch below -- confirm whether that case needs handling.
    if isinstance(first, _ElementStringResult):
        url = ''.join(xpath_results)
    else:
        url = first.attrib.get('href')
        if url is None:
            # Previously this surfaced as an AttributeError on None.
            raise Exception('Cannot handle xpath url resultset')

    if url.startswith('//'):
        # Scheme-relative URL: the ':' separator is required, otherwise the
        # result would look like "https//host/path".
        url = (urlparse(search_url).scheme or 'http') + ':' + url
    elif url.startswith('/'):
        # Host-relative URL: resolve it against the engine's search URL.
        url = urljoin(search_url, url)
    # TODO (left by the original author, unspecified)

    # Only add a default scheme when the URL has neither of the two.  The
    # original test joined the two checks with ``or``, which is always true
    # and therefore prefixed every URL, including already absolute ones.
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    if not urlparse(url).netloc:
        raise Exception('Cannot parse url')
    return url


def request(query, params):
    """Fill ``params`` with the request URL for ``query`` and return it.

    The query is URL-encoded by encoding it as the value of a ``q``
    parameter and stripping the leading ``q=``.  The encoded form is
    substituted into ``search_url`` and also stored under
    ``params['query']``.
    """
    query = urlencode({'q': query})[2:]
    params['url'] = search_url.format(query=query)
    params['query'] = query
    return params


def response(resp):
    """Parse the HTML in ``resp.text`` into a list of result dicts.

    Each dict has the keys ``url``, ``title`` and ``content``.  The content
    is HTML-escaped and every occurrence of the query string taken from
    ``resp.search_params['query']`` is wrapped in ``<b>`` tags.

    Exceptions raised by ``extract_url`` propagate to the caller.
    """
    results = []
    dom = html.fromstring(resp.text)
    # This is the URL-encoded form when it was stored by request() above;
    # presumably search_params is that same params dict -- confirm.
    query = resp.search_params['query']
    for result in dom.xpath(results_xpath):
        url = extract_url(result.xpath(url_xpath))
        title = ' '.join(result.xpath(title_xpath))
        content = escape(' '.join(result.xpath(content_xpath)))
        if query:
            # Skip highlighting for an empty query: replacing '' would
            # insert "<b></b>" between every character of the content.
            content = content.replace(query, '<b>{0}</b>'.format(query))
        results.append({'url': url, 'title': title, 'content': content})

    return results