logo

searx

My custom branche(s) on searx, a meta-search engine git clone https://hacktivis.me/git/searx.git
commit: a46bbb40422564b5576b81c978fb734dbf45a9ce
parent 80f98d60413c742d603da8eae3596999942ae77a
Author: Thomas Pointhuber <thomas.pointhuber@gmx.at>
Date:   Tue,  2 Sep 2014 18:49:42 +0200

fix stackoverflow and add comments

Diffstat:

Msearx/engines/stackoverflow.py42+++++++++++++++++++++++++++++++++++-------
Msearx/settings.yml1-
2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py @@ -1,30 +1,58 @@ +## Stackoverflow (It) +# +# @website https://stackoverflow.com/ +# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search) +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, content + from urlparse import urljoin from cgi import escape from urllib import urlencode from lxml import html +# engine dependent config categories = ['it'] +paging = True +# search-url url = 'http://stackoverflow.com/' search_url = url+'search?{query}&page={pageno}' -result_xpath = './/div[@class="excerpt"]//text()' -paging = True +# specific xpath variables +results_xpath = '//div[contains(@class,"question-summary")]' +link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a' +title_xpath = './/text()' +content_xpath = './/div[@class="excerpt"]//text()' +# do search-request def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']) + return params +# get response from search-request def response(resp): results = [] + dom = html.fromstring(resp.text) - for result in dom.xpath('//div[@class="question-summary search-result"]'): - link = result.xpath('.//div[@class="result-link"]//a')[0] + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] href = urljoin(url, link.attrib.get('href')) - title = escape(' '.join(link.xpath('.//text()'))) - content = escape(' '.join(result.xpath(result_xpath))) - results.append({'url': href, 'title': title, 'content': content}) + title = escape(' '.join(link.xpath(title_xpath))) + content = escape(' '.join(result.xpath(content_xpath))) + + # append result + results.append({'url': href, + 'title': title, + 'content': content}) + + # return results return results diff --git a/searx/settings.yml b/searx/settings.yml @@ -90,7 +90,6 @@ engines: - name : stackoverflow engine : stackoverflow - categories : it shortcut : st - name : startpage