
searx

My custom branch(es) on searx, a meta-search engine
git clone https://hacktivis.me/git/searx.git

xpath.py (3761B)


from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text
from searx.url_utils import unquote, urlencode, urljoin, urlparse

search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
paging = False
suggestion_xpath = ''
results_xpath = ''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
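As an aside, a minimal sketch of what these paging parameters mean, assuming a hypothetical engine whose site expects a result offset rather than a page number; the values and the helper are made up, but the arithmetic is the same one request() applies below:

# hypothetical offset-based engine: page 1 -> offset 0, page 2 -> offset 20, ...
page_size = 20
first_page_num = 0

for pageno in (1, 2, 3):
    # value substituted for {pageno} in the search url
    print((pageno - 1) * page_size + first_page_num)  # -> 0, 20, 40
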
'''
if xpath_results is a list, extract the text from each result and concatenate the list
if xpath_results is an xml element, extract all the text nodes from it
   ( text_content() method from lxml )
if xpath_results is a string, then it's already done
'''


def extract_text(xpath_results):
    if type(xpath_results) == list:
        # it's a list of results: concatenate everything using a recursive call
        result = ''
        for e in xpath_results:
            result = result + extract_text(e)
        return result.strip()
    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
        # it's a string
        return ''.join(xpath_results)
    else:
        # it's an element
        text = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())
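A minimal usage sketch of extract_text, assuming this file is importable as searx.engines.xpath (the HTML fragment is made up):

from lxml import html
from searx.engines.xpath import extract_text  # assumed module path

dom = html.fromstring('<div><p>Hello <b>world</b></p> <p>again</p></div>')
# element node: text nodes joined, whitespace collapsed
print(extract_text(dom))                      # -> 'Hello world again'
# list of xpath string results: handled by the recursive list branch
print(extract_text(dom.xpath('//b/text()')))  # -> 'world'
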
def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
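A small sketch of extract_url on scheme-relative and relative hrefs, again assuming the searx.engines.xpath module path; the markup and base URL are made up:

from lxml import html
from searx.engines.xpath import extract_url  # assumed module path

dom = html.fromstring('<div><a href="//example.com/page">x</a><a href="/about">y</a></div>')
base = 'https://searx.example/search'
# scheme-relative url: the search url's scheme is prepended
print(extract_url(dom.xpath('//a[1]/@href'), base))  # -> 'https://example.com/page'
# relative url: resolved against the search url
print(extract_url(dom.xpath('//a[2]/@href'), base))  # -> 'https://searx.example/about'
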
def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark + 3:]).decode('utf-8')

    return url
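And normalize_url on its own, under the same assumptions (URLs are made up):

from searx.engines.xpath import normalize_url  # assumed module path

print(normalize_url('https://example.com'))       # -> 'https://example.com/' (slash appended when path is empty)
print(normalize_url('https://example.com/page'))  # -> 'https://example.com/page' (already has a path)
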
def request(query, params):
    query = urlencode({'q': query})[2:]

    fp = {'query': query}
    if paging and search_url.find('{pageno}') >= 0:
        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['url'] = search_url.format(**fp)
    params['query'] = query

    return params
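A hedged sketch of what request() produces once the engine settings are filled in; the search URL, query and module path are assumptions, and the settings are assigned as module attributes (roughly how searx applies engine settings):

import searx.engines.xpath as xpath_engine  # assumed module path

xpath_engine.search_url = 'https://searx.example/search?q={query}&p={pageno}'
xpath_engine.paging = True
xpath_engine.page_size = 10
xpath_engine.first_page_num = 1

params = xpath_engine.request('foo bar', {'pageno': 2})
print(params['url'])    # -> 'https://searx.example/search?q=foo+bar&p=11'
print(params['query'])  # -> 'foo+bar'
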
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    if results_xpath:
        for result in dom.xpath(results_xpath):
            url = extract_url(result.xpath(url_xpath), search_url)
            title = extract_text(result.xpath(title_xpath))
            content = extract_text(result.xpath(content_xpath))
            results.append({'url': url, 'title': title, 'content': content})
    else:
        for url, title, content in zip(
            (extract_url(x, search_url) for
             x in dom.xpath(url_xpath)),
            map(extract_text, dom.xpath(title_xpath)),
            map(extract_text, dom.xpath(content_xpath))
        ):
            results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results
    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
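Finally, an end-to-end sketch of response() with a fake response object; FakeResponse, the XPath settings and the page content are all hypothetical, not part of searx:

import searx.engines.xpath as xpath_engine  # assumed module path

xpath_engine.search_url = 'https://searx.example/search?q={query}'
xpath_engine.results_xpath = '//div[@class="result"]'
xpath_engine.url_xpath = './/a/@href'
xpath_engine.title_xpath = './/a//text()'
xpath_engine.content_xpath = './/p//text()'


class FakeResponse(object):
    text = ('<html><body><div class="result">'
            '<a href="/doc">Doc</a><p>A page about docs</p>'
            '</div></body></html>')


print(xpath_engine.response(FakeResponse()))
# -> [{'url': 'https://searx.example/doc', 'title': 'Doc', 'content': 'A page about docs'}]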