logo

searx

My custom branch(es) of searx, a meta-search engine. git clone https://hacktivis.me/git/searx.git

yandex.py (1689B)


  1. """
  2. Yandex (Web)
  3. @website https://yandex.ru/
  4. @provide-api ?
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. """
  10. from lxml import html
  11. from searx import logger
  12. from searx.url_utils import urlencode
  13. logger = logger.getChild('yandex engine')
  14. # engine dependent config
  15. categories = ['general']
  16. paging = True
  17. language_support = True # TODO
  18. default_tld = 'com'
  19. language_map = {'ru': 'ru',
  20. 'ua': 'ua',
  21. 'be': 'by',
  22. 'kk': 'kz',
  23. 'tr': 'com.tr'}
  24. # search-url
  25. base_url = 'https://yandex.{tld}/'
  26. search_url = 'search/?{query}&p={page}'
  27. results_xpath = '//li[@class="serp-item"]'
  28. url_xpath = './/h2/a/@href'
  29. title_xpath = './/h2/a//text()'
  30. content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()'
  31. def request(query, params):
  32. lang = params['language'].split('-')[0]
  33. host = base_url.format(tld=language_map.get(lang) or default_tld)
  34. params['url'] = host + search_url.format(page=params['pageno'] - 1,
  35. query=urlencode({'text': query}))
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. dom = html.fromstring(resp.text)
  40. results = []
  41. for result in dom.xpath(results_xpath):
  42. try:
  43. res = {'url': result.xpath(url_xpath)[0],
  44. 'title': ''.join(result.xpath(title_xpath)),
  45. 'content': ''.join(result.xpath(content_xpath))}
  46. except:
  47. logger.exception('yandex parse crash')
  48. continue
  49. results.append(res)
  50. return results