logo

searx

My custom branch(es) of searx, a meta-search engine. Clone with: git clone https://hacktivis.me/git/searx.git

digg.py (2324B)


  1. """
  2. Digg (News, Social media)
  3. @website https://digg.com/
  4. @provide-api no
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content, publishedDate, thumbnail
  9. """
  10. import random
  11. import string
  12. from dateutil import parser
  13. from json import loads
  14. from lxml import html
  15. from searx.url_utils import quote_plus
# engine dependent config
categories = ['news', 'social media']
paging = True  # Digg's search API supports result pagination

# search-url; `position` is the zero-based result offset, `query` is URL-encoded
base_url = 'https://digg.com/'
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'

# specific xpath variables, applied to the HTML fragment embedded in the JSON reply
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'

# alphabet used to fabricate the random `frontend.auid` cookie value
digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
    string.digits + "+_"
  30. # do search-request
  31. def request(query, params):
  32. offset = (params['pageno'] - 1) * 10
  33. params['url'] = search_url.format(position=offset,
  34. query=quote_plus(query))
  35. params['cookies']['frontend.auid'] = ''.join(random.choice(
  36. digg_cookie_chars) for _ in range(22))
  37. return params
  38. # get response from search-request
  39. def response(resp):
  40. results = []
  41. search_result = loads(resp.text)
  42. if 'html' not in search_result or search_result['html'] == '':
  43. return results
  44. dom = html.fromstring(search_result['html'])
  45. # parse results
  46. for result in dom.xpath(results_xpath):
  47. url = result.attrib.get('data-contenturl')
  48. thumbnail = result.xpath('.//img')[0].attrib.get('src')
  49. title = ''.join(result.xpath(title_xpath))
  50. content = ''.join(result.xpath(content_xpath))
  51. pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
  52. publishedDate = parser.parse(pubdate)
  53. # http to https
  54. thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
  55. # append result
  56. results.append({'url': url,
  57. 'title': title,
  58. 'content': content,
  59. 'template': 'videos.html',
  60. 'publishedDate': publishedDate,
  61. 'thumbnail': thumbnail})
  62. # return results
  63. return results