logo

searx

My custom branch(es) of searx, a meta-search engine. git clone https://hacktivis.me/git/searx.git

twitter.py (2445B)


  1. """
  2. Twitter (Social media)
  3. @website https://twitter.com/
  4. @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. @todo publishedDate
  10. """
  11. from lxml import html
  12. from datetime import datetime
  13. from searx.engines.xpath import extract_text
  14. from searx.url_utils import urlencode, urljoin
  15. # engine dependent config
  16. categories = ['social media']
  17. language_support = True
  18. # search-url
  19. base_url = 'https://twitter.com/'
  20. search_url = base_url + 'search?'
  21. # specific xpath variables
  22. results_xpath = '//li[@data-item-type="tweet"]'
  23. avatar_xpath = './/img[contains(@class, "avatar")]/@src'
  24. link_xpath = './/small[@class="time"]//a'
  25. title_xpath = './/span[contains(@class, "username")]'
  26. content_xpath = './/p[contains(@class, "tweet-text")]'
  27. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  28. # do search-request
  29. def request(query, params):
  30. params['url'] = search_url + urlencode({'q': query})
  31. params['cookies']['lang'] = params['language'].split('-')[0]
  32. return params
  33. # get response from search-request
  34. def response(resp):
  35. results = []
  36. dom = html.fromstring(resp.text)
  37. # parse results
  38. for tweet in dom.xpath(results_xpath):
  39. try:
  40. link = tweet.xpath(link_xpath)[0]
  41. content = extract_text(tweet.xpath(content_xpath)[0])
  42. img_src = tweet.xpath(avatar_xpath)[0]
  43. img_src = img_src.replace('_bigger', '_normal')
  44. except Exception:
  45. continue
  46. url = urljoin(base_url, link.attrib.get('href'))
  47. title = extract_text(tweet.xpath(title_xpath))
  48. pubdate = tweet.xpath(timestamp_xpath)
  49. if len(pubdate) > 0:
  50. timestamp = float(pubdate[0].attrib.get('data-time'))
  51. publishedDate = datetime.fromtimestamp(timestamp, None)
  52. # append result
  53. results.append({'url': url,
  54. 'title': title,
  55. 'content': content,
  56. 'img_src': img_src,
  57. 'publishedDate': publishedDate})
  58. else:
  59. # append result
  60. results.append({'url': url,
  61. 'title': title,
  62. 'content': content,
  63. 'img_src': img_src})
  64. # return results
  65. return results