logo

youtube-dl

[mirror] Download/Watch videos from video hosters — git clone https://hacktivis.me/git/mirror/youtube-dl.git

applepodcasts.py (3825B)


  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. clean_html,
  6. clean_podcast_url,
  7. get_element_by_class,
  8. int_or_none,
  9. parse_codecs,
  10. parse_iso8601,
  11. try_get,
  12. )
  13. class ApplePodcastsIE(InfoExtractor):
  14. _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
  15. _TESTS = [{
  16. 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  17. 'md5': '41dc31cd650143e530d9423b6b5a344f',
  18. 'info_dict': {
  19. 'id': '1000482637777',
  20. 'ext': 'mp3',
  21. 'title': '207 - Whitney Webb Returns',
  22. 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
  23. 'upload_date': '20200705',
  24. 'timestamp': 1593932400,
  25. 'duration': 6454,
  26. 'series': 'The Tim Dillon Show',
  27. 'thumbnail': 're:.+[.](png|jpe?g|webp)',
  28. }
  29. }, {
  30. 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  31. 'only_matching': True,
  32. }, {
  33. 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
  34. 'only_matching': True,
  35. }, {
  36. 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
  37. 'only_matching': True,
  38. }]
  39. def _real_extract(self, url):
  40. episode_id = self._match_id(url)
  41. webpage = self._download_webpage(url, episode_id)
  42. episode_data = {}
  43. ember_data = {}
  44. # new page type 2021-11
  45. amp_data = self._parse_json(self._search_regex(
  46. r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
  47. webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
  48. amp_data = try_get(amp_data,
  49. lambda a: self._parse_json(
  50. next(a[x] for x in iter(a) if episode_id in x),
  51. episode_id),
  52. dict) or {}
  53. amp_data = amp_data.get('d') or []
  54. episode_data = try_get(
  55. amp_data,
  56. lambda a: next(x for x in a
  57. if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
  58. dict)
  59. if not episode_data:
  60. # try pre 2021-11 page type: TODO: consider deleting if no longer used
  61. ember_data = self._parse_json(self._search_regex(
  62. r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
  63. webpage, 'ember data'), episode_id) or {}
  64. ember_data = ember_data.get(episode_id) or ember_data
  65. episode_data = try_get(ember_data, lambda x: x['data'], dict)
  66. episode = episode_data['attributes']
  67. description = episode.get('description') or {}
  68. series = None
  69. for inc in (amp_data or ember_data.get('included') or []):
  70. if inc.get('type') == 'media/podcast':
  71. series = try_get(inc, lambda x: x['attributes']['name'])
  72. series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
  73. info = [{
  74. 'id': episode_id,
  75. 'title': episode['name'],
  76. 'url': clean_podcast_url(episode['assetUrl']),
  77. 'description': description.get('standard') or description.get('short'),
  78. 'timestamp': parse_iso8601(episode.get('releaseDateTime')),
  79. 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
  80. 'series': series,
  81. 'thumbnail': self._og_search_thumbnail(webpage),
  82. }]
  83. self._sort_formats(info)
  84. info = info[0]
  85. codecs = parse_codecs(info.get('ext', 'mp3'))
  86. info.update(codecs)
  87. return info