logo

youtube-dl

[mirror] Download/Watch videos from video hosters - git clone https://hacktivis.me/git/mirror/youtube-dl.git

ted.py (13942B)


  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import (
  6. compat_str,
  7. compat_urlparse
  8. )
  9. from ..utils import (
  10. extract_attributes,
  11. float_or_none,
  12. int_or_none,
  13. try_get,
  14. url_or_none,
  15. )
  16. class TEDIE(InfoExtractor):
  17. IE_NAME = 'ted'
  18. _VALID_URL = r'''(?x)
  19. (?P<proto>https?://)
  20. (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  21. (
  22. (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
  23. |
  24. ((?P<type_talk>talks)) # We have a simple talk
  25. |
  26. (?P<type_watch>watch)/[^/]+/[^/]+
  27. )
  28. (/lang/(.*?))? # The url may contain the language
  29. /(?P<name>[\w-]+) # Here goes the name and then ".html"
  30. .*)$
  31. '''
  32. _TESTS = [{
  33. 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  34. 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
  35. 'info_dict': {
  36. 'id': '102',
  37. 'ext': 'mp4',
  38. 'title': 'The illusion of consciousness',
  39. 'description': ('Philosopher Dan Dennett makes a compelling '
  40. 'argument that not only don\'t we understand our own '
  41. 'consciousness, but that half the time our brains are '
  42. 'actively fooling us.'),
  43. 'uploader': 'Dan Dennett',
  44. 'width': 853,
  45. 'duration': 1308,
  46. 'view_count': int,
  47. 'comment_count': int,
  48. 'tags': list,
  49. },
  50. 'params': {
  51. 'skip_download': True,
  52. },
  53. }, {
  54. # missing HTTP bitrates
  55. 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
  56. 'info_dict': {
  57. 'id': '6069',
  58. 'ext': 'mp4',
  59. 'title': 'The beauty and power of algorithms',
  60. 'thumbnail': r're:^https?://.+\.jpg',
  61. 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
  62. 'uploader': 'Vishal Sikka',
  63. },
  64. 'params': {
  65. 'skip_download': True,
  66. },
  67. }, {
  68. 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  69. 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
  70. 'info_dict': {
  71. 'id': '1972',
  72. 'ext': 'mp4',
  73. 'title': 'Be passionate. Be courageous. Be your best.',
  74. 'uploader': 'Gabby Giffords and Mark Kelly',
  75. 'description': 'md5:5174aed4d0f16021b704120360f72b92',
  76. 'duration': 1128,
  77. },
  78. 'params': {
  79. 'skip_download': True,
  80. },
  81. }, {
  82. 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  83. 'info_dict': {
  84. 'id': '10',
  85. 'title': 'Who are the hackers?',
  86. 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
  87. },
  88. 'playlist_mincount': 6,
  89. }, {
  90. # contains a youtube video
  91. 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  92. 'add_ie': ['Youtube'],
  93. 'info_dict': {
  94. 'id': '_ZG8HBuDjgc',
  95. 'ext': 'webm',
  96. 'title': 'Douglas Adams: Parrots the Universe and Everything',
  97. 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  98. 'uploader': 'University of California Television (UCTV)',
  99. 'uploader_id': 'UCtelevision',
  100. 'upload_date': '20080522',
  101. },
  102. 'params': {
  103. 'skip_download': True,
  104. },
  105. }, {
  106. # no nativeDownloads
  107. 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
  108. 'info_dict': {
  109. 'id': '1792',
  110. 'ext': 'mp4',
  111. 'title': 'The orchestra in my mouth',
  112. 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
  113. 'uploader': 'Tom Thum',
  114. 'view_count': int,
  115. 'comment_count': int,
  116. 'tags': list,
  117. },
  118. 'params': {
  119. 'skip_download': True,
  120. },
  121. }, {
  122. # with own formats and private Youtube external
  123. 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
  124. 'only_matching': True,
  125. }]
  126. _NATIVE_FORMATS = {
  127. 'low': {'width': 320, 'height': 180},
  128. 'medium': {'width': 512, 'height': 288},
  129. 'high': {'width': 854, 'height': 480},
  130. }
  131. def _extract_info(self, webpage):
  132. info_json = self._search_regex(
  133. r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
  134. webpage, 'info json')
  135. return json.loads(info_json)
  136. def _real_extract(self, url):
  137. m = re.match(self._VALID_URL, url, re.VERBOSE)
  138. if m.group('type').startswith('embed'):
  139. desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  140. return self.url_result(desktop_url, 'TED')
  141. name = m.group('name')
  142. if m.group('type_talk'):
  143. return self._talk_info(url, name)
  144. elif m.group('type_watch'):
  145. return self._watch_info(url, name)
  146. else:
  147. return self._playlist_videos_info(url, name)
  148. def _playlist_videos_info(self, url, name):
  149. '''Returns the videos of the playlist'''
  150. webpage = self._download_webpage(url, name,
  151. 'Downloading playlist webpage')
  152. playlist_entries = []
  153. for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
  154. attrs = extract_attributes(entry)
  155. entry_url = compat_urlparse.urljoin(url, attrs['href'])
  156. playlist_entries.append(self.url_result(entry_url, self.ie_key()))
  157. final_url = self._og_search_url(webpage, fatal=False)
  158. playlist_id = (
  159. re.match(self._VALID_URL, final_url).group('playlist_id')
  160. if final_url else None)
  161. return self.playlist_result(
  162. playlist_entries, playlist_id=playlist_id,
  163. playlist_title=self._og_search_title(webpage, fatal=False),
  164. playlist_description=self._og_search_description(webpage))
  165. def _talk_info(self, url, video_name):
  166. webpage = self._download_webpage(url, video_name)
  167. info = self._extract_info(webpage)
  168. data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
  169. talk_info = data['talks'][0]
  170. title = talk_info['title'].strip()
  171. downloads = talk_info.get('downloads') or {}
  172. native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
  173. formats = [{
  174. 'url': format_url,
  175. 'format_id': format_id,
  176. } for (format_id, format_url) in native_downloads.items() if format_url is not None]
  177. subtitled_downloads = downloads.get('subtitledDownloads') or {}
  178. for lang, subtitled_download in subtitled_downloads.items():
  179. for q in self._NATIVE_FORMATS:
  180. q_url = subtitled_download.get(q)
  181. if not q_url:
  182. continue
  183. formats.append({
  184. 'url': q_url,
  185. 'format_id': '%s-%s' % (q, lang),
  186. 'language': lang,
  187. })
  188. if formats:
  189. for f in formats:
  190. finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
  191. if finfo:
  192. f.update(finfo)
  193. player_talk = talk_info['player_talks'][0]
  194. resources_ = player_talk.get('resources') or talk_info.get('resources')
  195. http_url = None
  196. for format_id, resources in resources_.items():
  197. if format_id == 'hls':
  198. if not isinstance(resources, dict):
  199. continue
  200. stream_url = url_or_none(resources.get('stream'))
  201. if not stream_url:
  202. continue
  203. formats.extend(self._extract_m3u8_formats(
  204. stream_url, video_name, 'mp4', m3u8_id=format_id,
  205. fatal=False))
  206. else:
  207. if not isinstance(resources, list):
  208. continue
  209. if format_id == 'h264':
  210. for resource in resources:
  211. h264_url = resource.get('file')
  212. if not h264_url:
  213. continue
  214. bitrate = int_or_none(resource.get('bitrate'))
  215. formats.append({
  216. 'url': h264_url,
  217. 'format_id': '%s-%sk' % (format_id, bitrate),
  218. 'tbr': bitrate,
  219. })
  220. if re.search(r'\d+k', h264_url):
  221. http_url = h264_url
  222. elif format_id == 'rtmp':
  223. streamer = talk_info.get('streamer')
  224. if not streamer:
  225. continue
  226. for resource in resources:
  227. formats.append({
  228. 'format_id': '%s-%s' % (format_id, resource.get('name')),
  229. 'url': streamer,
  230. 'play_path': resource['file'],
  231. 'ext': 'flv',
  232. 'width': int_or_none(resource.get('width')),
  233. 'height': int_or_none(resource.get('height')),
  234. 'tbr': int_or_none(resource.get('bitrate')),
  235. })
  236. m3u8_formats = list(filter(
  237. lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
  238. formats))
  239. if http_url:
  240. for m3u8_format in m3u8_formats:
  241. bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
  242. if not bitrate:
  243. continue
  244. bitrate_url = re.sub(r'\d+k', bitrate, http_url)
  245. if not self._is_valid_url(
  246. bitrate_url, video_name, '%s bitrate' % bitrate):
  247. continue
  248. f = m3u8_format.copy()
  249. f.update({
  250. 'url': bitrate_url,
  251. 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
  252. 'protocol': 'http',
  253. })
  254. if f.get('acodec') == 'none':
  255. del f['acodec']
  256. formats.append(f)
  257. audio_download = talk_info.get('audioDownload')
  258. if audio_download:
  259. formats.append({
  260. 'url': audio_download,
  261. 'format_id': 'audio',
  262. 'vcodec': 'none',
  263. })
  264. if not formats:
  265. external = player_talk.get('external')
  266. if isinstance(external, dict):
  267. service = external.get('service')
  268. if isinstance(service, compat_str):
  269. ext_url = None
  270. if service.lower() == 'youtube':
  271. ext_url = external.get('code')
  272. return self.url_result(ext_url or external['uri'])
  273. self._sort_formats(formats)
  274. video_id = compat_str(talk_info['id'])
  275. return {
  276. 'id': video_id,
  277. 'title': title,
  278. 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
  279. 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
  280. 'description': self._og_search_description(webpage),
  281. 'subtitles': self._get_subtitles(video_id, talk_info),
  282. 'formats': formats,
  283. 'duration': float_or_none(talk_info.get('duration')),
  284. 'view_count': int_or_none(data.get('viewed_count')),
  285. 'comment_count': int_or_none(
  286. try_get(data, lambda x: x['comments']['count'])),
  287. 'tags': try_get(talk_info, lambda x: x['tags'], list),
  288. }
  289. def _get_subtitles(self, video_id, talk_info):
  290. sub_lang_list = {}
  291. for language in try_get(
  292. talk_info,
  293. (lambda x: x['downloads']['languages'],
  294. lambda x: x['languages']), list):
  295. lang_code = language.get('languageCode') or language.get('ianaCode')
  296. if not lang_code:
  297. continue
  298. sub_lang_list[lang_code] = [
  299. {
  300. 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
  301. 'ext': ext,
  302. }
  303. for ext in ['ted', 'srt']
  304. ]
  305. return sub_lang_list
  306. def _watch_info(self, url, name):
  307. webpage = self._download_webpage(url, name)
  308. config_json = self._html_search_regex(
  309. r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
  310. webpage, 'config', default=None)
  311. if not config_json:
  312. embed_url = self._search_regex(
  313. r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
  314. return self.url_result(self._proto_relative_url(embed_url))
  315. config = json.loads(config_json)['config']
  316. video_url = config['video']['url']
  317. thumbnail = config.get('image', {}).get('url')
  318. title = self._html_search_regex(
  319. r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
  320. description = self._html_search_regex(
  321. [
  322. r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
  323. r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
  324. ],
  325. webpage, 'description', fatal=False)
  326. return {
  327. 'id': name,
  328. 'url': video_url,
  329. 'title': title,
  330. 'thumbnail': thumbnail,
  331. 'description': description,
  332. }