logo

youtube-dl

[mirror] Download/Watch videos from video hosters
git clone https://hacktivis.me/git/mirror/youtube-dl.git

canvas.py (15030B)


  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from .gigya import GigyaBaseIE
  6. from ..compat import compat_HTTPError
  7. from ..utils import (
  8. ExtractorError,
  9. clean_html,
  10. extract_attributes,
  11. float_or_none,
  12. get_element_by_class,
  13. int_or_none,
  14. merge_dicts,
  15. str_or_none,
  16. strip_or_none,
  17. url_or_none,
  18. )
  19. class CanvasIE(InfoExtractor):
  20. _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
  21. _TESTS = [{
  22. 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  23. 'md5': '68993eda72ef62386a15ea2cf3c93107',
  24. 'info_dict': {
  25. 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  26. 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  27. 'ext': 'mp4',
  28. 'title': 'Nachtwacht: De Greystook',
  29. 'description': 'Nachtwacht: De Greystook',
  30. 'thumbnail': r're:^https?://.*\.jpg$',
  31. 'duration': 1468.04,
  32. },
  33. 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
  34. }, {
  35. 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  36. 'only_matching': True,
  37. }]
  38. _GEO_BYPASS = False
  39. _HLS_ENTRY_PROTOCOLS_MAP = {
  40. 'HLS': 'm3u8_native',
  41. 'HLS_AES': 'm3u8',
  42. }
  43. _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
  44. def _real_extract(self, url):
  45. mobj = re.match(self._VALID_URL, url)
  46. site_id, video_id = mobj.group('site_id'), mobj.group('id')
  47. data = None
  48. if site_id != 'vrtvideo':
  49. # Old API endpoint, serves more formats but may fail for some videos
  50. data = self._download_json(
  51. 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  52. % (site_id, video_id), video_id, 'Downloading asset JSON',
  53. 'Unable to download asset JSON', fatal=False)
  54. # New API endpoint
  55. if not data:
  56. headers = self.geo_verification_headers()
  57. headers.update({'Content-Type': 'application/json'})
  58. token = self._download_json(
  59. '%s/tokens' % self._REST_API_BASE, video_id,
  60. 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
  61. data = self._download_json(
  62. '%s/videos/%s' % (self._REST_API_BASE, video_id),
  63. video_id, 'Downloading video JSON', query={
  64. 'vrtPlayerToken': token,
  65. 'client': '%s@PROD' % site_id,
  66. }, expected_status=400)
  67. if not data.get('title'):
  68. code = data.get('code')
  69. if code == 'AUTHENTICATION_REQUIRED':
  70. self.raise_login_required()
  71. elif code == 'INVALID_LOCATION':
  72. self.raise_geo_restricted(countries=['BE'])
  73. raise ExtractorError(data.get('message') or code, expected=True)
  74. title = data['title']
  75. description = data.get('description')
  76. formats = []
  77. for target in data['targetUrls']:
  78. format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
  79. if not format_url or not format_type:
  80. continue
  81. format_type = format_type.upper()
  82. if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  83. formats.extend(self._extract_m3u8_formats(
  84. format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  85. m3u8_id=format_type, fatal=False))
  86. elif format_type == 'HDS':
  87. formats.extend(self._extract_f4m_formats(
  88. format_url, video_id, f4m_id=format_type, fatal=False))
  89. elif format_type == 'MPEG_DASH':
  90. formats.extend(self._extract_mpd_formats(
  91. format_url, video_id, mpd_id=format_type, fatal=False))
  92. elif format_type == 'HSS':
  93. formats.extend(self._extract_ism_formats(
  94. format_url, video_id, ism_id='mss', fatal=False))
  95. else:
  96. formats.append({
  97. 'format_id': format_type,
  98. 'url': format_url,
  99. })
  100. self._sort_formats(formats)
  101. subtitles = {}
  102. subtitle_urls = data.get('subtitleUrls')
  103. if isinstance(subtitle_urls, list):
  104. for subtitle in subtitle_urls:
  105. subtitle_url = subtitle.get('url')
  106. if subtitle_url and subtitle.get('type') == 'CLOSED':
  107. subtitles.setdefault('nl', []).append({'url': subtitle_url})
  108. return {
  109. 'id': video_id,
  110. 'display_id': video_id,
  111. 'title': title,
  112. 'description': description,
  113. 'formats': formats,
  114. 'duration': float_or_none(data.get('duration'), 1000),
  115. 'thumbnail': data.get('posterImageUrl'),
  116. 'subtitles': subtitles,
  117. }
  118. class CanvasEenIE(InfoExtractor):
  119. IE_DESC = 'canvas.be and een.be'
  120. _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  121. _TESTS = [{
  122. 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
  123. 'md5': 'ed66976748d12350b118455979cca293',
  124. 'info_dict': {
  125. 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  126. 'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
  127. 'ext': 'flv',
  128. 'title': 'De afspraak veilt voor de Warmste Week',
  129. 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
  130. 'thumbnail': r're:^https?://.*\.jpg$',
  131. 'duration': 49.02,
  132. },
  133. 'expected_warnings': ['is not a supported codec'],
  134. }, {
  135. # with subtitles
  136. 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
  137. 'info_dict': {
  138. 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
  139. 'display_id': 'pieter-0167',
  140. 'ext': 'mp4',
  141. 'title': 'Pieter 0167',
  142. 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
  143. 'thumbnail': r're:^https?://.*\.jpg$',
  144. 'duration': 2553.08,
  145. 'subtitles': {
  146. 'nl': [{
  147. 'ext': 'vtt',
  148. }],
  149. },
  150. },
  151. 'params': {
  152. 'skip_download': True,
  153. },
  154. 'skip': 'Pagina niet gevonden',
  155. }, {
  156. 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
  157. 'info_dict': {
  158. 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
  159. 'display_id': 'emma-pakt-thilly-aan',
  160. 'ext': 'mp4',
  161. 'title': 'Emma pakt Thilly aan',
  162. 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
  163. 'thumbnail': r're:^https?://.*\.jpg$',
  164. 'duration': 118.24,
  165. },
  166. 'params': {
  167. 'skip_download': True,
  168. },
  169. 'expected_warnings': ['is not a supported codec'],
  170. }, {
  171. 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
  172. 'only_matching': True,
  173. }]
  174. def _real_extract(self, url):
  175. mobj = re.match(self._VALID_URL, url)
  176. site_id, display_id = mobj.group('site_id'), mobj.group('id')
  177. webpage = self._download_webpage(url, display_id)
  178. title = strip_or_none(self._search_regex(
  179. r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
  180. webpage, 'title', default=None) or self._og_search_title(
  181. webpage, default=None))
  182. video_id = self._html_search_regex(
  183. r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
  184. group='id')
  185. return {
  186. '_type': 'url_transparent',
  187. 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
  188. 'ie_key': CanvasIE.ie_key(),
  189. 'id': video_id,
  190. 'display_id': display_id,
  191. 'title': title,
  192. 'description': self._og_search_description(webpage),
  193. }
  194. class VrtNUIE(GigyaBaseIE):
  195. IE_DESC = 'VrtNU.be'
  196. _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
  197. _TESTS = [{
  198. # Available via old API endpoint
  199. 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
  200. 'info_dict': {
  201. 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
  202. 'ext': 'mp4',
  203. 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
  204. 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
  205. 'duration': 1457.04,
  206. 'thumbnail': r're:^https?://.*\.jpg$',
  207. 'series': 'Postbus X',
  208. 'season': 'Seizoen 1989',
  209. 'season_number': 1989,
  210. 'episode': 'De zwarte weduwe',
  211. 'episode_number': 1,
  212. 'timestamp': 1595822400,
  213. 'upload_date': '20200727',
  214. },
  215. 'skip': 'This video is only available for registered users',
  216. 'params': {
  217. 'username': '<snip>',
  218. 'password': '<snip>',
  219. },
  220. 'expected_warnings': ['is not a supported codec'],
  221. }, {
  222. # Only available via new API endpoint
  223. 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
  224. 'info_dict': {
  225. 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
  226. 'ext': 'mp4',
  227. 'title': 'Aflevering 5',
  228. 'description': 'Wie valt door de mand tijdens een missie?',
  229. 'duration': 2967.06,
  230. 'season': 'Season 1',
  231. 'season_number': 1,
  232. 'episode_number': 5,
  233. },
  234. 'skip': 'This video is only available for registered users',
  235. 'params': {
  236. 'username': '<snip>',
  237. 'password': '<snip>',
  238. },
  239. 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
  240. }]
  241. _NETRC_MACHINE = 'vrtnu'
  242. _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
  243. _CONTEXT_ID = 'R3595707040'
  244. def _real_initialize(self):
  245. self._login()
  246. def _login(self):
  247. username, password = self._get_login_info()
  248. if username is None:
  249. return
  250. auth_data = {
  251. 'APIKey': self._APIKEY,
  252. 'targetEnv': 'jssdk',
  253. 'loginID': username,
  254. 'password': password,
  255. 'authMode': 'cookie',
  256. }
  257. auth_info = self._gigya_login(auth_data)
  258. # Sometimes authentication fails for no good reason, retry
  259. login_attempt = 1
  260. while login_attempt <= 3:
  261. try:
  262. # When requesting a token, no actual token is returned, but the
  263. # necessary cookies are set.
  264. self._request_webpage(
  265. 'https://token.vrt.be',
  266. None, note='Requesting a token', errnote='Could not get a token',
  267. headers={
  268. 'Content-Type': 'application/json',
  269. 'Referer': 'https://www.vrt.be/vrtnu/',
  270. },
  271. data=json.dumps({
  272. 'uid': auth_info['UID'],
  273. 'uidsig': auth_info['UIDSignature'],
  274. 'ts': auth_info['signatureTimestamp'],
  275. 'email': auth_info['profile']['email'],
  276. }).encode('utf-8'))
  277. except ExtractorError as e:
  278. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  279. login_attempt += 1
  280. self.report_warning('Authentication failed')
  281. self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
  282. else:
  283. raise e
  284. else:
  285. break
  286. def _real_extract(self, url):
  287. display_id = self._match_id(url)
  288. webpage = self._download_webpage(url, display_id)
  289. attrs = extract_attributes(self._search_regex(
  290. r'(<nui-media[^>]+>)', webpage, 'media element'))
  291. video_id = attrs['videoid']
  292. publication_id = attrs.get('publicationid')
  293. if publication_id:
  294. video_id = publication_id + '$' + video_id
  295. page = (self._parse_json(self._search_regex(
  296. r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
  297. default='{}'), video_id, fatal=False) or {}).get('page') or {}
  298. info = self._search_json_ld(webpage, display_id, default={})
  299. return merge_dicts(info, {
  300. '_type': 'url_transparent',
  301. 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
  302. 'ie_key': CanvasIE.ie_key(),
  303. 'id': video_id,
  304. 'display_id': display_id,
  305. 'season_number': int_or_none(page.get('episode_season')),
  306. })
  307. class DagelijkseKostIE(InfoExtractor):
  308. IE_DESC = 'dagelijksekost.een.be'
  309. _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
  310. _TEST = {
  311. 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
  312. 'md5': '30bfffc323009a3e5f689bef6efa2365',
  313. 'info_dict': {
  314. 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
  315. 'display_id': 'hachis-parmentier-met-witloof',
  316. 'ext': 'mp4',
  317. 'title': 'Hachis parmentier met witloof',
  318. 'description': 'md5:9960478392d87f63567b5b117688cdc5',
  319. 'thumbnail': r're:^https?://.*\.jpg$',
  320. 'duration': 283.02,
  321. },
  322. 'expected_warnings': ['is not a supported codec'],
  323. }
  324. def _real_extract(self, url):
  325. display_id = self._match_id(url)
  326. webpage = self._download_webpage(url, display_id)
  327. title = strip_or_none(get_element_by_class(
  328. 'dish-metadata__title', webpage
  329. ) or self._html_search_meta(
  330. 'twitter:title', webpage))
  331. description = clean_html(get_element_by_class(
  332. 'dish-description', webpage)
  333. ) or self._html_search_meta(
  334. ('description', 'twitter:description', 'og:description'),
  335. webpage)
  336. video_id = self._html_search_regex(
  337. r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
  338. group='id')
  339. return {
  340. '_type': 'url_transparent',
  341. 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
  342. 'ie_key': CanvasIE.ie_key(),
  343. 'id': video_id,
  344. 'display_id': display_id,
  345. 'title': title,
  346. 'description': description,
  347. }