logo

youtube-dl

[mirror] Download/Watch videos from video hosters
git clone https://hacktivis.me/git/mirror/youtube-dl.git

bbc.py (71662B)


  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import functools
  4. import itertools
  5. import json
  6. import re
  7. from .common import InfoExtractor
  8. from ..compat import (
  9. compat_etree_Element,
  10. compat_HTTPError,
  11. compat_parse_qs,
  12. compat_str,
  13. compat_urllib_error,
  14. compat_urllib_parse_urlparse,
  15. compat_urlparse,
  16. )
  17. from ..utils import (
  18. ExtractorError,
  19. OnDemandPagedList,
  20. clean_html,
  21. dict_get,
  22. float_or_none,
  23. get_element_by_class,
  24. int_or_none,
  25. js_to_json,
  26. parse_duration,
  27. parse_iso8601,
  28. strip_or_none,
  29. try_get,
  30. unescapeHTML,
  31. unified_timestamp,
  32. url_or_none,
  33. urlencode_postdata,
  34. urljoin,
  35. )
  36. class BBCCoUkIE(InfoExtractor):
  37. IE_NAME = 'bbc.co.uk'
  38. IE_DESC = 'BBC iPlayer'
  39. _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
  40. _VALID_URL = r'''(?x)
  41. https?://
  42. (?:www\.)?bbc\.co\.uk/
  43. (?:
  44. programmes/(?!articles/)|
  45. iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
  46. music/(?:clips|audiovideo/popular)[/#]|
  47. radio/player/|
  48. sounds/play/|
  49. events/[^/]+/play/[^/]+/
  50. )
  51. (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
  52. ''' % _ID_REGEX
  53. _LOGIN_URL = 'https://account.bbc.com/signin'
  54. _NETRC_MACHINE = 'bbc'
  55. _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
  56. _MEDIA_SETS = [
  57. # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
  58. # with geolocation in some cases even when the programme is not geo-restricted at all (e.g.
  59. # http://www.bbc.co.uk/programmes/b06bp7lf). It may also fail with selectionunavailable.
  60. 'iptv-all',
  61. 'pc',
  62. ]
  63. _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  64. _TESTS = [
  65. {
  66. 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  67. 'info_dict': {
  68. 'id': 'b039d07m',
  69. 'ext': 'flv',
  70. 'title': 'Kaleidoscope, Leonard Cohen',
  71. 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  72. },
  73. 'params': {
  74. # rtmp download
  75. 'skip_download': True,
  76. }
  77. },
  78. {
  79. 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  80. 'info_dict': {
  81. 'id': 'b00yng1d',
  82. 'ext': 'flv',
  83. 'title': 'The Man in Black: Series 3: The Printed Name',
  84. 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  85. 'duration': 1800,
  86. },
  87. 'params': {
  88. # rtmp download
  89. 'skip_download': True,
  90. },
  91. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  92. },
  93. {
  94. 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  95. 'info_dict': {
  96. 'id': 'b00yng1d',
  97. 'ext': 'flv',
  98. 'title': 'The Voice UK: Series 3: Blind Auditions 5',
  99. 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
  100. 'duration': 5100,
  101. },
  102. 'params': {
  103. # rtmp download
  104. 'skip_download': True,
  105. },
  106. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  107. },
  108. {
  109. 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
  110. 'info_dict': {
  111. 'id': 'b03k3pb7',
  112. 'ext': 'flv',
  113. 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
  114. 'description': '2. Invasion',
  115. 'duration': 3600,
  116. },
  117. 'params': {
  118. # rtmp download
  119. 'skip_download': True,
  120. },
  121. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  122. }, {
  123. 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
  124. 'info_dict': {
  125. 'id': 'b04v209v',
  126. 'ext': 'flv',
  127. 'title': 'Pete Tong, The Essential New Tune Special',
  128. 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
  129. 'duration': 10800,
  130. },
  131. 'params': {
  132. # rtmp download
  133. 'skip_download': True,
  134. },
  135. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  136. }, {
  137. 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
  138. 'note': 'Audio',
  139. 'info_dict': {
  140. 'id': 'p022h44j',
  141. 'ext': 'flv',
  142. 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
  143. 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
  144. 'duration': 227,
  145. },
  146. 'params': {
  147. # rtmp download
  148. 'skip_download': True,
  149. }
  150. }, {
  151. 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
  152. 'note': 'Video',
  153. 'info_dict': {
  154. 'id': 'p025c103',
  155. 'ext': 'flv',
  156. 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
  157. 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
  158. 'duration': 226,
  159. },
  160. 'params': {
  161. # rtmp download
  162. 'skip_download': True,
  163. }
  164. }, {
  165. 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
  166. 'info_dict': {
  167. 'id': 'p02n76xf',
  168. 'ext': 'flv',
  169. 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
  170. 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
  171. 'duration': 3540,
  172. },
  173. 'params': {
  174. # rtmp download
  175. 'skip_download': True,
  176. },
  177. 'skip': 'geolocation',
  178. }, {
  179. 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
  180. 'info_dict': {
  181. 'id': 'b05zmgw1',
  182. 'ext': 'flv',
  183. 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
  184. 'title': 'Royal Academy Summer Exhibition',
  185. 'duration': 3540,
  186. },
  187. 'params': {
  188. # rtmp download
  189. 'skip_download': True,
  190. },
  191. 'skip': 'geolocation',
  192. }, {
  193. # iptv-all mediaset fails with geolocation however there is no geo restriction
  194. # for this programme at all
  195. 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
  196. 'info_dict': {
  197. 'id': 'b06rkms3',
  198. 'ext': 'flv',
  199. 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
  200. 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
  201. },
  202. 'params': {
  203. # rtmp download
  204. 'skip_download': True,
  205. },
  206. 'skip': 'Now it\'s really geo-restricted',
  207. }, {
  208. # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
  209. 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
  210. 'info_dict': {
  211. 'id': 'p028bfkj',
  212. 'ext': 'flv',
  213. 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  214. 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  215. },
  216. 'params': {
  217. # rtmp download
  218. 'skip_download': True,
  219. },
  220. }, {
  221. 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
  222. 'note': 'Audio',
  223. 'info_dict': {
  224. 'id': 'm0007jz9',
  225. 'ext': 'mp4',
  226. 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
  227. 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
  228. 'duration': 9840,
  229. },
  230. 'params': {
  231. # rtmp download
  232. 'skip_download': True,
  233. }
  234. }, {
  235. 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
  236. 'only_matching': True,
  237. }, {
  238. 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
  239. 'only_matching': True,
  240. }, {
  241. 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
  242. 'only_matching': True,
  243. }, {
  244. 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
  245. 'only_matching': True,
  246. }, {
  247. 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
  248. 'only_matching': True,
  249. }, {
  250. 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
  251. 'only_matching': True,
  252. }, {
  253. 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
  254. 'only_matching': True,
  255. }, {
  256. 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
  257. 'only_matching': True,
  258. }]
  def _login(self):
      """Sign in with the credentials returned by ``_get_login_info()``.

      Returns silently when no username is available. Otherwise the sign-in
      page is fetched, its hidden inputs are posted together with the
      credentials to the form's action URL, and ``ExtractorError`` is raised
      if the final URL still contains the sign-in URL.
      """
      username, password = self._get_login_info()
      if username is None:
          return

      signin_page = self._download_webpage(
          self._LOGIN_URL, None, 'Downloading signin page')

      # Start from the page's hidden inputs, then add the credentials
      form_data = self._hidden_inputs(signin_page)
      form_data.update({
          'username': username,
          'password': password,
      })

      # The form action may be relative, so resolve it against the login URL
      form_action = self._search_regex(
          r'<form[^>]+action=(["\'])(?P<url>.+?)\1', signin_page,
          'post url', default=self._LOGIN_URL, group='url')
      post_url = urljoin(self._LOGIN_URL, form_action)

      response, urlh = self._download_webpage_handle(
          post_url, None, 'Logging in',
          data=urlencode_postdata(form_data),
          headers={'Referer': self._LOGIN_URL})

      # Ending up anywhere other than the sign-in URL counts as success
      if self._LOGIN_URL not in urlh.geturl():
          return

      error = clean_html(get_element_by_class('form-message', response))
      if error:
          raise ExtractorError(
              'Unable to login: %s' % error, expected=True)
      raise ExtractorError('Unable to log in')
  def _real_initialize(self):
      """Initialisation hook: delegates to ``_login()``."""
      self._login()
  class MediaSelectionError(Exception):
      """Raised when a media selection response carries an error ``result``.

      The error code reported in the response is kept in ``id``.
      """

      def __init__(self, id):
          # Forward the code to Exception so that args/str()/repr() are not
          # empty; the explicit base call works on both Python 2 and 3.
          Exception.__init__(self, id)
          self.id = id
  def _extract_asx_playlist(self, connection, programme_id):
      """Download the ASX playlist behind ``connection`` and list its refs.

      Returns the ``href`` attribute of every ``./Entry/ref`` element, in
      the order ``findall`` yields them.
      """
      asx_url = connection.get('href')
      asx_doc = self._download_xml(asx_url, programme_id, 'Downloading ASX playlist')
      hrefs = []
      for ref_el in asx_doc.findall('./Entry/ref'):
          hrefs.append(ref_el.get('href'))
      return hrefs
  290. def _extract_items(self, playlist):
  291. return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
  292. def _extract_medias(self, media_selection):
  293. error = media_selection.get('result')
  294. if error:
  295. raise BBCCoUkIE.MediaSelectionError(error)
  296. return media_selection.get('media') or []
  297. def _extract_connections(self, media):
  298. return media.get('connection') or []
  def _get_subtitles(self, media, programme_id):
      """Return English TTML subtitles for a captions ``media`` entry.

      Connections are tried in order. The first one whose ``href`` passes
      ``url_or_none`` and whose captions download yields an XML element is
      used, and the search stops there.

      Returns a dict that is either empty or of the form
      ``{'en': [{'url': ..., 'ext': 'ttml'}]}``.
      """
      subtitles = {}
      for connection in self._extract_connections(media):
          cc_url = url_or_none(connection.get('href'))
          if not cc_url:
              continue
          # Non-fatal download: a failure does not abort extraction; the
          # isinstance check below then skips this connection.
          captions = self._download_xml(
              cc_url, programme_id, 'Downloading captions', fatal=False)
          if not isinstance(captions, compat_etree_Element):
              continue
          subtitles['en'] = [
              {
                  # Record the exact URL that was validated and fetched
                  # above rather than the raw attribute value.
                  'url': cc_url,
                  'ext': 'ttml',
              },
          ]
          break
      return subtitles
  def _raise_extractor_error(self, media_selection_error):
      """Raise an expected ``ExtractorError`` for a media selection error.

      The message combines ``IE_NAME`` with the error's ``id``.
      """
      message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
      raise ExtractorError(message, expected=True)
  def _download_media_selector(self, programme_id):
      """Query the media selector for ``programme_id``, one media set at a time.

      Media sets from ``_MEDIA_SETS`` are tried in order and the first
      successful result is returned. A ``MediaSelectionError`` whose id is
      ``notukerror``, ``geolocation`` or ``selectionunavailable`` moves on
      to the next media set. Any other id is re-raised immediately through
      ``_raise_extractor_error``. If every media set fails, the last such
      error is re-raised the same way.
      """
      retryable_ids = ('notukerror', 'geolocation', 'selectionunavailable')
      most_recent_error = None
      for media_set in self._MEDIA_SETS:
          selector_url = self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id)
          try:
              return self._download_media_selector_url(selector_url, programme_id)
          except BBCCoUkIE.MediaSelectionError as err:
              if err.id not in retryable_ids:
                  self._raise_extractor_error(err)
              most_recent_error = err
      self._raise_extractor_error(most_recent_error)
  def _download_media_selector_url(self, url, programme_id=None):
      """Fetch the media selection JSON at ``url`` and process it.

      HTTP 403 and 404 are passed to the download as ``expected_status``.
      Returns whatever ``_process_media_selector`` returns.
      """
      selection = self._download_json(
          url, programme_id, 'Downloading media selection JSON',
          expected_status=(403, 404))
      return self._process_media_selector(selection, programme_id)
  def _process_media_selector(self, media_selection, programme_id):
      """Turn a media selection into a list of formats plus subtitles.

      Each ``video``/``audio`` media entry contributes formats from its
      connections; a ``captions`` entry supplies the subtitles.

      Returns a ``(formats, subtitles)`` tuple. ``subtitles`` stays ``None``
      when no ``captions`` media is present. Errors reported by the media
      selection are raised by ``_extract_medias``.
      """
      formats = []
      subtitles = None
      # hrefs already handled, shared across all medias, to skip duplicates
      urls = []
      for media in self._extract_medias(media_selection):
          kind = media.get('kind')
          if kind in ('video', 'audio'):
              # Media-level properties shared by every connection of this media
              bitrate = int_or_none(media.get('bitrate'))
              encoding = media.get('encoding')
              width = int_or_none(media.get('width'))
              height = int_or_none(media.get('height'))
              file_size = int_or_none(media.get('media_file_size'))
              for connection in self._extract_connections(media):
                  href = connection.get('href')
                  if href in urls:
                      continue
                  if href:
                      urls.append(href)
                  conn_kind = connection.get('kind')
                  protocol = connection.get('protocol')
                  supplier = connection.get('supplier')
                  transfer_format = connection.get('transferFormat')
                  # First non-empty of supplier / connection kind / protocol
                  format_id = supplier or conn_kind or protocol
                  # ASX playlist
                  if supplier == 'asx':
                      for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                          formats.append({
                              'url': ref,
                              'format_id': 'ref%s_%s' % (i, format_id),
                          })
                  elif transfer_format == 'dash':
                      formats.extend(self._extract_mpd_formats(
                          href, programme_id, mpd_id=format_id, fatal=False))
                  elif transfer_format == 'hls':
                      # TODO: let expected_status be passed into _extract_xxx_formats() instead
                      try:
                          fmts = self._extract_m3u8_formats(
                              href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                              m3u8_id=format_id, fatal=False)
                      except ExtractorError as e:
                          # Only HTTP 403/404 failures are tolerated here;
                          # anything else propagates
                          if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
                                  and e.exc_info[1].code in (403, 404)):
                              raise
                          fmts = []
                      formats.extend(fmts)
                  elif transfer_format == 'hds':
                      formats.extend(self._extract_f4m_formats(
                          href, programme_id, f4m_id=format_id, fatal=False))
                  else:
                      # NOTE(review): format_id is None when the connection has
                      # no supplier, kind or protocol; the += below would then
                      # raise TypeError if bitrate is set - confirm this cannot
                      # occur in practice.
                      if not supplier and bitrate:
                          format_id += '-%d' % bitrate
                      fmt = {
                          'format_id': format_id,
                          'filesize': file_size,
                      }
                      if kind == 'video':
                          fmt.update({
                              'width': width,
                              'height': height,
                              'tbr': bitrate,
                              'vcodec': encoding,
                          })
                      else:
                          # Audio-only: the bitrate is the audio bitrate
                          fmt.update({
                              'abr': bitrate,
                              'acodec': encoding,
                              'vcodec': 'none',
                          })
                      if protocol in ('http', 'https'):
                          # Direct link
                          fmt.update({
                              'url': href,
                          })
                      elif protocol == 'rtmp':
                          application = connection.get('application', 'ondemand')
                          auth_string = connection.get('authString')
                          identifier = connection.get('identifier')
                          server = connection.get('server')
                          fmt.update({
                              'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                              'play_path': identifier,
                              'app': '%s?%s' % (application, auth_string),
                              'page_url': 'http://www.bbc.co.uk',
                              'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                              'rtmp_live': False,
                              'ext': 'flv',
                          })
                      else:
                          # Any other protocol: the format is dropped
                          continue
                      formats.append(fmt)
          elif kind == 'captions':
              subtitles = self.extract_subtitles(media, programme_id)
      return formats, subtitles
  def _download_playlist(self, playlist_id):
      """Resolve ``playlist_id`` to media info via the programmes playlist JSON.

      Returns a tuple ``(programme_id, title, description, duration,
      formats, subtitles)`` built from the ``smpConfig`` of the playlist's
      ``defaultAvailableVersion``. When several items are programmes, the
      last one wins (the media selector is still queried for each).

      Falls back to the legacy XML playlist when the JSON download fails
      with HTTP 404, when there is no default available version, or when
      none of its items is a ``programme``/``radioProgramme``. Any other
      ``ExtractorError`` is re-raised.
      """
      try:
          playlist = self._download_json(
              'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
              playlist_id, 'Downloading playlist JSON')

          version = playlist.get('defaultAvailableVersion')
          if version:
              smp_config = version['smpConfig']
              title = smp_config['title']
              description = smp_config['summary']
              # Stays None when no item is playable. Previously the return
              # statement then referenced unbound locals and crashed with
              # UnboundLocalError.
              result = None
              for item in smp_config['items']:
                  kind = item['kind']
                  if kind not in ('programme', 'radioProgramme'):
                      continue
                  programme_id = item.get('vpid')
                  duration = int_or_none(item.get('duration'))
                  formats, subtitles = self._download_media_selector(programme_id)
                  result = (programme_id, title, description, duration, formats, subtitles)
              if result is not None:
                  return result
      except ExtractorError as ee:
          if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
              raise

      # fallback to legacy playlist
      return self._process_legacy_playlist(playlist_id)
  def _process_legacy_playlist_url(self, url, display_id):
      """Download the legacy playlist at ``url`` and extract its media info."""
      legacy_playlist = self._download_legacy_playlist_url(url, display_id)
      return self._extract_from_legacy_playlist(legacy_playlist, display_id)
  def _process_legacy_playlist(self, playlist_id):
      """Process the legacy iPlayer playlist belonging to ``playlist_id``."""
      legacy_url = 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id
      return self._process_legacy_playlist_url(legacy_url, playlist_id)
  def _download_legacy_playlist_url(self, url, playlist_id=None):
      """Download the legacy playlist XML at ``url`` and return the parsed result."""
      note = 'Downloading legacy playlist XML'
      return self._download_xml(url, playlist_id, note)
  def _extract_from_legacy_playlist(self, playlist, playlist_id):
      """Extract media info from a legacy EMP playlist XML element.

      Returns a tuple ``(programme_id, title, description, duration,
      formats, subtitles)`` for the last ``programme``/``radioProgramme``
      item of ``playlist``.

      Raises ``ExtractorError`` (expected) when the playlist has a
      ``noItems`` element, using its ``reason`` for the message, or when it
      contains no programme item at all.
      """
      ns = self._EMP_PLAYLIST_NS

      no_items = playlist.find('./{%s}noItems' % ns)
      if no_items is not None:
          reason = no_items.get('reason')
          if reason == 'preAvailability':
              msg = 'Episode %s is not yet available' % playlist_id
          elif reason == 'postAvailability':
              msg = 'Episode %s is no longer available' % playlist_id
          elif reason == 'noMedia':
              msg = 'Episode %s is not currently available' % playlist_id
          else:
              msg = 'Episode %s is not available: %s' % (playlist_id, reason)
          raise ExtractorError(msg, expected=True)

      def get_from_attributes(element):
          # First of the identifier/group attributes that looks like a pid
          for attr in ('identifier', 'group'):
              value = element.get(attr)
              if value and re.match(r'^[pb][\da-z]{7}$', value):
                  return value
          return None

      def get_programme_id(item):
          # The mediator element keeps priority, as before. The item's own
          # attributes used to be looked up and then thrown away; they now
          # serve as a fallback when the mediator yields nothing.
          mediator = item.find('./{%s}mediator' % ns)
          if mediator is not None:
              mediator_id = get_from_attributes(mediator)
              if mediator_id:
                  return mediator_id
          return get_from_attributes(item)

      # Stays None when no item is playable. Previously the return statement
      # then referenced unbound locals and crashed with UnboundLocalError.
      result = None
      for item in self._extract_items(playlist):
          kind = item.get('kind')
          if kind not in ('programme', 'radioProgramme'):
              continue
          title = playlist.find('./{%s}title' % ns).text
          description_el = playlist.find('./{%s}summary' % ns)
          description = description_el.text if description_el is not None else None

          programme_id = get_programme_id(item)
          duration = int_or_none(item.get('duration'))

          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
          else:
              # No usable pid: treat the item itself as the media selection
              # and identify the result by the playlist id
              formats, subtitles = self._process_media_selector(item, playlist_id)
              programme_id = playlist_id
          result = (programme_id, title, description, duration, formats, subtitles)

      if result is None:
          raise ExtractorError(
              'No playable items found in playlist %s' % playlist_id, expected=True)
      return result
  def _real_extract(self, url):
      """Extract a single programme from a bbc.co.uk page.

      The page is checked for a player error message first. The programme
      id is then taken from the ``mediator.bind(...)`` player JSON, or
      failing that from the first ``"vpid"`` occurrence in the page. With
      an id, formats come from the media selector and title/description
      from the page; without one, everything comes from
      ``_download_playlist``.
      """
      group_id = self._match_id(url)
      webpage = self._download_webpage(url, group_id, 'Downloading video page')

      page_error = self._search_regex(
          r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
          webpage, 'error', default=None)
      if page_error:
          raise ExtractorError(page_error, expected=True)

      programme_id = None
      duration = None

      player_json = self._search_regex(
          r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
          webpage, 'player', default=None)
      if player_json:
          player = self._parse_json(player_json, group_id).get('player', {})
          duration = int_or_none(player.get('duration'))
          programme_id = player.get('vpid')

      if not programme_id:
          vpid_pattern = r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX
          programme_id = self._search_regex(
              vpid_pattern, webpage, 'vpid', fatal=False, default=None)

      if not programme_id:
          # Nothing usable in the page itself: go through the playlist
          (programme_id, title, description,
           duration, formats, subtitles) = self._download_playlist(group_id)
      else:
          formats, subtitles = self._download_media_selector(programme_id)
          title = self._og_search_title(webpage, default=None)
          if not title:
              title = self._html_search_regex(
                  (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                   r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
          description = self._search_regex(
              (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
               r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
              webpage, 'description', default=None)
          if not description:
              description = self._html_search_meta('description', webpage)

      self._sort_formats(formats)

      info = {
          'id': programme_id,
          'title': title,
          'description': description,
          'thumbnail': self._og_search_thumbnail(webpage, default=None),
          'duration': duration,
          'formats': formats,
          'subtitles': subtitles,
      }
      return info
  544. class BBCIE(BBCCoUkIE):
  545. IE_NAME = 'bbc'
  546. IE_DESC = 'BBC'
  547. _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
  548. _MEDIA_SETS = [
  549. 'mobile-tablet-main',
  550. 'pc',
  551. ]
  552. _TESTS = [{
  553. # article with multiple videos embedded with data-playable containing vpids
  554. 'url': 'http://www.bbc.com/news/world-europe-32668511',
  555. 'info_dict': {
  556. 'id': 'world-europe-32668511',
  557. 'title': 'Russia stages massive WW2 parade',
  558. 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
  559. },
  560. 'playlist_count': 2,
  561. }, {
  562. # article with multiple videos embedded with data-playable (more videos)
  563. 'url': 'http://www.bbc.com/news/business-28299555',
  564. 'info_dict': {
  565. 'id': 'business-28299555',
  566. 'title': 'Farnborough Airshow: Video highlights',
  567. 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
  568. },
  569. 'playlist_count': 9,
  570. 'skip': 'Save time',
  571. }, {
  572. # article with multiple videos embedded with `new SMP()`
  573. # broken
  574. 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
  575. 'info_dict': {
  576. 'id': '3662a707-0af9-3149-963f-47bea720b460',
  577. 'title': 'BUGGER',
  578. },
  579. 'playlist_count': 18,
  580. }, {
  581. # single video embedded with data-playable containing vpid
  582. 'url': 'http://www.bbc.com/news/world-europe-32041533',
  583. 'info_dict': {
  584. 'id': 'p02mprgb',
  585. 'ext': 'mp4',
  586. 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
  587. 'description': 'md5:2868290467291b37feda7863f7a83f54',
  588. 'duration': 47,
  589. 'timestamp': 1427219242,
  590. 'upload_date': '20150324',
  591. },
  592. 'params': {
  593. # rtmp download
  594. 'skip_download': True,
  595. }
  596. }, {
  597. # article with single video embedded with data-playable containing XML playlist
  598. # with direct video links as progressiveDownloadUrl (for now these are extracted)
  599. # and playlist with f4m and m3u8 as streamingUrl
  600. 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
  601. 'info_dict': {
  602. 'id': '150615_telabyad_kentin_cogu',
  603. 'ext': 'mp4',
  604. 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
  605. 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
  606. 'timestamp': 1434397334,
  607. 'upload_date': '20150615',
  608. },
  609. 'params': {
  610. 'skip_download': True,
  611. }
  612. }, {
  613. # single video embedded with data-playable containing XML playlists (regional section)
  614. 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
  615. 'info_dict': {
  616. 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
  617. 'ext': 'mp4',
  618. 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
  619. 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
  620. 'timestamp': 1434713142,
  621. 'upload_date': '20150619',
  622. },
  623. 'params': {
  624. 'skip_download': True,
  625. }
  626. }, {
  627. # single video from video playlist embedded with vxp-playlist-data JSON
  628. 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
  629. 'info_dict': {
  630. 'id': 'p02w6qjc',
  631. 'ext': 'mp4',
  632. 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  633. 'duration': 56,
  634. 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  635. },
  636. 'params': {
  637. 'skip_download': True,
  638. }
  639. }, {
  640. # single video story with digitalData
  641. 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
  642. 'info_dict': {
  643. 'id': 'p02q6gc4',
  644. 'ext': 'flv',
  645. 'title': 'Sri Lanka’s spicy secret',
  646. 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
  647. 'timestamp': 1437674293,
  648. 'upload_date': '20150723',
  649. },
  650. 'params': {
  651. # rtmp download
  652. 'skip_download': True,
  653. }
  654. }, {
  655. # single video story without digitalData
  656. 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
  657. 'info_dict': {
  658. 'id': 'p018zqqg',
  659. 'ext': 'mp4',
  660. 'title': 'Hyundai Santa Fe Sport: Rock star',
  661. 'description': 'md5:b042a26142c4154a6e472933cf20793d',
  662. 'timestamp': 1415867444,
  663. 'upload_date': '20141113',
  664. },
  665. 'params': {
  666. # rtmp download
  667. 'skip_download': True,
  668. }
  669. }, {
  670. # single video embedded with Morph
  671. 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
  672. 'info_dict': {
  673. 'id': 'p041vhd0',
  674. 'ext': 'mp4',
  675. 'title': "Nigeria v Japan - Men's First Round",
  676. 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
  677. 'duration': 7980,
  678. 'uploader': 'BBC Sport',
  679. 'uploader_id': 'bbc_sport',
  680. },
  681. 'params': {
  682. # m3u8 download
  683. 'skip_download': True,
  684. },
  685. 'skip': 'Georestricted to UK',
  686. }, {
  687. # single video with playlist.sxml URL in playlist param
  688. 'url': 'http://www.bbc.com/sport/0/football/33653409',
  689. 'info_dict': {
  690. 'id': 'p02xycnp',
  691. 'ext': 'mp4',
  692. 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
  693. 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
  694. 'duration': 140,
  695. },
  696. 'params': {
  697. # rtmp download
  698. 'skip_download': True,
  699. }
  700. }, {
  701. # article with multiple videos embedded with playlist.sxml in playlist param
  702. 'url': 'http://www.bbc.com/sport/0/football/34475836',
  703. 'info_dict': {
  704. 'id': '34475836',
  705. 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
  706. 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
  707. },
  708. 'playlist_count': 3,
  709. }, {
  710. # school report article with single video
  711. 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
  712. 'info_dict': {
  713. 'id': '35744779',
  714. 'title': 'School which breaks down barriers in Jerusalem',
  715. },
  716. 'playlist_count': 1,
  717. }, {
  718. # single video with playlist URL from weather section
  719. 'url': 'http://www.bbc.com/weather/features/33601775',
  720. 'only_matching': True,
  721. }, {
  722. # custom redirection to www.bbc.com
  723. # also, video with window.__INITIAL_DATA__
  724. 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
  725. 'info_dict': {
  726. 'id': 'p02xzws1',
  727. 'ext': 'mp4',
  728. 'title': "Pluto may have 'nitrogen glaciers'",
  729. 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
  730. 'thumbnail': r're:https?://.+/.+\.jpg',
  731. 'timestamp': 1437785037,
  732. 'upload_date': '20150725',
  733. },
  734. }, {
  735. # video with window.__INITIAL_DATA__ and value as JSON string
  736. 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
  737. 'info_dict': {
  738. 'id': 'p0b71qth',
  739. 'ext': 'mp4',
  740. 'title': 'Why France is making this woman a national hero',
  741. 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
  742. 'thumbnail': r're:https?://.+/.+\.jpg',
  743. 'timestamp': 1638230731,
  744. 'upload_date': '20211130',
  745. },
  746. }, {
  747. # single video article embedded with data-media-vpid
  748. 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
  749. 'only_matching': True,
  750. }, {
  751. # bbcthreeConfig
  752. 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
  753. 'info_dict': {
  754. 'id': 'p06556y7',
  755. 'ext': 'mp4',
  756. 'title': 'Things Not To Say to people that live on council estates',
  757. 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
  758. 'duration': 360,
  759. 'thumbnail': r're:https?://.+/.+\.jpg',
  760. },
  761. }, {
  762. # window.__PRELOADED_STATE__
  763. 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
  764. 'info_dict': {
  765. 'id': 'b0b9z4vz',
  766. 'ext': 'mp4',
  767. 'title': 'Prom 6: An American in Paris and Turangalila',
  768. 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
  769. 'uploader': 'Radio 3',
  770. 'uploader_id': 'bbc_radio_three',
  771. },
  772. }, {
  773. 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
  774. 'info_dict': {
  775. 'id': 'p06w9tws',
  776. 'ext': 'mp4',
  777. 'title': 'md5:2fabf12a726603193a2879a055f72514',
  778. 'description': 'Learn English words and phrases from this story',
  779. },
  780. 'add_ie': [BBCCoUkIE.ie_key()],
  781. }, {
  782. # BBC Reel
  783. 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
  784. 'info_dict': {
  785. 'id': 'p07c6sb9',
  786. 'ext': 'mp4',
  787. 'title': 'How positive thinking is harming your happiness',
  788. 'alt_title': 'The downsides of positive thinking',
  789. 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
  790. 'duration': 235,
  791. 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
  792. 'upload_date': '20190604',
  793. 'categories': ['Psychology'],
  794. },
  795. }]
  @classmethod
  def suitable(cls, url):
      """Tell whether this extractor should handle ``url``.

      Returns ``False`` as soon as one of the more specific BBC extractors
      listed below accepts the URL; otherwise defers to the parent class.
      """
      more_specific = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
      for ie in more_specific:
          if ie.suitable(url):
              return False
      return super(BBCIE, cls).suitable(url)
  801. def _extract_from_media_meta(self, media_meta, video_id):
  802. # Direct links to media in media metadata (e.g.
  803. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  804. # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
  805. source_files = media_meta.get('sourceFiles')
  806. if source_files:
  807. return [{
  808. 'url': f['url'],
  809. 'format_id': format_id,
  810. 'ext': f.get('encoding'),
  811. 'tbr': float_or_none(f.get('bitrate'), 1000),
  812. 'filesize': int_or_none(f.get('filesize')),
  813. } for format_id, f in source_files.items() if f.get('url')], []
  814. programme_id = media_meta.get('externalId')
  815. if programme_id:
  816. return self._download_media_selector(programme_id)
  817. # Process playlist.sxml as legacy playlist
  818. href = media_meta.get('href')
  819. if href:
  820. playlist = self._download_legacy_playlist_url(href)
  821. _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
  822. return formats, subtitles
  823. return [], []
  824. def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
  825. programme_id, title, description, duration, formats, subtitles = \
  826. self._process_legacy_playlist_url(url, playlist_id)
  827. self._sort_formats(formats)
  828. return {
  829. 'id': programme_id,
  830. 'title': title,
  831. 'description': description,
  832. 'duration': duration,
  833. 'timestamp': timestamp,
  834. 'formats': formats,
  835. 'subtitles': subtitles,
  836. }
  837. def _real_extract(self, url):
  838. playlist_id = self._match_id(url)
  839. webpage = self._download_webpage(url, playlist_id)
  840. json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
  841. timestamp = json_ld_info.get('timestamp')
  842. playlist_title = json_ld_info.get('title')
  843. if not playlist_title:
  844. playlist_title = self._og_search_title(
  845. webpage, default=None) or self._html_search_regex(
  846. r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
  847. if playlist_title:
  848. playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
  849. playlist_description = json_ld_info.get(
  850. 'description') or self._og_search_description(webpage, default=None)
  851. if not timestamp:
  852. timestamp = parse_iso8601(self._search_regex(
  853. [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
  854. r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
  855. r'"datePublished":\s*"([^"]+)'],
  856. webpage, 'date', default=None))
  857. entries = []
  858. # article with multiple videos embedded with playlist.sxml (e.g.
  859. # http://www.bbc.com/sport/0/football/34475836)
  860. playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
  861. playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
  862. if playlists:
  863. entries = [
  864. self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
  865. for playlist_url in playlists]
  866. # news article with multiple videos embedded with data-playable
  867. data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
  868. if data_playables:
  869. for _, data_playable_json in data_playables:
  870. data_playable = self._parse_json(
  871. unescapeHTML(data_playable_json), playlist_id, fatal=False)
  872. if not data_playable:
  873. continue
  874. settings = data_playable.get('settings', {})
  875. if settings:
  876. # data-playable with video vpid in settings.playlistObject.items (e.g.
  877. # http://www.bbc.com/news/world-us-canada-34473351)
  878. playlist_object = settings.get('playlistObject', {})
  879. if playlist_object:
  880. items = playlist_object.get('items')
  881. if items and isinstance(items, list):
  882. title = playlist_object['title']
  883. description = playlist_object.get('summary')
  884. duration = int_or_none(items[0].get('duration'))
  885. programme_id = items[0].get('vpid')
  886. formats, subtitles = self._download_media_selector(programme_id)
  887. self._sort_formats(formats)
  888. entries.append({
  889. 'id': programme_id,
  890. 'title': title,
  891. 'description': description,
  892. 'timestamp': timestamp,
  893. 'duration': duration,
  894. 'formats': formats,
  895. 'subtitles': subtitles,
  896. })
  897. else:
  898. # data-playable without vpid but with a playlist.sxml URLs
  899. # in otherSettings.playlist (e.g.
  900. # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
  901. playlist = data_playable.get('otherSettings', {}).get('playlist', {})
  902. if playlist:
  903. entry = None
  904. for key in ('streaming', 'progressiveDownload'):
  905. playlist_url = playlist.get('%sUrl' % key)
  906. if not playlist_url:
  907. continue
  908. try:
  909. info = self._extract_from_playlist_sxml(
  910. playlist_url, playlist_id, timestamp)
  911. if not entry:
  912. entry = info
  913. else:
  914. entry['title'] = info['title']
  915. entry['formats'].extend(info['formats'])
  916. except ExtractorError as e:
  917. # Some playlist URL may fail with 500, at the same time
  918. # the other one may work fine (e.g.
  919. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  920. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
  921. continue
  922. raise
  923. if entry:
  924. self._sort_formats(entry['formats'])
  925. entries.append(entry)
  926. if entries:
  927. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  928. # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
  929. group_id = self._search_regex(
  930. r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
  931. webpage, 'group id', default=None)
  932. if group_id:
  933. return self.url_result(
  934. 'https://www.bbc.co.uk/programmes/%s' % group_id,
  935. ie=BBCCoUkIE.ie_key())
  936. # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
  937. programme_id = self._search_regex(
  938. [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
  939. r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
  940. r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
  941. webpage, 'vpid', default=None)
  942. if programme_id:
  943. formats, subtitles = self._download_media_selector(programme_id)
  944. self._sort_formats(formats)
  945. # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
  946. digital_data = self._parse_json(
  947. self._search_regex(
  948. r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
  949. programme_id, fatal=False)
  950. page_info = digital_data.get('page', {}).get('pageInfo', {})
  951. title = page_info.get('pageName') or self._og_search_title(webpage)
  952. description = page_info.get('description') or self._og_search_description(webpage)
  953. timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
  954. return {
  955. 'id': programme_id,
  956. 'title': title,
  957. 'description': description,
  958. 'timestamp': timestamp,
  959. 'formats': formats,
  960. 'subtitles': subtitles,
  961. }
  962. # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
  963. initial_data = self._parse_json(self._html_search_regex(
  964. r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
  965. webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
  966. if initial_data:
  967. init_data = try_get(
  968. initial_data, lambda x: x['initData']['items'][0], dict) or {}
  969. smp_data = init_data.get('smpData') or {}
  970. clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
  971. version_id = clip_data.get('versionID')
  972. if version_id:
  973. title = smp_data['title']
  974. formats, subtitles = self._download_media_selector(version_id)
  975. self._sort_formats(formats)
  976. image_url = smp_data.get('holdingImageURL')
  977. display_date = init_data.get('displayDate')
  978. topic_title = init_data.get('topicTitle')
  979. return {
  980. 'id': version_id,
  981. 'title': title,
  982. 'formats': formats,
  983. 'alt_title': init_data.get('shortTitle'),
  984. 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
  985. 'description': smp_data.get('summary') or init_data.get('shortSummary'),
  986. 'upload_date': display_date.replace('-', '') if display_date else None,
  987. 'subtitles': subtitles,
  988. 'duration': int_or_none(clip_data.get('duration')),
  989. 'categories': [topic_title] if topic_title else None,
  990. }
  991. # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
  992. # There are several setPayload calls may be present but the video
  993. # seems to be always related to the first one
  994. morph_payload = self._parse_json(
  995. self._search_regex(
  996. r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
  997. webpage, 'morph payload', default='{}'),
  998. playlist_id, fatal=False)
  999. if morph_payload:
  1000. components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
  1001. for component in components:
  1002. if not isinstance(component, dict):
  1003. continue
  1004. lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
  1005. if not lead_media:
  1006. continue
  1007. identifiers = lead_media.get('identifiers')
  1008. if not identifiers or not isinstance(identifiers, dict):
  1009. continue
  1010. programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
  1011. if not programme_id:
  1012. continue
  1013. title = lead_media.get('title') or self._og_search_title(webpage)
  1014. formats, subtitles = self._download_media_selector(programme_id)
  1015. self._sort_formats(formats)
  1016. description = lead_media.get('summary')
  1017. uploader = lead_media.get('masterBrand')
  1018. uploader_id = lead_media.get('mid')
  1019. duration = None
  1020. duration_d = lead_media.get('duration')
  1021. if isinstance(duration_d, dict):
  1022. duration = parse_duration(dict_get(
  1023. duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
  1024. return {
  1025. 'id': programme_id,
  1026. 'title': title,
  1027. 'description': description,
  1028. 'duration': duration,
  1029. 'uploader': uploader,
  1030. 'uploader_id': uploader_id,
  1031. 'formats': formats,
  1032. 'subtitles': subtitles,
  1033. }
  1034. preload_state = self._parse_json(self._search_regex(
  1035. r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
  1036. 'preload state', default='{}'), playlist_id, fatal=False)
  1037. if preload_state:
  1038. current_programme = preload_state.get('programmes', {}).get('current') or {}
  1039. programme_id = current_programme.get('id')
  1040. if current_programme and programme_id and current_programme.get('type') == 'playable_item':
  1041. title = current_programme.get('titles', {}).get('tertiary') or playlist_title
  1042. formats, subtitles = self._download_media_selector(programme_id)
  1043. self._sort_formats(formats)
  1044. synopses = current_programme.get('synopses') or {}
  1045. network = current_programme.get('network') or {}
  1046. duration = int_or_none(
  1047. current_programme.get('duration', {}).get('value'))
  1048. thumbnail = None
  1049. image_url = current_programme.get('image_url')
  1050. if image_url:
  1051. thumbnail = image_url.replace('{recipe}', 'raw')
  1052. return {
  1053. 'id': programme_id,
  1054. 'title': title,
  1055. 'description': dict_get(synopses, ('long', 'medium', 'short')),
  1056. 'thumbnail': thumbnail,
  1057. 'duration': duration,
  1058. 'uploader': network.get('short_title'),
  1059. 'uploader_id': network.get('id'),
  1060. 'formats': formats,
  1061. 'subtitles': subtitles,
  1062. }
  1063. bbc3_config = self._parse_json(
  1064. self._search_regex(
  1065. r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
  1066. 'bbcthree config', default='{}'),
  1067. playlist_id, transform_source=js_to_json, fatal=False) or {}
  1068. payload = bbc3_config.get('payload') or {}
  1069. if payload:
  1070. clip = payload.get('currentClip') or {}
  1071. clip_vpid = clip.get('vpid')
  1072. clip_title = clip.get('title')
  1073. if clip_vpid and clip_title:
  1074. formats, subtitles = self._download_media_selector(clip_vpid)
  1075. self._sort_formats(formats)
  1076. return {
  1077. 'id': clip_vpid,
  1078. 'title': clip_title,
  1079. 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
  1080. 'description': clip.get('description'),
  1081. 'duration': parse_duration(clip.get('duration')),
  1082. 'formats': formats,
  1083. 'subtitles': subtitles,
  1084. }
  1085. bbc3_playlist = try_get(
  1086. payload, lambda x: x['content']['bbcMedia']['playlist'],
  1087. dict)
  1088. if bbc3_playlist:
  1089. playlist_title = bbc3_playlist.get('title') or playlist_title
  1090. thumbnail = bbc3_playlist.get('holdingImageURL')
  1091. entries = []
  1092. for bbc3_item in bbc3_playlist['items']:
  1093. programme_id = bbc3_item.get('versionID')
  1094. if not programme_id:
  1095. continue
  1096. formats, subtitles = self._download_media_selector(programme_id)
  1097. self._sort_formats(formats)
  1098. entries.append({
  1099. 'id': programme_id,
  1100. 'title': playlist_title,
  1101. 'thumbnail': thumbnail,
  1102. 'timestamp': timestamp,
  1103. 'formats': formats,
  1104. 'subtitles': subtitles,
  1105. })
  1106. return self.playlist_result(
  1107. entries, playlist_id, playlist_title, playlist_description)
  1108. initial_data = self._search_regex(
  1109. r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
  1110. 'quoted preload state', default=None)
  1111. if initial_data is None:
  1112. initial_data = self._search_regex(
  1113. r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
  1114. 'preload state', default={})
  1115. else:
  1116. initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
  1117. initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
  1118. if initial_data:
  1119. def parse_media(media):
  1120. if not media:
  1121. return
  1122. for item in (try_get(media, lambda x: x['media']['items'], list) or []):
  1123. item_id = item.get('id')
  1124. item_title = item.get('title')
  1125. if not (item_id and item_title):
  1126. continue
  1127. formats, subtitles = self._download_media_selector(item_id)
  1128. self._sort_formats(formats)
  1129. item_desc = None
  1130. blocks = try_get(media, lambda x: x['summary']['blocks'], list)
  1131. if blocks:
  1132. summary = []
  1133. for block in blocks:
  1134. text = try_get(block, lambda x: x['model']['text'], compat_str)
  1135. if text:
  1136. summary.append(text)
  1137. if summary:
  1138. item_desc = '\n\n'.join(summary)
  1139. item_time = None
  1140. for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
  1141. if try_get(meta, lambda x: x['label']) == 'Published':
  1142. item_time = unified_timestamp(meta.get('timestamp'))
  1143. break
  1144. entries.append({
  1145. 'id': item_id,
  1146. 'title': item_title,
  1147. 'thumbnail': item.get('holdingImageUrl'),
  1148. 'formats': formats,
  1149. 'subtitles': subtitles,
  1150. 'timestamp': item_time,
  1151. 'description': strip_or_none(item_desc),
  1152. })
  1153. for resp in (initial_data.get('data') or {}).values():
  1154. name = resp.get('name')
  1155. if name == 'media-experience':
  1156. parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
  1157. elif name == 'article':
  1158. for block in (try_get(resp,
  1159. (lambda x: x['data']['blocks'],
  1160. lambda x: x['data']['content']['model']['blocks'],),
  1161. list) or []):
  1162. if block.get('type') != 'media':
  1163. continue
  1164. parse_media(block.get('model'))
  1165. return self.playlist_result(
  1166. entries, playlist_id, playlist_title, playlist_description)
  1167. def extract_all(pattern):
  1168. return list(filter(None, map(
  1169. lambda s: self._parse_json(s, playlist_id, fatal=False),
  1170. re.findall(pattern, webpage))))
  1171. # Multiple video article (e.g.
  1172. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
  1173. EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
  1174. entries = []
  1175. for match in extract_all(r'new\s+SMP\(({.+?})\)'):
  1176. embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
  1177. if embed_url and re.match(EMBED_URL, embed_url):
  1178. entries.append(embed_url)
  1179. entries.extend(re.findall(
  1180. r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
  1181. if entries:
  1182. return self.playlist_result(
  1183. [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
  1184. playlist_id, playlist_title, playlist_description)
  1185. # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
  1186. medias = extract_all(r"data-media-meta='({[^']+})'")
  1187. if not medias:
  1188. # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
  1189. media_asset = self._search_regex(
  1190. r'mediaAssetPage\.init\(\s*({.+?}), "/',
  1191. webpage, 'media asset', default=None)
  1192. if media_asset:
  1193. media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
  1194. medias = []
  1195. for video in media_asset_page.get('videos', {}).values():
  1196. medias.extend(video.values())
  1197. if not medias:
  1198. # Multiple video playlist with single `now playing` entry (e.g.
  1199. # http://www.bbc.com/news/video_and_audio/must_see/33767813)
  1200. vxp_playlist = self._parse_json(
  1201. self._search_regex(
  1202. r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
  1203. webpage, 'playlist data'),
  1204. playlist_id)
  1205. playlist_medias = []
  1206. for item in vxp_playlist:
  1207. media = item.get('media')
  1208. if not media:
  1209. continue
  1210. playlist_medias.append(media)
  1211. # Download single video if found media with asset id matching the video id from URL
  1212. if item.get('advert', {}).get('assetId') == playlist_id:
  1213. medias = [media]
  1214. break
  1215. # Fallback to the whole playlist
  1216. if not medias:
  1217. medias = playlist_medias
  1218. entries = []
  1219. for num, media_meta in enumerate(medias, start=1):
  1220. formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
  1221. if not formats:
  1222. continue
  1223. self._sort_formats(formats)
  1224. video_id = media_meta.get('externalId')
  1225. if not video_id:
  1226. video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
  1227. title = media_meta.get('caption')
  1228. if not title:
  1229. title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
  1230. duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
  1231. images = []
  1232. for image in media_meta.get('images', {}).values():
  1233. images.extend(image.values())
  1234. if 'image' in media_meta:
  1235. images.append(media_meta['image'])
  1236. thumbnails = [{
  1237. 'url': image.get('href'),
  1238. 'width': int_or_none(image.get('width')),
  1239. 'height': int_or_none(image.get('height')),
  1240. } for image in images]
  1241. entries.append({
  1242. 'id': video_id,
  1243. 'title': title,
  1244. 'thumbnails': thumbnails,
  1245. 'duration': duration,
  1246. 'timestamp': timestamp,
  1247. 'formats': formats,
  1248. 'subtitles': subtitles,
  1249. })
  1250. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  1251. class BBCCoUkArticleIE(InfoExtractor):
  1252. _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
  1253. IE_NAME = 'bbc.co.uk:article'
  1254. IE_DESC = 'BBC articles'
  1255. _TEST = {
  1256. 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
  1257. 'info_dict': {
  1258. 'id': '3jNQLTMrPlYGTBn0WV6M2MS',
  1259. 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
  1260. 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
  1261. },
  1262. 'playlist_count': 4,
  1263. 'add_ie': ['BBCCoUk'],
  1264. }
  1265. def _real_extract(self, url):
  1266. playlist_id = self._match_id(url)
  1267. webpage = self._download_webpage(url, playlist_id)
  1268. title = self._og_search_title(webpage)
  1269. description = self._og_search_description(webpage).strip()
  1270. entries = [self.url_result(programme_url) for programme_url in re.findall(
  1271. r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
  1272. return self.playlist_result(entries, playlist_id, title, description)
  1273. class BBCCoUkPlaylistBaseIE(InfoExtractor):
  1274. def _entries(self, webpage, url, playlist_id):
  1275. single_page = 'page' in compat_urlparse.parse_qs(
  1276. compat_urlparse.urlparse(url).query)
  1277. for page_num in itertools.count(2):
  1278. for video_id in re.findall(
  1279. self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
  1280. yield self.url_result(
  1281. self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
  1282. if single_page:
  1283. return
  1284. next_page = self._search_regex(
  1285. r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
  1286. webpage, 'next page url', default=None, group='url')
  1287. if not next_page:
  1288. break
  1289. webpage = self._download_webpage(
  1290. compat_urlparse.urljoin(url, next_page), playlist_id,
  1291. 'Downloading page %d' % page_num, page_num)
  1292. def _real_extract(self, url):
  1293. playlist_id = self._match_id(url)
  1294. webpage = self._download_webpage(url, playlist_id)
  1295. title, description = self._extract_title_and_description(webpage)
  1296. return self.playlist_result(
  1297. self._entries(webpage, url, playlist_id),
  1298. playlist_id, title, description)
  1299. class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
  1300. _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
  1301. @staticmethod
  1302. def _get_default(episode, key, default_key='default'):
  1303. return try_get(episode, lambda x: x[key][default_key])
  1304. def _get_description(self, data):
  1305. synopsis = data.get(self._DESCRIPTION_KEY) or {}
  1306. return dict_get(synopsis, ('large', 'medium', 'small'))
  1307. def _fetch_page(self, programme_id, per_page, series_id, page):
  1308. elements = self._get_elements(self._call_api(
  1309. programme_id, per_page, page + 1, series_id))
  1310. for element in elements:
  1311. episode = self._get_episode(element)
  1312. episode_id = episode.get('id')
  1313. if not episode_id:
  1314. continue
  1315. thumbnail = None
  1316. image = self._get_episode_image(episode)
  1317. if image:
  1318. thumbnail = image.replace('{recipe}', 'raw')
  1319. category = self._get_default(episode, 'labels', 'category')
  1320. yield {
  1321. '_type': 'url',
  1322. 'id': episode_id,
  1323. 'title': self._get_episode_field(episode, 'subtitle'),
  1324. 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
  1325. 'thumbnail': thumbnail,
  1326. 'description': self._get_description(episode),
  1327. 'categories': [category] if category else None,
  1328. 'series': self._get_episode_field(episode, 'title'),
  1329. 'ie_key': BBCCoUkIE.ie_key(),
  1330. }
  1331. def _real_extract(self, url):
  1332. pid = self._match_id(url)
  1333. qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
  1334. series_id = qs.get('seriesId', [None])[0]
  1335. page = qs.get('page', [None])[0]
  1336. per_page = 36 if page else self._PAGE_SIZE
  1337. fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
  1338. entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
  1339. playlist_data = self._get_playlist_data(self._call_api(pid, 1))
  1340. return self.playlist_result(
  1341. entries, pid, self._get_playlist_title(playlist_data),
  1342. self._get_description(playlist_data))
  1343. class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
  1344. IE_NAME = 'bbc.co.uk:iplayer:episodes'
  1345. _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
  1346. _TESTS = [{
  1347. 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
  1348. 'info_dict': {
  1349. 'id': 'b05rcz9v',
  1350. 'title': 'The Disappearance',
  1351. 'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
  1352. },
  1353. 'playlist_mincount': 8,
  1354. }, {
  1355. # all seasons
  1356. 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
  1357. 'info_dict': {
  1358. 'id': 'b094m5t9',
  1359. 'title': 'Doctor Foster',
  1360. 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
  1361. },
  1362. 'playlist_mincount': 10,
  1363. }, {
  1364. # explicit season
  1365. 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
  1366. 'info_dict': {
  1367. 'id': 'b094m5t9',
  1368. 'title': 'Doctor Foster',
  1369. 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
  1370. },
  1371. 'playlist_mincount': 5,
  1372. }, {
  1373. # all pages
  1374. 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
  1375. 'info_dict': {
  1376. 'id': 'm0004c4v',
  1377. 'title': 'Beechgrove',
  1378. 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
  1379. },
  1380. 'playlist_mincount': 37,
  1381. }, {
  1382. # explicit page
  1383. 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
  1384. 'info_dict': {
  1385. 'id': 'm0004c4v',
  1386. 'title': 'Beechgrove',
  1387. 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
  1388. },
  1389. 'playlist_mincount': 1,
  1390. }]
  1391. _PAGE_SIZE = 100
  1392. _DESCRIPTION_KEY = 'synopsis'
  1393. def _get_episode_image(self, episode):
  1394. return self._get_default(episode, 'image')
  1395. def _get_episode_field(self, episode, field):
  1396. return self._get_default(episode, field)
  1397. @staticmethod
  1398. def _get_elements(data):
  1399. return data['entities']['results']
  1400. @staticmethod
  1401. def _get_episode(element):
  1402. return element.get('episode') or {}
  1403. def _call_api(self, pid, per_page, page=1, series_id=None):
  1404. variables = {
  1405. 'id': pid,
  1406. 'page': page,
  1407. 'perPage': per_page,
  1408. }
  1409. if series_id:
  1410. variables['sliceId'] = series_id
  1411. return self._download_json(
  1412. 'https://graph.ibl.api.bbc.co.uk/', pid, headers={
  1413. 'Content-Type': 'application/json'
  1414. }, data=json.dumps({
  1415. 'id': '5692d93d5aac8d796a0305e895e61551',
  1416. 'variables': variables,
  1417. }).encode('utf-8'))['data']['programme']
  1418. @staticmethod
  1419. def _get_playlist_data(data):
  1420. return data
  1421. def _get_playlist_title(self, data):
  1422. return self._get_default(data, 'title')
  1423. class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
  1424. IE_NAME = 'bbc.co.uk:iplayer:group'
  1425. _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
  1426. _TESTS = [{
  1427. # Available for over a year unlike 30 days for most other programmes
  1428. 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
  1429. 'info_dict': {
  1430. 'id': 'p02tcc32',
  1431. 'title': 'Bohemian Icons',
  1432. 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
  1433. },
  1434. 'playlist_mincount': 10,
  1435. }, {
  1436. # all pages
  1437. 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
  1438. 'info_dict': {
  1439. 'id': 'p081d7j7',
  1440. 'title': 'Music in Scotland',
  1441. 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
  1442. },
  1443. 'playlist_mincount': 47,
  1444. }, {
  1445. # explicit page
  1446. 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
  1447. 'info_dict': {
  1448. 'id': 'p081d7j7',
  1449. 'title': 'Music in Scotland',
  1450. 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
  1451. },
  1452. 'playlist_mincount': 11,
  1453. }]
  1454. _PAGE_SIZE = 200
  1455. _DESCRIPTION_KEY = 'synopses'
  1456. def _get_episode_image(self, episode):
  1457. return self._get_default(episode, 'images', 'standard')
  1458. def _get_episode_field(self, episode, field):
  1459. return episode.get(field)
  1460. @staticmethod
  1461. def _get_elements(data):
  1462. return data['elements']
  1463. @staticmethod
  1464. def _get_episode(element):
  1465. return element
  1466. def _call_api(self, pid, per_page, page=1, series_id=None):
  1467. return self._download_json(
  1468. 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
  1469. pid, query={
  1470. 'page': page,
  1471. 'per_page': per_page,
  1472. })['group_episodes']
  1473. @staticmethod
  1474. def _get_playlist_data(data):
  1475. return data['group']
  1476. def _get_playlist_title(self, data):
  1477. return data.get('title')
  1478. class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
  1479. IE_NAME = 'bbc.co.uk:playlist'
  1480. _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
  1481. _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
  1482. _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
  1483. _TESTS = [{
  1484. 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
  1485. 'info_dict': {
  1486. 'id': 'b05rcz9v',
  1487. 'title': 'The Disappearance - Clips - BBC Four',
  1488. 'description': 'French thriller serial about a missing teenager.',
  1489. },
  1490. 'playlist_mincount': 7,
  1491. }, {
  1492. # multipage playlist, explicit page
  1493. 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
  1494. 'info_dict': {
  1495. 'id': 'b00mfl7n',
  1496. 'title': 'Frozen Planet - Clips - BBC One',
  1497. 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
  1498. },
  1499. 'playlist_mincount': 24,
  1500. }, {
  1501. # multipage playlist, all pages
  1502. 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
  1503. 'info_dict': {
  1504. 'id': 'b00mfl7n',
  1505. 'title': 'Frozen Planet - Clips - BBC One',
  1506. 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
  1507. },
  1508. 'playlist_mincount': 142,
  1509. }, {
  1510. 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
  1511. 'only_matching': True,
  1512. }, {
  1513. 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
  1514. 'only_matching': True,
  1515. }, {
  1516. 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
  1517. 'only_matching': True,
  1518. }]
  1519. def _extract_title_and_description(self, webpage):
  1520. title = self._og_search_title(webpage, fatal=False)
  1521. description = self._og_search_description(webpage)
  1522. return title, description