logo

oasis-root

Compiled tree of Oasis Linux based on own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

archiveorg.py (47378B)


  1. from __future__ import annotations
  2. import json
  3. import re
  4. import urllib.parse
  5. from .common import InfoExtractor
  6. from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
  7. from ..networking import HEADRequest
  8. from ..networking.exceptions import HTTPError
  9. from ..utils import (
  10. KNOWN_EXTENSIONS,
  11. ExtractorError,
  12. bug_reports_message,
  13. clean_html,
  14. dict_get,
  15. extract_attributes,
  16. get_element_by_id,
  17. int_or_none,
  18. join_nonempty,
  19. js_to_json,
  20. merge_dicts,
  21. mimetype2ext,
  22. orderedSet,
  23. parse_duration,
  24. parse_qs,
  25. str_or_none,
  26. str_to_int,
  27. traverse_obj,
  28. try_get,
  29. unified_strdate,
  30. unified_timestamp,
  31. url_or_none,
  32. urlhandle_detect_ext,
  33. variadic,
  34. )
  35. class ArchiveOrgIE(InfoExtractor):
  36. IE_NAME = 'archive.org'
  37. IE_DESC = 'archive.org video and audio'
  38. _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
  39. _TESTS = [{
  40. 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  41. 'md5': '8af1d4cf447933ed3c7f4871162602db',
  42. 'info_dict': {
  43. 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  44. 'ext': 'ogv',
  45. 'title': '1968 Demo - FJCC Conference Presentation Reel #1',
  46. 'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
  47. 'release_date': '19681210',
  48. 'timestamp': 1268695290,
  49. 'upload_date': '20100315',
  50. 'creators': ['SRI International'],
  51. 'uploader': 'laura@archive.org',
  52. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  53. 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr',
  54. 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect',
  55. },
  56. }, {
  57. 'url': 'https://archive.org/details/Cops1922',
  58. 'md5': '0869000b4ce265e8ca62738b336b268a',
  59. 'info_dict': {
  60. 'id': 'Cops1922',
  61. 'ext': 'mp4',
  62. 'title': 'Buster Keaton\'s "Cops" (1922)',
  63. 'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca',
  64. 'uploader': 'yorkmba99@hotmail.com',
  65. 'timestamp': 1387699629,
  66. 'upload_date': '20131222',
  67. 'display_id': 'Cops-v2.mp4',
  68. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  69. 'duration': 1091.96,
  70. },
  71. }, {
  72. 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  73. 'only_matching': True,
  74. }, {
  75. 'url': 'https://archive.org/details/Election_Ads',
  76. 'md5': 'eec5cddebd4793c6a653b69c3b11f2e6',
  77. 'info_dict': {
  78. 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  79. 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  80. 'ext': 'mpg',
  81. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  82. 'duration': 59.77,
  83. 'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  84. },
  85. }, {
  86. 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  87. 'md5': 'ea1eed8234e7d4165f38c8c769edef38',
  88. 'info_dict': {
  89. 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  90. 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  91. 'ext': 'mpg',
  92. 'timestamp': 1205588045,
  93. 'uploader': 'mikedavisstripmaster@yahoo.com',
  94. 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
  95. 'upload_date': '20080315',
  96. 'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  97. 'duration': 59.51,
  98. 'license': 'http://creativecommons.org/licenses/publicdomain/',
  99. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  100. },
  101. }, {
  102. 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
  103. 'md5': '7d07ffb42aba6537c28e053efa4b54c9',
  104. 'info_dict': {
  105. 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
  106. 'title': 'Turning',
  107. 'ext': 'flac',
  108. 'track': 'Turning',
  109. 'creators': ['Grateful Dead'],
  110. 'display_id': 'gd1977-05-08d01t01.flac',
  111. 'track_number': 1,
  112. 'album': '1977-05-08 - Barton Hall - Cornell University',
  113. 'duration': 39.8,
  114. },
  115. }, {
  116. 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
  117. 'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
  118. 'info_dict': {
  119. 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
  120. 'title': 'Deal',
  121. 'ext': 'flac',
  122. 'timestamp': 1205895624,
  123. 'uploader': 'mvernon54@yahoo.com',
  124. 'description': 'md5:6c921464414814720c6593810a5c7e3d',
  125. 'upload_date': '20080319',
  126. 'location': 'Barton Hall - Cornell University',
  127. 'duration': 438.68,
  128. 'track': 'Deal',
  129. 'creators': ['Grateful Dead'],
  130. 'album': '1977-05-08 - Barton Hall - Cornell University',
  131. 'release_date': '19770508',
  132. 'display_id': 'gd1977-05-08d01t07.flac',
  133. 'track_number': 7,
  134. },
  135. }, {
  136. # FIXME: give a better error message than just IndexError when all available formats are restricted
  137. 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
  138. 'md5': '7cb019baa9b332e82ea7c10403acd180',
  139. 'info_dict': {
  140. 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
  141. 'title': 'Bells Of Rostov',
  142. 'ext': 'mp3',
  143. },
  144. 'skip': 'restricted',
  145. }, {
  146. 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
  147. 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
  148. 'info_dict': {
  149. 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
  150. 'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
  151. 'ext': 'mp3',
  152. 'timestamp': 1569662587,
  153. 'uploader': 'associate-joygen-odiongan@archive.org',
  154. 'description': 'md5:012b2d668ae753be36896f343d12a236',
  155. 'upload_date': '20190928',
  156. },
  157. 'skip': 'restricted',
  158. }, {
  159. # Original formats are private
  160. 'url': 'https://archive.org/details/irelandthemakingofarepublic',
  161. 'info_dict': {
  162. 'id': 'irelandthemakingofarepublic',
  163. 'title': 'Ireland: The Making of a Republic',
  164. 'upload_date': '20160610',
  165. 'description': 'md5:f70956a156645a658a0dc9513d9e78b7',
  166. 'uploader': 'dimitrios@archive.org',
  167. 'creators': ['British Broadcasting Corporation', 'Time-Life Films'],
  168. 'timestamp': 1465594947,
  169. },
  170. 'playlist': [
  171. {
  172. 'md5': '0b211261b26590d49df968f71b90690d',
  173. 'info_dict': {
  174. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov',
  175. 'ext': 'mp4',
  176. 'title': 'irelandthemakingofarepublicreel1_01.mov',
  177. 'duration': 130.46,
  178. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
  179. 'display_id': 'irelandthemakingofarepublicreel1_01.mov',
  180. },
  181. }, {
  182. 'md5': '67335ee3b23a0da930841981c1e79b02',
  183. 'info_dict': {
  184. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov',
  185. 'ext': 'mp4',
  186. 'duration': 1395.13,
  187. 'title': 'irelandthemakingofarepublicreel1_02.mov',
  188. 'display_id': 'irelandthemakingofarepublicreel1_02.mov',
  189. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
  190. },
  191. }, {
  192. 'md5': 'e470e86787893603f4a341a16c281eb5',
  193. 'info_dict': {
  194. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov',
  195. 'ext': 'mp4',
  196. 'duration': 1602.67,
  197. 'title': 'irelandthemakingofarepublicreel2.mov',
  198. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
  199. 'display_id': 'irelandthemakingofarepublicreel2.mov',
  200. },
  201. },
  202. ],
  203. }, {
  204. # The reviewbody is None for one of the reviews; just need to extract data without crashing
  205. 'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
  206. 'info_dict': {
  207. 'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
  208. 'ext': 'mp3',
  209. 'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
  210. 'creators': ['Grateful Dead'],
  211. 'duration': 338.31,
  212. 'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
  213. 'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
  214. 'display_id': 'gd95-04-02d1t04.shn',
  215. 'location': 'Pyramid Arena',
  216. 'uploader': 'jon@archive.org',
  217. 'album': '1995-04-02 - Pyramid Arena',
  218. 'upload_date': '20040519',
  219. 'track_number': 4,
  220. 'release_date': '19950402',
  221. 'timestamp': 1084927901,
  222. },
  223. }]
  224. @staticmethod
  225. def _playlist_data(webpage):
  226. element = re.findall(r'''(?xs)
  227. <input
  228. (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  229. \s+class=['"]?js-play8-playlist['"]?
  230. (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  231. \s*/>
  232. ''', webpage)[0]
  233. return json.loads(extract_attributes(element)['value'])
  234. def _real_extract(self, url):
  235. video_id = urllib.parse.unquote_plus(self._match_id(url))
  236. identifier, _, entry_id = video_id.partition('/')
  237. # Archive.org metadata API doesn't clearly demarcate playlist entries
  238. # or subtitle tracks, so we get them from the embeddable player.
  239. embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
  240. playlist = self._playlist_data(embed_page)
  241. entries = {}
  242. for p in playlist:
  243. # If the user specified a playlist entry in the URL, ignore the
  244. # rest of the playlist.
  245. if entry_id and p['orig'] != entry_id:
  246. continue
  247. entries[p['orig']] = {
  248. 'formats': [],
  249. 'thumbnails': [],
  250. 'artist': p.get('artist'),
  251. 'track': p.get('title'),
  252. 'subtitles': {},
  253. }
  254. for track in p.get('tracks', []):
  255. if track['kind'] != 'subtitles':
  256. continue
  257. entries[p['orig']][track['label']] = {
  258. 'url': 'https://archive.org/' + track['file'].lstrip('/'),
  259. }
  260. metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
  261. m = metadata['metadata']
  262. identifier = m['identifier']
  263. info = {
  264. 'id': identifier,
  265. 'title': m['title'],
  266. 'description': clean_html(m.get('description')),
  267. 'uploader': dict_get(m, ['uploader', 'adder']),
  268. 'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
  269. 'license': m.get('licenseurl'),
  270. 'release_date': unified_strdate(m.get('date')),
  271. 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
  272. 'webpage_url': f'https://archive.org/details/{identifier}',
  273. 'location': m.get('venue'),
  274. 'release_year': int_or_none(m.get('year'))}
  275. for f in metadata['files']:
  276. if f['name'] in entries:
  277. entries[f['name']] = merge_dicts(entries[f['name']], {
  278. 'id': identifier + '/' + f['name'],
  279. 'title': f.get('title') or f['name'],
  280. 'display_id': f['name'],
  281. 'description': clean_html(f.get('description')),
  282. 'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
  283. 'duration': parse_duration(f.get('length')),
  284. 'track_number': int_or_none(f.get('track')),
  285. 'album': f.get('album'),
  286. 'discnumber': int_or_none(f.get('disc')),
  287. 'release_year': int_or_none(f.get('year'))})
  288. entry = entries[f['name']]
  289. elif traverse_obj(f, 'original', expected_type=str) in entries:
  290. entry = entries[f['original']]
  291. else:
  292. continue
  293. if f.get('format') == 'Thumbnail':
  294. entry['thumbnails'].append({
  295. 'id': f['name'],
  296. 'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
  297. 'width': int_or_none(f.get('width')),
  298. 'height': int_or_none(f.get('width')),
  299. 'filesize': int_or_none(f.get('size'))})
  300. _, has_ext, extension = f['name'].rpartition('.')
  301. if not has_ext:
  302. extension = None
  303. # We don't want to skip private formats if the user has access to them,
  304. # however without access to an account with such privileges we can't implement/test this.
  305. # For now to be safe, we will only skip them if there is no user logged in.
  306. is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
  307. if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
  308. entry['formats'].append({
  309. 'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']),
  310. 'format': f.get('format'),
  311. 'width': int_or_none(f.get('width')),
  312. 'height': int_or_none(f.get('height')),
  313. 'filesize': int_or_none(f.get('size')),
  314. 'protocol': 'https',
  315. 'source_preference': 0 if f.get('source') == 'original' else -1,
  316. 'format_note': f.get('source'),
  317. })
  318. for entry in entries.values():
  319. entry['_format_sort_fields'] = ('source', )
  320. if len(entries) == 1:
  321. # If there's only one item, use it as the main info dict
  322. only_video = next(iter(entries.values()))
  323. if entry_id:
  324. info = merge_dicts(only_video, info)
  325. else:
  326. info = merge_dicts(info, only_video)
  327. else:
  328. # Otherwise, we have a playlist.
  329. info['_type'] = 'playlist'
  330. info['entries'] = list(entries.values())
  331. if metadata.get('reviews'):
  332. info['comments'] = []
  333. for review in metadata['reviews']:
  334. info['comments'].append({
  335. 'id': review.get('review_id'),
  336. 'author': review.get('reviewer'),
  337. 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
  338. 'timestamp': unified_timestamp(review.get('createdate')),
  339. 'parent': 'root'})
  340. return info
  341. class YoutubeWebArchiveIE(InfoExtractor):
  342. IE_NAME = 'web.archive:youtube'
  343. IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
  344. _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
  345. (?:https?://)?web\.archive\.org/
  346. (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
  347. (?:https?(?::|%3[Aa])//)?(?:
  348. (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
  349. |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
  350. )
  351. )(?P<id>[0-9A-Za-z_-]{11})
  352. (?(prefix)
  353. (?::(?P<date2>[0-9]{14}))?$|
  354. (?:%26|[#&]|$)
  355. )'''
  356. _TESTS = [
  357. {
  358. 'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
  359. 'info_dict': {
  360. 'id': 'aYAGB11YrSs',
  361. 'ext': 'webm',
  362. 'title': 'Team Fortress 2 - Sandviches!',
  363. 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
  364. 'upload_date': '20110926',
  365. 'uploader': 'Zeurel',
  366. 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
  367. 'duration': 32,
  368. 'uploader_id': 'Zeurel',
  369. 'uploader_url': 'https://www.youtube.com/user/Zeurel',
  370. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  371. 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg',
  372. },
  373. }, {
  374. # Internal link
  375. 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
  376. 'info_dict': {
  377. 'id': '97t7Xj_iBv0',
  378. 'ext': 'mp4',
  379. 'title': 'Why Machines That Bend Are Better',
  380. 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
  381. 'upload_date': '20190312',
  382. 'uploader': 'Veritasium',
  383. 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
  384. 'duration': 771,
  385. 'uploader_id': '1veritasium',
  386. 'uploader_url': 'https://www.youtube.com/user/1veritasium',
  387. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  388. 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA',
  389. },
  390. }, {
  391. # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
  392. # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description
  393. 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
  394. 'info_dict': {
  395. 'id': 'AkhihxRKcrs',
  396. 'ext': 'webm',
  397. 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
  398. 'upload_date': '20120712',
  399. 'duration': 398,
  400. 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
  401. 'uploader_id': 'machinima',
  402. 'uploader_url': 'https://www.youtube.com/user/machinima',
  403. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  404. 'uploader': 'machinima',
  405. },
  406. }, {
  407. # FLV video. Video file URL does not provide itag information
  408. 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
  409. 'info_dict': {
  410. 'id': 'jNQXAC9IVRw',
  411. 'ext': 'flv',
  412. 'title': 'Me at the zoo',
  413. 'upload_date': '20050423',
  414. 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
  415. 'duration': 19,
  416. 'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
  417. 'uploader_id': 'jawed',
  418. 'uploader_url': 'https://www.youtube.com/user/jawed',
  419. 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A',
  420. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  421. 'uploader': 'jawed',
  422. },
  423. }, {
  424. 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
  425. 'info_dict': {
  426. 'id': 'lTx3G6h2xyA',
  427. 'ext': 'flv',
  428. 'title': 'Madeon - Pop Culture (live mashup)',
  429. 'upload_date': '20110711',
  430. 'uploader': 'Madeon',
  431. 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
  432. 'duration': 204,
  433. 'description': 'md5:f7535343b6eda34a314eff8b85444680',
  434. 'uploader_id': 'itsmadeon',
  435. 'uploader_url': 'https://www.youtube.com/user/itsmadeon',
  436. 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w',
  437. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  438. },
  439. }, {
  440. # First capture is of dead video, second is the oldest from CDX response.
  441. 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
  442. 'info_dict': {
  443. 'id': '1JYutPM8O6E',
  444. 'ext': 'mp4',
  445. 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
  446. 'upload_date': '20160218',
  447. 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
  448. 'duration': 1235,
  449. 'description': 'md5:21032bae736421e89c2edf36d1936947',
  450. 'uploader_id': 'MachinimaETC',
  451. 'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
  452. 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
  453. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  454. 'uploader': 'ETC News',
  455. },
  456. }, {
  457. # First capture of dead video, capture date in link links to dead capture.
  458. 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
  459. 'info_dict': {
  460. 'id': '6FPhZJGvf4E',
  461. 'ext': 'mp4',
  462. 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
  463. 'upload_date': '20160219',
  464. 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
  465. 'duration': 797,
  466. 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
  467. 'uploader_id': 'MachinimaETC',
  468. 'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
  469. 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
  470. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  471. 'uploader': 'ETC News',
  472. },
  473. 'expected_warnings': [
  474. r'unable to download capture webpage \(it may not be archived\)',
  475. ],
  476. }, { # Very old YouTube page, has - YouTube in title.
  477. 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
  478. 'info_dict': {
  479. 'id': '-06-KB9XTzg',
  480. 'ext': 'flv',
  481. 'title': 'New Coin Hack!! 100% Safe!!',
  482. },
  483. }, {
  484. 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
  485. 'info_dict': {
  486. 'id': 'dWW7qP423y8',
  487. 'ext': 'mp4',
  488. 'title': 'It\'s Bootleg AirPods Time.',
  489. 'upload_date': '20211021',
  490. 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
  491. 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
  492. 'duration': 810,
  493. 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
  494. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  495. 'uploader': 'DankPods',
  496. },
  497. }, {
  498. # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
  499. 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
  500. 'info_dict': {
  501. 'id': '6Dh-RL__uN4',
  502. 'ext': 'mp4',
  503. 'title': 'bitch lasagna',
  504. 'upload_date': '20181005',
  505. 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
  506. 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
  507. 'duration': 135,
  508. 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
  509. 'uploader': 'PewDiePie',
  510. 'uploader_id': 'PewDiePie',
  511. 'uploader_url': 'https://www.youtube.com/user/PewDiePie',
  512. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  513. },
  514. }, {
  515. # ~June 2010 Capture. swfconfig
  516. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y',
  517. 'info_dict': {
  518. 'id': '8XeW5ilk-9Y',
  519. 'ext': 'flv',
  520. 'title': 'Story of Stuff, The Critique Part 4 of 4',
  521. 'duration': 541,
  522. 'description': 'md5:28157da06f2c5e94c97f7f3072509972',
  523. 'uploader': 'HowTheWorldWorks',
  524. 'uploader_id': 'HowTheWorldWorks',
  525. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  526. 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
  527. 'upload_date': '20090520',
  528. },
  529. }, {
  530. # Jan 2011: watch-video-date/eow-date surrounded by whitespace
  531. 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
  532. 'info_dict': {
  533. 'id': 'Q_yjX80U7Yc',
  534. 'ext': 'flv',
  535. 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
  536. 'uploader_id': 'claybutlermusic',
  537. 'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
  538. 'upload_date': '20090803',
  539. 'uploader': 'claybutlermusic',
  540. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  541. 'duration': 132,
  542. 'uploader_url': 'https://www.youtube.com/user/claybutlermusic',
  543. },
  544. }, {
  545. # ~May 2009 swfArgs. ytcfg is spread out over various vars
  546. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY',
  547. 'info_dict': {
  548. 'id': 'c5uJgG05xUY',
  549. 'ext': 'webm',
  550. 'title': 'Story of Stuff, The Critique Part 1 of 4',
  551. 'uploader_id': 'HowTheWorldWorks',
  552. 'uploader': 'HowTheWorldWorks',
  553. 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
  554. 'upload_date': '20090513',
  555. 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0',
  556. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  557. 'duration': 754,
  558. },
  559. }, {
  560. # ~June 2012. Upload date is in another lang so cannot extract.
  561. 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA',
  562. 'info_dict': {
  563. 'id': 'xWTLLl-dQaA',
  564. 'ext': 'mp4',
  565. 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)',
  566. 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy',
  567. 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e',
  568. 'uploader_id': 'BlackNerdComedy',
  569. 'uploader': 'BlackNerdComedy',
  570. 'duration': 182,
  571. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  572. },
  573. }, {
  574. # ~July 2013
  575. 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM',
  576. 'info_dict': {
  577. 'id': '9eO1aasHyTM',
  578. 'ext': 'mp4',
  579. 'title': 'Polar-oid',
  580. 'description': 'Cameras and bears are dangerous!',
  581. 'uploader_url': 'https://www.youtube.com/user/punkybird',
  582. 'uploader_id': 'punkybird',
  583. 'duration': 202,
  584. 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ',
  585. 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ',
  586. 'upload_date': '20060428',
  587. 'uploader': 'punkybird',
  588. },
  589. }, {
  590. # April 2020: Player response in player config
  591. 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en',
  592. 'info_dict': {
  593. 'id': 'Cf7vS8jc7dY',
  594. 'ext': 'mp4',
  595. 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated',
  596. 'duration': 64,
  597. 'upload_date': '20200408',
  598. 'uploader_id': 'GameGrumps',
  599. 'uploader': 'GameGrumps',
  600. 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ',
  601. 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ',
  602. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  603. 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341',
  604. 'uploader_url': 'https://www.youtube.com/user/GameGrumps',
  605. },
  606. }, {
  607. # watch7-user-header with yt-user-info
  608. 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057',
  609. 'info_dict': {
  610. 'id': 'kbh4T_b4Ixw',
  611. 'ext': 'mp4',
  612. 'title': 'Shovel Knight OST - Strike the Earth! Plains of Passage 16 bit SNES style remake / remix',
  613. 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA',
  614. 'uploader': 'Nelward music',
  615. 'duration': 213,
  616. 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d',
  617. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  618. 'upload_date': '20150503',
  619. 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA',
  620. },
  621. }, {
  622. # April 2012
  623. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU',
  624. 'info_dict': {
  625. 'id': 'SOm7mPoPskU',
  626. 'ext': 'mp4',
  627. 'title': 'Boyfriend - Justin Bieber Parody',
  628. 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01',
  629. 'uploader': 'thecomputernerd01',
  630. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  631. 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491',
  632. 'duration': 200,
  633. 'upload_date': '20120407',
  634. 'uploader_id': 'thecomputernerd01',
  635. },
  636. }, {
  637. 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
  638. 'only_matching': True,
  639. }, {
  640. 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
  641. 'only_matching': True,
  642. }, {
  643. # Video not archived, only capture is unavailable video page
  644. 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
  645. 'only_matching': True,
  646. }, { # Encoded url
  647. 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
  648. 'only_matching': True,
  649. }, {
  650. 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
  651. 'only_matching': True,
  652. }, {
  653. 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&amp;search=soccer',
  654. 'only_matching': True,
  655. }, {
  656. 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
  657. 'only_matching': True,
  658. }, {
  659. 'url': 'ytarchive:BaW_jenozKc:20050214000000',
  660. 'only_matching': True,
  661. }, {
  662. 'url': 'ytarchive:BaW_jenozKc',
  663. 'only_matching': True,
  664. },
  665. ]
    # Pattern locating the initial-data JSON; reused as-is from the YouTube extractor
    _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
    # Pattern locating the player-response JSON: either an assignment to
    # `ytInitialPlayerResponse` / `window["ytInitialPlayerResponse"]`, or
    # whatever the YouTube extractor's own pattern matches
    _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x:
        (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
        {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}
    )'''
    _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com']  # thumbnails most likely archived on these servers
    # De-duplicated list: the default servers first, then img.youtube.com, then
    # {i,s}.ytimg.com and {i,s}<n>.ytimg.com for n in 1-4 and 9 (n == 0 yields no suffix)
    _YT_ALL_THUMB_SERVERS = orderedSet(
        [*_YT_DEFAULT_THUMB_SERVERS, 'img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(5), 9)]])
    # Template for capture URLs; %s takes the capture date.
    # NOTE(review): the `if_` suffix presumably requests the capture without the Wayback toolbar - confirm
    _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
    # 14-digit bounds, same width as the date group in _VALID_URL (presumably YYYYMMDDhhmmss - confirm)
    _OLDEST_CAPTURE_DATE = 20050214000000
    _NEWEST_CAPTURE_DATE = 20500101000000
  677. def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False):
  678. # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
  679. query = {
  680. 'url': url,
  681. 'output': 'json',
  682. 'fl': 'original,mimetype,length,timestamp',
  683. 'limit': 500,
  684. 'filter': ['statuscode:200'] + (filters or []),
  685. 'collapse': collapse or [],
  686. **(query or {}),
  687. }
  688. res = self._download_json(
  689. 'https://web.archive.org/cdx/search/cdx', item_id,
  690. note or 'Downloading CDX API JSON', query=query, fatal=fatal)
  691. if isinstance(res, list) and len(res) >= 2:
  692. # format response to make it easier to use
  693. return [dict(zip(res[0], v)) for v in res[1:]]
  694. elif not isinstance(res, list) or len(res) != 0:
  695. self.report_warning('Error while parsing CDX API response' + bug_reports_message())
  696. def _extract_webpage_title(self, webpage):
  697. page_title = self._html_extract_title(webpage, default='')
  698. # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
  699. return self._html_search_regex(
  700. r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
  701. page_title, 'title', default='')
  702. def _extract_metadata(self, video_id, webpage):
  703. search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
  704. player_response = self._search_json(
  705. self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
  706. video_id, default={})
  707. initial_data = self._search_json(
  708. self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={})
  709. ytcfg = {}
  710. for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010
  711. ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {})
  712. # XXX: this also may contain a 'ptchn' key
  713. player_config = (
  714. self._search_json(
  715. r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=',
  716. webpage, 'player config', video_id, default=None)
  717. or ytcfg.get('PLAYER_CONFIG') or {})
  718. # XXX: this may also contain a 'creator' key.
  719. swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={})
  720. if swf_args and not traverse_obj(player_config, ('args',)):
  721. player_config['args'] = swf_args
  722. if not player_response:
  723. # April 2020
  724. player_response = self._parse_json(
  725. traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False)
  726. initial_data_video = traverse_obj(
  727. initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
  728. expected_type=dict, get_all=False, default={})
  729. video_details = traverse_obj(
  730. player_response, 'videoDetails', expected_type=dict, get_all=False, default={})
  731. microformats = traverse_obj(
  732. player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})
  733. video_title = (
  734. video_details.get('title')
  735. or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
  736. or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
  737. or traverse_obj(player_config, ('args', 'title'))
  738. or self._extract_webpage_title(webpage)
  739. or search_meta(['og:title', 'twitter:title', 'title']))
  740. def id_from_url(url, type_):
  741. return self._search_regex(
  742. rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None)
  743. # XXX: would the get_elements_by_... functions be better suited here?
  744. _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"'
  745. uploader_or_channel_url = self._search_regex(
  746. [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024
  747. fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012
  748. webpage, 'uploader or channel url', default=None)
  749. owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2
  750. # Uploader refers to the /user/ id ONLY
  751. uploader_id = (
  752. id_from_url(owner_profile_url, 'user')
  753. or id_from_url(uploader_or_channel_url, 'user')
  754. or ytcfg.get('VIDEO_USERNAME'))
  755. uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None
  756. # XXX: do we want to differentiate uploader and channel?
  757. uploader = (
  758. self._search_regex(
  759. [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>', # June 2010
  760. r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009
  761. r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a', # ~May 2009
  762. r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'], # ~June 2012
  763. webpage, 'uploader', default=None)
  764. or self._html_search_regex(
  765. [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a', # March 2016
  766. r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'], # july 2013
  767. get_element_by_id('watch7-user-header', webpage), 'uploader', default=None)
  768. or self._html_search_regex(
  769. r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<', # April 2012
  770. get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None)
  771. or traverse_obj(player_config, ('args', 'creator'))
  772. or video_details.get('author'))
  773. channel_id = str_or_none(
  774. video_details.get('channelId')
  775. or microformats.get('externalChannelId')
  776. or search_meta('channelId')
  777. or self._search_regex(
  778. r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6
  779. webpage, 'channel id', default=None, group='id')
  780. or id_from_url(owner_profile_url, 'channel')
  781. or id_from_url(uploader_or_channel_url, 'channel')
  782. or traverse_obj(player_config, ('args', 'ucid')))
  783. channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None
  784. duration = int_or_none(
  785. video_details.get('lengthSeconds')
  786. or microformats.get('lengthSeconds')
  787. or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False)
  788. or parse_duration(search_meta('duration')))
  789. description = (
  790. video_details.get('shortDescription')
  791. or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
  792. or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23
  793. or search_meta(['description', 'og:description', 'twitter:description']))
  794. upload_date = unified_strdate(
  795. dict_get(microformats, ('uploadDate', 'publishDate'))
  796. or search_meta(['uploadDate', 'datePublished'])
  797. or self._search_regex(
  798. [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>',
  799. r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520
  800. r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively)
  801. webpage, 'upload date', default=None))
  802. return {
  803. 'title': video_title,
  804. 'description': description,
  805. 'upload_date': upload_date,
  806. 'uploader': uploader,
  807. 'channel_id': channel_id,
  808. 'channel_url': channel_url,
  809. 'duration': duration,
  810. 'uploader_url': uploader_url,
  811. 'uploader_id': uploader_id,
  812. }
  813. def _extract_thumbnails(self, video_id):
  814. try_all = 'thumbnails' in self._configuration_arg('check_all')
  815. thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format(
  816. webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server)
  817. for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))]
  818. thumbnails = []
  819. for url in thumbnail_base_urls:
  820. response = self._call_cdx_api(
  821. video_id, url, filters=['mimetype:image/(?:webp|jpeg)'],
  822. collapse=['urlkey'], query={'matchType': 'prefix'})
  823. if not response:
  824. continue
  825. thumbnails.extend(
  826. {
  827. 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'),
  828. 'filesize': int_or_none(thumbnail_dict.get('length')),
  829. 'preference': int_or_none(thumbnail_dict.get('length')),
  830. } for thumbnail_dict in response)
  831. if not try_all:
  832. break
  833. self._remove_duplicate_formats(thumbnails)
  834. return thumbnails
  835. def _get_capture_dates(self, video_id, url_date):
  836. capture_dates = []
  837. # Note: CDX API will not find watch pages with extra params in the url.
  838. response = self._call_cdx_api(
  839. video_id, f'https://www.youtube.com/watch?v={video_id}',
  840. filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []
  841. all_captures = sorted(int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None)
  842. # Prefer the new polymer UI captures as we support extracting more metadata from them
  843. # WBM captures seem to all switch to this layout ~July 2020
  844. modern_captures = [x for x in all_captures if x >= 20200701000000]
  845. if modern_captures:
  846. capture_dates.append(modern_captures[0])
  847. capture_dates.append(url_date)
  848. if all_captures:
  849. capture_dates.append(all_captures[0])
  850. if 'captures' in self._configuration_arg('check_all'):
  851. capture_dates.extend(modern_captures + all_captures)
  852. # Fallbacks if any of the above fail
  853. capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
  854. return orderedSet(filter(None, capture_dates))
  855. def _real_extract(self, url):
  856. video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
  857. url_date = url_date or url_date_2
  858. urlh = None
  859. retry_manager = self.RetryManager(fatal=False)
  860. for retry in retry_manager:
  861. try:
  862. urlh = self._request_webpage(
  863. HEADRequest(f'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{video_id}'),
  864. video_id, note='Fetching archived video file url', expected_status=True)
  865. except ExtractorError as e:
  866. # HTTP Error 404 is expected if the video is not saved.
  867. if isinstance(e.cause, HTTPError) and e.cause.status == 404:
  868. self.raise_no_formats(
  869. 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
  870. else:
  871. retry.error = e
  872. if retry_manager.error:
  873. self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id)
  874. capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
  875. self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
  876. info = {'id': video_id}
  877. for capture in capture_dates:
  878. webpage = self._download_webpage(
  879. (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
  880. video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
  881. note='Downloading capture webpage')
  882. current_info = self._extract_metadata(video_id, webpage or '')
  883. # Try avoid getting deleted video metadata
  884. if current_info.get('title'):
  885. info = merge_dicts(info, current_info)
  886. if 'captures' not in self._configuration_arg('check_all'):
  887. break
  888. info['thumbnails'] = self._extract_thumbnails(video_id)
  889. if urlh:
  890. url = urllib.parse.unquote(urlh.url)
  891. video_file_url_qs = parse_qs(url)
  892. # Attempt to recover any ext & format info from playback url & response headers
  893. fmt = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
  894. itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
  895. if itag and itag in YoutubeIE._formats:
  896. fmt.update(YoutubeIE._formats[itag])
  897. fmt.update({'format_id': itag})
  898. else:
  899. mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
  900. ext = (mimetype2ext(mime)
  901. or urlhandle_detect_ext(urlh)
  902. or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
  903. fmt.update({'ext': ext})
  904. info['formats'] = [fmt]
  905. if not info.get('duration'):
  906. info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
  907. if not info.get('title'):
  908. info['title'] = video_id
  909. return info