logo

oasis-root

Compiled tree of Oasis Linux based on own branch at <https://hacktivis.me/git/oasis/> git clone https://anongit.hacktivis.me/git/oasis-root.git

wistia.py (16395B)


  1. import base64
  2. import re
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..networking import HEADRequest
  6. from ..networking.exceptions import HTTPError
  7. from ..utils import (
  8. ExtractorError,
  9. determine_ext,
  10. filter_dict,
  11. float_or_none,
  12. int_or_none,
  13. parse_qs,
  14. traverse_obj,
  15. try_get,
  16. update_url_query,
  17. urlhandle_detect_ext,
  18. )
  19. class WistiaBaseIE(InfoExtractor):
  20. _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})'
  21. _VALID_URL_BASE = r'https?://(?:\w+\.)?wistia\.(?:net|com)/(?:embed/)?'
  22. _EMBED_BASE_URL = 'http://fast.wistia.net/embed/'
  23. def _download_embed_config(self, config_type, config_id, referer):
  24. base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}'
  25. video_password = self.get_param('videopassword')
  26. embed_config = self._download_json(
  27. base_url + '.json', config_id, headers={
  28. 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this.
  29. }, query=filter_dict({'password': video_password}))
  30. error = traverse_obj(embed_config, 'error')
  31. if error:
  32. raise ExtractorError(
  33. f'Error while getting the playlist: {error}', expected=True)
  34. if traverse_obj(embed_config, (
  35. 'media', ('embed_options', 'embedOptions'), 'plugin',
  36. 'passwordProtectedVideo', 'on', any)) == 'true':
  37. if video_password:
  38. raise ExtractorError('Invalid video password', expected=True)
  39. raise ExtractorError(
  40. 'This content is password-protected. Use the --video-password option', expected=True)
  41. return embed_config
  42. def _get_real_ext(self, url):
  43. ext = determine_ext(url, default_ext='bin')
  44. if ext == 'bin':
  45. urlh = self._request_webpage(
  46. HEADRequest(url), None, note='Checking media extension',
  47. errnote='HEAD request returned error', fatal=False)
  48. if urlh:
  49. ext = urlhandle_detect_ext(urlh, default='bin')
  50. return 'mp4' if ext == 'mov' else ext
  51. def _extract_media(self, embed_config):
  52. data = embed_config['media']
  53. video_id = data['hashedId']
  54. title = data['name']
  55. formats = []
  56. thumbnails = []
  57. for a in data['assets']:
  58. aurl = a.get('url')
  59. if not aurl:
  60. continue
  61. astatus = a.get('status')
  62. atype = a.get('type')
  63. if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
  64. continue
  65. elif atype in ('still', 'still_image'):
  66. thumbnails.append({
  67. 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'),
  68. 'width': int_or_none(a.get('width')),
  69. 'height': int_or_none(a.get('height')),
  70. 'filesize': int_or_none(a.get('size')),
  71. })
  72. else:
  73. aext = a.get('ext') or self._get_real_ext(aurl)
  74. display_name = a.get('display_name')
  75. format_id = atype
  76. if atype and atype.endswith('_video') and display_name:
  77. format_id = f'{atype[:-6]}-{display_name}'
  78. f = {
  79. 'format_id': format_id,
  80. 'url': aurl,
  81. 'tbr': int_or_none(a.get('bitrate')) or None,
  82. 'quality': 1 if atype == 'original' else None,
  83. }
  84. if display_name == 'Audio':
  85. f.update({
  86. 'vcodec': 'none',
  87. })
  88. else:
  89. f.update({
  90. 'width': int_or_none(a.get('width')),
  91. 'height': int_or_none(a.get('height')),
  92. 'vcodec': a.get('codec'),
  93. })
  94. if a.get('container') == 'm3u8' or aext == 'm3u8':
  95. ts_f = f.copy()
  96. ts_f.update({
  97. 'ext': 'ts',
  98. 'format_id': f['format_id'].replace('hls-', 'ts-'),
  99. 'url': f['url'].replace('.bin', '.ts'),
  100. })
  101. formats.append(ts_f)
  102. f.update({
  103. 'ext': 'mp4',
  104. 'protocol': 'm3u8_native',
  105. })
  106. else:
  107. f.update({
  108. 'container': a.get('container'),
  109. 'ext': aext,
  110. 'filesize': int_or_none(a.get('size')),
  111. })
  112. formats.append(f)
  113. subtitles = {}
  114. for caption in data.get('captions', []):
  115. language = caption.get('language')
  116. if not language:
  117. continue
  118. subtitles[language] = [{
  119. 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language,
  120. }]
  121. return {
  122. 'id': video_id,
  123. 'title': title,
  124. 'description': data.get('seoDescription'),
  125. 'formats': formats,
  126. 'thumbnails': thumbnails,
  127. 'duration': float_or_none(data.get('duration')),
  128. 'timestamp': int_or_none(data.get('createdAt')),
  129. 'subtitles': subtitles,
  130. }
  131. @classmethod
  132. def _extract_from_webpage(cls, url, webpage):
  133. from .teachable import TeachableIE
  134. if list(TeachableIE._extract_embed_urls(url, webpage)):
  135. return
  136. yield from super()._extract_from_webpage(url, webpage)
  137. @classmethod
  138. def _extract_wistia_async_embed(cls, webpage):
  139. # https://wistia.com/support/embed-and-share/video-on-your-website
  140. # https://wistia.com/support/embed-and-share/channel-embeds
  141. yield from re.finditer(
  142. r'''(?sx)
  143. <(?:div|section)[^>]+class=([\"'])(?:(?!\1).)*?(?P<type>wistia[a-z_0-9]+)\s*\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
  144. ''', webpage)
  145. @classmethod
  146. def _extract_url_media_id(cls, url):
  147. mobj = re.search(r'(?:wmediaid|wvideo(?:id)?)]?=(?P<id>[a-z0-9]{10})', urllib.parse.unquote_plus(url))
  148. if mobj:
  149. return mobj.group('id')
  150. class WistiaIE(WistiaBaseIE):
  151. _VALID_URL = rf'(?:wistia:|{WistiaBaseIE._VALID_URL_BASE}(?:iframe|medias)/){WistiaBaseIE._VALID_ID_REGEX}'
  152. _EMBED_REGEX = [
  153. r'''(?x)
  154. <(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\']
  155. (?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})
  156. ''']
  157. _TESTS = [{
  158. # with hls video
  159. 'url': 'wistia:807fafadvk',
  160. 'md5': 'daff0f3687a41d9a71b40e0e8c2610fe',
  161. 'info_dict': {
  162. 'id': '807fafadvk',
  163. 'ext': 'mp4',
  164. 'title': 'Drip Brennan Dunn Workshop',
  165. 'description': 'a JV Webinars video',
  166. 'upload_date': '20160518',
  167. 'timestamp': 1463607249,
  168. 'duration': 4987.11,
  169. },
  170. 'skip': 'video unavailable',
  171. }, {
  172. 'url': 'wistia:a6ndpko1wg',
  173. 'md5': '10c1ce9c4dde638202513ed17a3767bd',
  174. 'info_dict': {
  175. 'id': 'a6ndpko1wg',
  176. 'ext': 'mp4',
  177. 'title': 'Episode 2: Boxed Water\'s retention is thirsty',
  178. 'upload_date': '20210324',
  179. 'description': 'md5:da5994c2c2d254833b412469d9666b7a',
  180. 'duration': 966.0,
  181. 'timestamp': 1616614369,
  182. 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png',
  183. },
  184. }, {
  185. 'url': 'wistia:5vd7p4bct5',
  186. 'md5': 'b9676d24bf30945d97060638fbfe77f0',
  187. 'info_dict': {
  188. 'id': '5vd7p4bct5',
  189. 'ext': 'mp4',
  190. 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679',
  191. 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f',
  192. 'upload_date': '20220915',
  193. 'timestamp': 1663258727,
  194. 'duration': 623.019,
  195. 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$',
  196. },
  197. }, {
  198. 'url': 'wistia:sh7fpupwlt',
  199. 'only_matching': True,
  200. }, {
  201. 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
  202. 'only_matching': True,
  203. }, {
  204. 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
  205. 'only_matching': True,
  206. }, {
  207. 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
  208. 'only_matching': True,
  209. }]
  210. _WEBPAGE_TESTS = [{
  211. 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool',
  212. 'info_dict': {
  213. 'id': 'cqwukac3z1',
  214. 'ext': 'mp4',
  215. 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content',
  216. 'duration': 158.125,
  217. 'timestamp': 1618974400,
  218. 'description': 'md5:27abc99a758573560be72600ef95cece',
  219. 'upload_date': '20210421',
  220. 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg',
  221. },
  222. }, {
  223. 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
  224. 'md5': 'b9676d24bf30945d97060638fbfe77f0',
  225. 'info_dict': {
  226. 'id': '5vd7p4bct5',
  227. 'ext': 'mp4',
  228. 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
  229. 'upload_date': '20220915',
  230. 'timestamp': 1663258727,
  231. 'duration': 623.019,
  232. 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg',
  233. 'description': 'a Paywall Videos video',
  234. },
  235. }]
  236. def _real_extract(self, url):
  237. video_id = self._match_id(url)
  238. embed_config = self._download_embed_config('medias', video_id, url)
  239. return self._extract_media(embed_config)
  240. @classmethod
  241. def _extract_embed_urls(cls, url, webpage):
  242. urls = list(super()._extract_embed_urls(url, webpage))
  243. for match in cls._extract_wistia_async_embed(webpage):
  244. if match.group('type') != 'wistia_channel':
  245. urls.append('wistia:{}'.format(match.group('id')))
  246. for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})',
  247. webpage):
  248. urls.append('wistia:{}'.format(match.group('id')))
  249. if not WistiaChannelIE._extract_embed_urls(url, webpage): # Fallback
  250. media_id = cls._extract_url_media_id(url)
  251. if media_id:
  252. urls.append('wistia:{}'.format(match.group('id')))
  253. return urls
  254. class WistiaPlaylistIE(WistiaBaseIE):
  255. _VALID_URL = rf'{WistiaBaseIE._VALID_URL_BASE}playlists/{WistiaBaseIE._VALID_ID_REGEX}'
  256. _TEST = {
  257. 'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc',
  258. 'info_dict': {
  259. 'id': 'aodt9etokc',
  260. },
  261. 'playlist_count': 3,
  262. }
  263. def _real_extract(self, url):
  264. playlist_id = self._match_id(url)
  265. playlist = self._download_embed_config('playlists', playlist_id, url)
  266. entries = []
  267. for media in (try_get(playlist, lambda x: x[0]['medias']) or []):
  268. embed_config = media.get('embed_config')
  269. if not embed_config:
  270. continue
  271. entries.append(self._extract_media(embed_config))
  272. return self.playlist_result(entries, playlist_id)
  273. class WistiaChannelIE(WistiaBaseIE):
  274. _VALID_URL = rf'(?:wistiachannel:|{WistiaBaseIE._VALID_URL_BASE}channel/){WistiaBaseIE._VALID_ID_REGEX}'
  275. _TESTS = [{
  276. # JSON Embed API returns 403, should fall back to webpage
  277. 'url': 'https://fast.wistia.net/embed/channel/yvyvu7wjbg?wchannelid=yvyvu7wjbg',
  278. 'info_dict': {
  279. 'id': 'yvyvu7wjbg',
  280. 'title': 'Copysmith Tutorials and Education!',
  281. 'description': 'Learn all things Copysmith via short and informative videos!',
  282. },
  283. 'playlist_mincount': 7,
  284. 'expected_warnings': ['falling back to webpage'],
  285. }, {
  286. 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l',
  287. 'info_dict': {
  288. 'id': '3802iirk0l',
  289. 'title': 'The Roof',
  290. },
  291. 'playlist_mincount': 20,
  292. }, {
  293. # link to popup video, follow --no-playlist
  294. 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n',
  295. 'info_dict': {
  296. 'id': 'sp5dqjzw3n',
  297. 'ext': 'mp4',
  298. 'title': 'The Roof S2: The Modern CRO',
  299. 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png',
  300. 'duration': 86.487,
  301. 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n',
  302. 'timestamp': 1619790290,
  303. 'upload_date': '20210430',
  304. },
  305. 'params': {'noplaylist': True, 'skip_download': True},
  306. }]
  307. _WEBPAGE_TESTS = [{
  308. 'url': 'https://www.profitwell.com/recur/boxed-out',
  309. 'info_dict': {
  310. 'id': '6jyvmqz6zs',
  311. 'title': 'Boxed Out',
  312. 'description': 'md5:14a8a93a1dbe236718e6a59f8c8c7bae',
  313. },
  314. 'playlist_mincount': 30,
  315. }, {
  316. # section instead of div
  317. 'url': 'https://360learning.com/studio/onboarding-joei/',
  318. 'info_dict': {
  319. 'id': 'z874k93n2o',
  320. 'title': 'Onboarding Joei.',
  321. 'description': 'Coming to you weekly starting Feb 19th.',
  322. },
  323. 'playlist_mincount': 20,
  324. }, {
  325. 'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&amp%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt',
  326. 'info_dict': {
  327. 'id': 'pz0m0l0if3',
  328. 'title': 'A Framework for Improving Product Team Performance',
  329. 'ext': 'mp4',
  330. 'timestamp': 1653935275,
  331. 'upload_date': '20220530',
  332. 'description': 'Learn how to help your company improve and achieve your product related goals.',
  333. 'duration': 1854.39,
  334. 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png',
  335. },
  336. 'params': {'noplaylist': True, 'skip_download': True},
  337. }]
  338. def _real_extract(self, url):
  339. channel_id = self._match_id(url)
  340. media_id = self._extract_url_media_id(url)
  341. if not self._yes_playlist(channel_id, media_id, playlist_label='channel'):
  342. return self.url_result(f'wistia:{media_id}', 'Wistia')
  343. try:
  344. data = self._download_embed_config('channel', channel_id, url)
  345. except (ExtractorError, HTTPError):
  346. # Some channels give a 403 from the JSON API
  347. self.report_warning('Failed to download channel data from API, falling back to webpage.')
  348. webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)
  349. data = self._parse_json(
  350. self._search_regex(rf'wchanneljsonp-{channel_id}\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)', webpage, 'jsonp', channel_id),
  351. channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8')))
  352. # XXX: can there be more than one series?
  353. series = traverse_obj(data, ('series', 0), default={})
  354. entries = [
  355. self.url_result(f'wistia:{video["hashedId"]}', WistiaIE, title=video.get('name'))
  356. for video in traverse_obj(series, ('sections', ..., 'videos', ...)) or []
  357. if video.get('hashedId')
  358. ]
  359. return self.playlist_result(
  360. entries, channel_id, playlist_title=series.get('title'), playlist_description=series.get('description'))
  361. @classmethod
  362. def _extract_embed_urls(cls, url, webpage):
  363. yield from super()._extract_embed_urls(url, webpage)
  364. for match in cls._extract_wistia_async_embed(webpage):
  365. if match.group('type') == 'wistia_channel':
  366. # original url may contain wmediaid query param
  367. yield update_url_query(f'wistiachannel:{match.group("id")}', parse_qs(url))