logo

youtube-dl

[mirror] Download/Watch videos from video hosters

git clone https://hacktivis.me/git/mirror/youtube-dl.git

linuxacademy.py (9511B)


  1. from __future__ import unicode_literals
  2. import json
  3. import random
  4. import re
  5. from .common import InfoExtractor
  6. from ..compat import (
  7. compat_b64decode,
  8. compat_HTTPError,
  9. compat_str,
  10. )
  11. from ..utils import (
  12. clean_html,
  13. ExtractorError,
  14. js_to_json,
  15. parse_duration,
  16. try_get,
  17. unified_timestamp,
  18. urlencode_postdata,
  19. urljoin,
  20. )
  21. class LinuxAcademyIE(InfoExtractor):
  22. _VALID_URL = r'''(?x)
  23. https?://
  24. (?:www\.)?linuxacademy\.com/cp/
  25. (?:
  26. courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
  27. modules/view/id/(?P<course_id>\d+)
  28. )
  29. '''
  30. _TESTS = [{
  31. 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
  32. 'info_dict': {
  33. 'id': '7971-2',
  34. 'ext': 'mp4',
  35. 'title': 'What Is Data Science',
  36. 'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
  37. 'timestamp': 1607387907,
  38. 'upload_date': '20201208',
  39. 'duration': 304,
  40. },
  41. 'params': {
  42. 'skip_download': True,
  43. },
  44. 'skip': 'Requires Linux Academy account credentials',
  45. }, {
  46. 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
  47. 'only_matching': True,
  48. }, {
  49. 'url': 'https://linuxacademy.com/cp/modules/view/id/154',
  50. 'info_dict': {
  51. 'id': '154',
  52. 'title': 'AWS Certified Cloud Practitioner',
  53. 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
  54. 'duration': 28835,
  55. },
  56. 'playlist_count': 41,
  57. 'skip': 'Requires Linux Academy account credentials',
  58. }]
  59. _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
  60. _ORIGIN_URL = 'https://linuxacademy.com'
  61. _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
  62. _NETRC_MACHINE = 'linuxacademy'
  63. def _real_initialize(self):
  64. self._login()
  65. def _login(self):
  66. username, password = self._get_login_info()
  67. if username is None:
  68. return
  69. def random_string():
  70. return ''.join([
  71. random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
  72. for _ in range(32)])
  73. webpage, urlh = self._download_webpage_handle(
  74. self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
  75. 'client_id': self._CLIENT_ID,
  76. 'response_type': 'token id_token',
  77. 'response_mode': 'web_message',
  78. 'redirect_uri': self._ORIGIN_URL,
  79. 'scope': 'openid email user_impersonation profile',
  80. 'audience': self._ORIGIN_URL,
  81. 'state': random_string(),
  82. 'nonce': random_string(),
  83. })
  84. login_data = self._parse_json(
  85. self._search_regex(
  86. r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
  87. 'login info', group='value'), None,
  88. transform_source=lambda x: compat_b64decode(x).decode('utf-8')
  89. )['extraParams']
  90. login_data.update({
  91. 'client_id': self._CLIENT_ID,
  92. 'redirect_uri': self._ORIGIN_URL,
  93. 'tenant': 'lacausers',
  94. 'connection': 'Username-Password-Authentication',
  95. 'username': username,
  96. 'password': password,
  97. 'sso': 'true',
  98. })
  99. login_state_url = urlh.geturl()
  100. try:
  101. login_page = self._download_webpage(
  102. 'https://login.linuxacademy.com/usernamepassword/login', None,
  103. 'Downloading login page', data=json.dumps(login_data).encode(),
  104. headers={
  105. 'Content-Type': 'application/json',
  106. 'Origin': 'https://login.linuxacademy.com',
  107. 'Referer': login_state_url,
  108. })
  109. except ExtractorError as e:
  110. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  111. error = self._parse_json(e.cause.read(), None)
  112. message = error.get('description') or error['code']
  113. raise ExtractorError(
  114. '%s said: %s' % (self.IE_NAME, message), expected=True)
  115. raise
  116. callback_page, urlh = self._download_webpage_handle(
  117. 'https://login.linuxacademy.com/login/callback', None,
  118. 'Downloading callback page',
  119. data=urlencode_postdata(self._hidden_inputs(login_page)),
  120. headers={
  121. 'Content-Type': 'application/x-www-form-urlencoded',
  122. 'Origin': 'https://login.linuxacademy.com',
  123. 'Referer': login_state_url,
  124. })
  125. access_token = self._search_regex(
  126. r'access_token=([^=&]+)', urlh.geturl(),
  127. 'access token', default=None)
  128. if not access_token:
  129. access_token = self._parse_json(
  130. self._search_regex(
  131. r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
  132. 'authorization response'), None,
  133. transform_source=js_to_json)['response']['access_token']
  134. self._download_webpage(
  135. 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
  136. % access_token, None, 'Downloading token validation page')
  137. def _real_extract(self, url):
  138. mobj = re.match(self._VALID_URL, url)
  139. chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
  140. item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
  141. webpage = self._download_webpage(url, item_id)
  142. # course path
  143. if course_id:
  144. module = self._parse_json(
  145. self._search_regex(
  146. r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'),
  147. item_id)
  148. entries = []
  149. chapter_number = None
  150. chapter = None
  151. chapter_id = None
  152. for item in module['items']:
  153. if not isinstance(item, dict):
  154. continue
  155. def type_field(key):
  156. return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
  157. type_fields = (type_field('name'), type_field('slug'))
  158. # Move to next module section
  159. if 'section' in type_fields:
  160. chapter = item.get('course_name')
  161. chapter_id = item.get('course_module')
  162. chapter_number = 1 if not chapter_number else chapter_number + 1
  163. continue
  164. # Skip non-lessons
  165. if 'lesson' not in type_fields:
  166. continue
  167. lesson_url = urljoin(url, item.get('url'))
  168. if not lesson_url:
  169. continue
  170. title = item.get('title') or item.get('lesson_name')
  171. description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
  172. entries.append({
  173. '_type': 'url_transparent',
  174. 'url': lesson_url,
  175. 'ie_key': LinuxAcademyIE.ie_key(),
  176. 'title': title,
  177. 'description': description,
  178. 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
  179. 'duration': parse_duration(item.get('duration')),
  180. 'chapter': chapter,
  181. 'chapter_id': chapter_id,
  182. 'chapter_number': chapter_number,
  183. })
  184. return {
  185. '_type': 'playlist',
  186. 'entries': entries,
  187. 'id': course_id,
  188. 'title': module.get('title'),
  189. 'description': module.get('md_desc') or clean_html(module.get('desc')),
  190. 'duration': parse_duration(module.get('duration')),
  191. }
  192. # single video path
  193. m3u8_url = self._parse_json(
  194. self._search_regex(
  195. r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
  196. item_id)[0]['file']
  197. formats = self._extract_m3u8_formats(
  198. m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
  199. m3u8_id='hls')
  200. self._sort_formats(formats)
  201. info = {
  202. 'id': item_id,
  203. 'formats': formats,
  204. }
  205. lesson = self._parse_json(
  206. self._search_regex(
  207. (r'window\.lesson\s*=\s*({.+?})\s*;',
  208. r'player\.lesson\s*=\s*({.+?})\s*;'),
  209. webpage, 'lesson', default='{}'), item_id, fatal=False)
  210. if lesson:
  211. info.update({
  212. 'title': lesson.get('lesson_name'),
  213. 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
  214. 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
  215. 'duration': parse_duration(lesson.get('duration')),
  216. })
  217. if not info.get('title'):
  218. info['title'] = self._search_regex(
  219. (r'>Lecture\s*:\s*(?P<value>[^<]+)',
  220. r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
  221. 'title', group='value')
  222. return info