commit: 20218040db2b1e063191cc470ce403d35d394e2a
parent c69e71733d9619cb1a2bee769b9a381b52901de3
Author: Sergey M․ <dstftw@gmail.com>
Date: Tue, 5 Nov 2019 05:21:16 +0700
[scte] Add extractor (closes #22975)
Diffstat:
2 files changed, 148 insertions(+), 0 deletions(-)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
@@ -980,6 +980,10 @@ from .sbs import SBSIE
from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE
from .scrippsnetworks import ScrippsNetworksWatchIE
+from .scte import (
+ SCTEIE,
+ SCTECourseIE,
+)
from .seeker import SeekerIE
from .senateisvp import SenateISVPIE
from .sendtonews import SendtoNewsIE
diff --git a/youtube_dl/extractor/scte.py b/youtube_dl/extractor/scte.py
@@ -0,0 +1,144 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ decode_packed_codes,
+ ExtractorError,
+ urlencode_postdata,
+)
+
+
+class SCTEBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
+ _NETRC_MACHINE = 'scte'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_popup = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(webpage):
+ return any(re.search(p, webpage) for p in (
+ r'class=["\']welcome\b', r'>Sign Out<'))
+
+ # already logged in
+ if is_logged(login_popup):
+ return
+
+ login_form = self._hidden_inputs(login_popup)
+
+ login_form.update({
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
+ })
+
+ response = self._download_webpage(
+ self._LOGIN_URL, None, 'Logging in',
+ data=urlencode_postdata(login_form))
+
+ if '|pageRedirect|' not in response and not is_logged(response):
+ error = self._html_search_regex(
+ r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class SCTEIE(SCTEBaseIE):
+ _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
+ 'info_dict': {
+ 'title': 'Introduction to DOCSIS Engineering Professional',
+ 'id': '31484',
+ },
+ 'playlist_count': 5,
+ 'skip': 'Requires account credentials',
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+
+ context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
+ content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
+ context = decode_packed_codes(self._download_webpage(
+ '%smobile/data.js' % content_base, video_id))
+
+ data = self._parse_xml(
+ self._search_regex(
+ r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
+ video_id)
+
+ entries = []
+ for asset in data.findall('.//asset'):
+ asset_url = asset.get('url')
+ if not asset_url or not asset_url.endswith('.mp4'):
+ continue
+ asset_id = self._search_regex(
+ r'video_([^_]+)_', asset_url, 'asset id', default=None)
+ if not asset_id:
+ continue
+ entries.append({
+ 'id': asset_id,
+ 'title': title,
+ 'url': content_base + asset_url,
+ })
+
+ return self.playlist_result(entries, video_id, title)
+
+
+class SCTECourseIE(SCTEBaseIE):
+ _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://learning.scte.org/course/view.php?id=3639',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://learning.scte.org/course/view.php?id=3073',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, course_id)
+
+ title = self._search_regex(
+ r'<h1>(.+?)</h1>', webpage, 'title', default=None)
+
+ entries = []
+ for mobj in re.finditer(
+ r'''(?x)
+ <a[^>]+
+ href=(["\'])
+ (?P<url>
+ https?://learning\.scte\.org/mod/
+ (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
+ \bid=\d+
+ )
+ ''',
+ webpage):
+ item_url = mobj.group('url')
+ if item_url == url:
+ continue
+ ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
+ else SCTECourseIE.ie_key())
+ entries.append(self.url_result(item_url, ie=ie))
+
+ return self.playlist_result(entries, course_id, title)