commit: b31b5f4434b52816f3a5a1ae2cbe1d162be0fbd0
parent 86f2fa1590991fffae7b1daacae9164771312c0b
Author: Sergey M․ <dstftw@gmail.com>
Date: Fri, 20 Nov 2020 23:21:52 +0700
[youtube] Improve yt initial data extraction (closes #27093)
Diffstat:
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -283,6 +283,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
},
}
+ _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
+
def _call_api(self, ep, query, video_id):
data = self._DEFAULT_API_DATA.copy()
data.update(query)
@@ -299,8 +301,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _extract_yt_initial_data(self, video_id, webpage):
return self._parse_json(
self._search_regex(
- r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
- webpage, 'yt initial data'),
+ (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
+ self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
video_id)
@@ -1066,6 +1068,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
+ {
+ # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
+ 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
+ 'info_dict': {
+ 'id': 'CHqg6qOn4no',
+ 'ext': 'mp4',
+ 'title': 'Part 77 Sort a list of simple types in c#',
+ 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
+ 'upload_date': '20130831',
+ 'uploader_id': 'kudvenkat',
+ 'uploader': 'kudvenkat',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):