[YouTube] Bypass age-gating for certain restricted videos - youtube-dl - [mirror] Download/Watch videos from video hosters

commit: 58988c1421b88875a33015b08e4d2ada43021e09
parent e19ec5232216fd801ded88728df5b50bfb05c1cc
Author: dirkf <fieldhouse@gmx.net>
Date:   Mon, 31 Jan 2022 04:28:54 +0000

[YouTube] Bypass age-gating for certain restricted videos

* Use TVHTML5_SIMPLY_EMBEDDED_PLAYER client

* Also add and fix tests

* Introduce and use new utility function `update_url()`

Diffstat:
M youtube_dl/extractor/youtube.py 202 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M youtube_dl/utils.py 11 +++++++++++

2 files changed, 168 insertions(+), 45 deletions(-)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -42,6 +42,7 @@ from ..utils import (
     unescapeHTML,
     unified_strdate,
     unsmuggle_url,
+    update_url,
     update_url_query,
     url_or_none,
     urlencode_postdata,
@@ -286,15 +287,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
-    def _call_api(self, ep, query, video_id, fatal=True):
+    def _call_api(self, ep, query, video_id, fatal=True, headers=None):
         data = self._DEFAULT_API_DATA.copy()
         data.update(query)
+        real_headers = {'content-type': 'application/json'}
+        if headers:
+            real_headers.update(headers)
 
         return self._download_json(
             'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
             note='Downloading API JSON', errnote='Unable to download API page',
             data=json.dumps(data).encode('utf8'), fatal=fatal,
-            headers={'content-type': 'application/json'},
+            headers=real_headers,
             query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
 
     def _extract_yt_initial_data(self, video_id, webpage):
@@ -515,6 +519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'Philipp Hagemeister',
                 'uploader_id': 'phihag',
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'channel': 'Philipp Hagemeister',
                 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
                 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
                 'upload_date': '20121002',
@@ -524,10 +529,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'duration': 10,
                 'view_count': int,
                 'like_count': int,
-                'dislike_count': int,
+                'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
                 'start_time': 1,
                 'end_time': 9,
-            }
+            },
         },
         {
             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
@@ -562,7 +567,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'duration': 10,
                 'view_count': int,
                 'like_count': int,
-                'dislike_count': int,
             },
             'params': {
                 'skip_download': True,
@@ -621,8 +625,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
             }
         },
-        # Normal age-gate video (No vevo, embed allowed), available via embed page
+        # Age-gated videos
         {
+            'note': 'Age-gated video (No vevo, embed allowed)',
             'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
             'info_dict': {
                 'id': 'HtVdAasjOgU',
@@ -631,17 +636,97 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                 'duration': 142,
                 'uploader': 'The Witcher',
-                'uploader_id': 'WitcherGame',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
                 'upload_date': '20140605',
+                'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg',
+                'age_limit': 18,
+                'categories': ['Gaming'],
+                'tags': 'count:17',
+                'channel': 'The Witcher',
+                'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg',
+                'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
+                'view_count': int,
+                'like_count': int,
+            },
+        },
+        {
+            'note': 'Age-gated video with embed allowed in public site',
+            'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
+            'info_dict': {
+                'id': 'HsUATh_Nc2U',
+                'ext': 'mp4',
+                'title': 'Godzilla 2 (Official Video)',
+                'description': 'md5:bf77e03fcae5529475e500129b05668a',
+                'duration': 177,
+                'uploader': 'FlyingKitty',
+                'upload_date': '20200408',
+                'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',
                 'age_limit': 18,
+                'categories': ['Entertainment'],
+                'tags': ['Flyingkitty', 'godzilla 2'],
+                'channel': 'FlyingKitty',
+                'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
+                'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
+                'view_count': int,
+                'like_count': int,
             },
         },
         {
-            # Age-gated video only available with authentication (unavailable
-            # via embed page workaround)
+            'note': 'Age-gated video embedable only with clientScreen=EMBED',
+            'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
+            'info_dict': {
+                'id': 'Tq92D6wQ1mg',
+                'ext': 'mp4',
+                'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
+                'description': 'md5:17eccca93a786d51bc67646756894066',
+                'duration': 106,
+                'uploader': 'Projekt Melody',
+                'upload_date': '20191227',
+                'age_limit': 18,
+                'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg',
+                'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
+                'categories': ['Entertainment'],
+                'channel': 'Projekt Melody',
+                'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+                'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+                'view_count': int,
+                'like_count': int,
+            },
+        },
+        {
+            'note': 'Non-Age-gated non-embeddable video',
+            'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
+            'info_dict': {
+                'id': 'MeJVWBSsPAY',
+                'ext': 'mp4',
+                'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
+                'description': 'Fan Video. Music & Lyrics by OOMPH!.',
+                'duration': 210,
+                'uploader': 'Herr Lurik',
+                'uploader_id': 'st3in234',
+                'upload_date': '20130730',
+                'uploader_url': 'http://www.youtube.com/user/st3in234',
+                'age_limit': 0,
+                'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg',
+                'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'],
+                'categories': ['Music'],
+                'channel': 'Herr Lurik',
+                'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
+                'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
+                'artist': 'OOMPH!',
+                'view_count': int,
+                'like_count': int,
+            },
+        },
+        {
+            'note': 'Non-bypassable age-gated video',
+            'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
+            'only_matching': True,
+        },
+        {
+            'note': 'Age-gated video only available with authentication (not via embed workaround)',
             'url': 'XgnwCQzjau8',
             'only_matching': True,
+            'skip': '''This video has been removed for violating YouTube's Community Guidelines''',
         },
         # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
         # YouTube Red ad is not captured for creator
@@ -670,17 +755,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'info_dict': {
                 'id': 'lqQg6PlCWgI',
                 'ext': 'mp4',
+                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
+                'description': r're:(?s)(?:.+\s)?HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*',
                 'duration': 6085,
                 'upload_date': '20150827',
                 'uploader_id': 'olympic',
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
-                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
-                'uploader': 'Olympic',
-                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
+                'uploader': r're:Olympics?',
+                'age_limit': 0,
+                'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg',
+                'categories': ['Sports'],
+                'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'],
+                'channel': 'Olympics',
+                'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
+                'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q',
+                'view_count': int,
+                'like_count': int,
             },
-            'params': {
-                'skip_download': 'requires avconv',
-            }
         },
         # Non-square pixels
         {
@@ -1683,27 +1774,52 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             player_response = self._call_api(
                 'player', {'videoId': video_id}, video_id)
 
-        playability_status = player_response.get('playabilityStatus') or {}
-        if playability_status.get('reason') == 'Sign in to confirm your age':
-            video_info = self._download_webpage(
-                base_url + 'get_video_info', video_id,
-                'Refetching age-gated info webpage',
-                'unable to download video info webpage', query={
-                    'video_id': video_id,
-                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                    'html5': 1,
-                    # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544
-                    'c': 'TVHTML5',
-                    'cver': '6.20180913',
-                }, fatal=False)
-            if video_info:
-                pr = self._parse_json(
-                    try_get(
-                        compat_parse_qs(video_info),
-                        lambda x: x['player_response'][0], compat_str) or '{}',
-                    video_id, fatal=False)
-                if pr and isinstance(pr, dict):
-                    player_response = pr
+        def is_agegated(playability):
+            if not isinstance(playability, dict):
+                return
+
+            if playability.get('desktopLegacyAgeGateReason'):
+                return True
+
+            reasons = filter(None, (playability.get(r) for r in ('status', 'reason')))
+            AGE_GATE_REASONS = (
+                'confirm your age', 'age-restricted', 'inappropriate',  # reason
+                'age_verification_required', 'age_check_required',  # status
+            )
+            return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
+
+        def get_playability_status(response):
+            return try_get(response, lambda x: x['playabilityStatus'], dict) or {}
+
+        playability_status = get_playability_status(player_response)
+        if (is_agegated(playability_status)
+                and int_or_none(self._downloader.params.get('age_limit'), default=18) >= 18):
+
+            self.report_age_confirmation()
+
+            # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233
+            pb_context = {'html5Preference': 'HTML5_PREF_WANTS'}
+            query = {
+                'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}},
+                'contentCheckOk': True,
+                'racyCheckOk': True,
+                'context': {
+                    'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'},
+                    'thirdParty': {'embedUrl': 'https://google.com'},
+                },
+                'videoId': video_id,
+            }
+            headers = {
+                'X-YouTube-Client-Name': '85',
+                'X-YouTube-Client-Version': '2.0',
+                'Origin': 'https://www.youtube.com'
+            }
+
+            video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
+            age_gate_status = get_playability_status(video_info)
+            if age_gate_status.get('status') == 'OK':
+                player_response = video_info
+                playability_status = age_gate_status
 
         trailer_video_id = try_get(
             playability_status,
@@ -1932,12 +2048,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             for thumbnail in (try_get(
                     container,
                     lambda x: x['thumbnail']['thumbnails'], list) or []):
-                thumbnail_url = thumbnail.get('url')
+                thumbnail_url = url_or_none(thumbnail.get('url'))
                 if not thumbnail_url:
                     continue
                 thumbnails.append({
                     'height': int_or_none(thumbnail.get('height')),
-                    'url': thumbnail_url,
+                    'url': update_url(thumbnail_url, query=None, fragment=None),
                     'width': int_or_none(thumbnail.get('width')),
                 })
             if thumbnails:
@@ -2142,6 +2258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     sbr_tooltip = try_get(
                         vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
                     if sbr_tooltip:
+                        # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT
                         like_count, dislike_count = sbr_tooltip.split(' / ')
                         info.update({
                             'like_count': str_to_int(like_count),
@@ -2411,7 +2528,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'tags': list,
             'view_count': int,
             'like_count': int,
-            'dislike_count': int,
         },
         'params': {
             'skip_download': True,
@@ -2438,7 +2554,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'categories': ['News & Politics'],
             'tags': list,
             'like_count': int,
-            'dislike_count': int,
         },
         'params': {
             'skip_download': True,
@@ -2458,7 +2573,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'categories': ['News & Politics'],
             'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
             'like_count': int,
-            'dislike_count': int,
         },
         'params': {
             'skip_download': True,
@@ -3043,8 +3157,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 
     def _real_extract(self, url):
         item_id = self._match_id(url)
-        url = compat_urlparse.urlunparse(
-            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
+        url = update_url(url, netloc='www.youtube.com')
         # Handle both video/playlist URLs
         qs = parse_qs(url)
         video_id = qs.get('v', [None])[0]
@@ -3178,7 +3291,6 @@ class YoutubeYtBeIE(InfoExtractor):
             'categories': ['Nonprofits & Activism'],
             'tags': list,
             'like_count': int,
-            'dislike_count': int,
         },
         'params': {
             'noplaylist': True,
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -4121,6 +4121,17 @@ def update_url_query(url, query):
         query=compat_urllib_parse_urlencode(qs, True)))
 
 
+def update_url(url, **kwargs):
+    """Replace URL components specified by kwargs
+       url: compat_str or parsed URL tuple
+       returns: compat_str"""
+    if not kwargs:
+        return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url
+    if not isinstance(url, tuple):
+        url = compat_urlparse.urlparse(url)
+    return compat_urlparse.urlunparse(url._replace(**kwargs))
+
+
 def update_Request(req, url=None, data=None, headers={}, query={}):
     req_headers = req.headers.copy()
     req_headers.update(headers)

M	youtube_dl/extractor/youtube.py	202	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M	youtube_dl/utils.py	11	+++++++++++