commit 6c370bc149945e26c6e359147221b3aa96e0ea75
parent 37fd242342a04613317fd7f85170a20fb7323efe
Author: Remita Amine <remitamine@gmail.com>
Date: Thu, 10 Dec 2020 14:52:35 +0100
[facebook] Add support for Relay based pages(closes #26823)
Diffstat:
1 file changed, 66 insertions(+), 5 deletions(-)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
@@ -16,11 +16,13 @@ from ..utils import (
     clean_html,
     error_to_compat_str,
     ExtractorError,
+    float_or_none,
     get_element_by_id,
     int_or_none,
     js_to_json,
     limit_length,
     parse_count,
+    qualities,
     sanitized_Request,
     try_get,
     urlencode_postdata,
@@ -327,6 +329,14 @@ class FacebookIE(InfoExtractor):
                 return extract_video_data(try_get(
                     js_data, lambda x: x['jsmods']['instances'], list) or [])
 
+        formats = []
+
+        def extract_dash_manifest(video):
+            dash_manifest = video.get('dash_manifest')
+            if dash_manifest:
+                formats.extend(self._parse_mpd_formats(
+                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
                 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
@@ -335,6 +345,61 @@ class FacebookIE(InfoExtractor):
             video_data = extract_from_jsmods_instances(server_js_data)
 
         if not video_data:
+            graphql_data = self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
+                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
+            for require in (graphql_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    def parse_graphql_video(video):
+                        q = qualities(['sd', 'hd'])
+                        for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+                            playable_url = video.get('playable_url' + suffix)
+                            if not playable_url:
+                                continue
+                            formats.append({
+                                'format_id': format_id,
+                                'quality': q(format_id),
+                                'url': playable_url,
+                            })
+                        extract_dash_manifest(video)
+                        self._sort_formats(formats)
+                        v_id = video.get('videoId') or video.get('id') or video_id
+                        info = {
+                            'id': v_id,
+                            'formats': formats,
+                            'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                            'uploader_id': try_get(video, lambda x: x['owner']['id']),
+                            'timestamp': int_or_none(video.get('publish_time')),
+                            'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+                        }
+                        description = try_get(video, lambda x: x['savable_description']['text'])
+                        title = video.get('name')
+                        if title:
+                            info.update({
+                                'title': title,
+                                'description': description,
+                            })
+                        else:
+                            info['title'] = description or 'Facebook video #%s' % v_id
+                        return webpage, info
+
+                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
+                    attachments = try_get(data, [
+                        lambda x: x['video']['story']['attachments'],
+                        lambda x: x['video']['creation_story']['attachments'],
+                        lambda x: x['node']['comet_sections']['content']['story']['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {}
+                        if media.get('__typename') == 'Video':
+                            return parse_graphql_video(media)
+
+                    video = data.get('video') or {}
+                    if video:
+                        return parse_graphql_video(video)
+
+        if not video_data:
             if not fatal_if_no_video:
                 return webpage, False
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
@@ -375,7 +440,6 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot parse data')
 
         subtitles = {}
-        formats = []
         for f in video_data:
             format_id = f['stream_type']
             if f and isinstance(f, dict):
@@ -394,10 +458,7 @@ class FacebookIE(InfoExtractor):
                             'url': src,
                             'preference': preference,
                         })
-            dash_manifest = f[0].get('dash_manifest')
-            if dash_manifest:
-                formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+            extract_dash_manifest(f[0])
             subtitles_src = f[0].get('subtitles_src')
             if subtitles_src:
                 subtitles.setdefault('en', []).append({'url': subtitles_src})
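
Editor's note: the sketch below is not part of the patch. It is a minimal, self-contained illustration of the Relay payload shape the new code path expects: the JSON argument of handleWithCustomApplyEach() carries a 'require' list whose 'RelayPrefetchedStreamCache' entry holds the video data under require[3][1]['__bbox']['result']['data']. The sample payload and the extract_formats() helper are hypothetical and only mirror the sd/hd URL traversal done by parse_graphql_video() above; real Facebook responses contain many more fields, and the actual extractor also ranks the entries with qualities(['sd', 'hd']) and self._sort_formats().

# Illustrative only: a hypothetical stand-in for the Relay data the extractor now parses.
# Field names follow the diff above; the IDs, URLs, and values are made up.
SAMPLE_GRAPHQL_DATA = {
    'require': [[
        'RelayPrefetchedStreamCache',
        None,
        None,
        [None, {'__bbox': {'result': {'data': {
            'video': {
                'id': '123456789',
                'name': 'Sample title',
                'playable_url': 'https://example.invalid/sd.mp4',
                'playable_url_quality_hd': 'https://example.invalid/hd.mp4',
                'publish_time': 1607606400,
                'playable_duration_in_ms': 42000,
            },
        }}}}],
    ]],
}


def extract_formats(graphql_data):
    # Mirrors the sd/hd handling in parse_graphql_video(): try 'playable_url'
    # and 'playable_url_quality_hd', keeping whichever URLs are present.
    formats = []
    for require in graphql_data.get('require') or []:
        if require[0] != 'RelayPrefetchedStreamCache':
            continue
        data = require[3][1]['__bbox']['result']['data']
        video = data.get('video') or {}
        for suffix, format_id in (('', 'sd'), ('_quality_hd', 'hd')):
            playable_url = video.get('playable_url' + suffix)
            if playable_url:
                formats.append({'format_id': format_id, 'url': playable_url})
    return formats


print(extract_formats(SAMPLE_GRAPHQL_DATA))
# -> [{'format_id': 'sd', 'url': 'https://example.invalid/sd.mp4'},
#     {'format_id': 'hd', 'url': 'https://example.invalid/hd.mp4'}]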