logo

oasis-root

Compiled tree of Oasis Linux, based on its own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

bluesky.py (16963B)


  1. from .common import InfoExtractor
  2. from ..utils import (
  3. ExtractorError,
  4. format_field,
  5. int_or_none,
  6. mimetype2ext,
  7. orderedSet,
  8. parse_iso8601,
  9. truncate_string,
  10. update_url_query,
  11. url_basename,
  12. url_or_none,
  13. variadic,
  14. )
  15. from ..utils.traversal import traverse_obj
  class BlueskyIE(InfoExtractor):
      """Extract videos embedded in, or linked from, a Bluesky post.

      Both web URLs (``bsky.app`` / ``main.bsky.dev``) and ``at://`` post URIs
      are accepted; see ``_VALID_URL``.
      """

      _VALID_URL = [
          r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)',
          r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)',
      ]
      _TESTS = [{
          'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
          'md5': '375539c1930ab05d15585ed772ab54fd',
          'info_dict': {
              'id': '3l4omssdl632g',
              'ext': 'mp4',
              'uploader': 'Blu3Blu3Lilith',
              'uploader_id': 'blu3blue.bsky.social',
              'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social',
              'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
              'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'OMG WE HAVE VIDEOS NOW',
              'description': 'OMG WE HAVE VIDEOS NOW',
              'upload_date': '20240921',
              'timestamp': 1726940605,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
          },
      }, {
          'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
          'md5': 'b9e344fdbce9f2852c668a97efefb105',
          'info_dict': {
              'id': '3l3vgf77uco2g',
              'ext': 'mp4',
              'uploader': 'Bluesky',
              'uploader_id': 'bsky.app',
              'uploader_url': 'https://bsky.app/profile/bsky.app',
              'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
              'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'Bluesky now has video! Update your app to versi...',
              'alt_title': 'Bluesky video feature announcement',
              'description': r're:(?s)Bluesky now has video! .{239}',
              'upload_date': '20240911',
              'timestamp': 1726074716,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
              'subtitles': {
                  'en': 'mincount:1',
              },
          },
      }, {
          'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c',
          'md5': '5f2df8c200b5633eb7fb2c984d29772f',
          'info_dict': {
              'id': '3l4qhp7bcs52c',
              'ext': 'mp4',
              'uploader': 'souris',
              'uploader_id': 'souris.moe',
              'uploader_url': 'https://bsky.app/profile/souris.moe',
              'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
              'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'Bluesky video #3l4qhp7bcs52c',
              'upload_date': '20240922',
              'timestamp': 1727003838,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
          },
      }, {
          'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
          'md5': '1af9c7fda061cf7593bbffca89e43d1c',
          'info_dict': {
              'id': '3l3w4tnezek2e',
              'ext': 'mp4',
              'uploader': 'clean',
              'uploader_id': 'de1.pds.tentacle.expert',
              'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert',
              'channel_id': 'did:web:de1.tentacle.expert',
              'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'Bluesky video #3l3w4tnezek2e',
              'upload_date': '20240911',
              'timestamp': 1726098823,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
          },
      }, {
          'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
          'info_dict': {
              'id': 'XxK3t_5V3ao',
              'ext': 'mp4',
              'uploader': 'yunayu',
              'uploader_id': '@yunayuispink',
              'uploader_url': 'https://www.youtube.com/@yunayuispink',
              'channel': 'yunayu',
              'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
              'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
              'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
              'description': r're:Have a good goodx10000day',
              'title': '5min vs 5hours drawing',
              'availability': 'public',
              'live_status': 'not_live',
              'playable_in_embed': True,
              'upload_date': '20241026',
              'timestamp': 1729967784,
              'duration': 321,
              'age_limit': 0,
              'like_count': int,
              'view_count': int,
              'comment_count': int,
              'channel_follower_count': int,
              'categories': ['Entertainment'],
              'tags': [],
          },
          'add_ie': ['Youtube'],
      }, {
          'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
          'info_dict': {
              'id': '222792849',
              'ext': 'mp3',
              'uploader': 'LASERBAT',
              'uploader_id': 'laserbatx',
              'uploader_url': 'https://laserbatx.bandcamp.com',
              'artists': ['LASERBAT'],
              'album_artists': ['LASERBAT'],
              'album': 'Hari Nezumi [EP]',
              'track': 'Forward to the End',
              'title': 'LASERBAT - Forward to the End',
              'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
              'duration': 228.571,
              'track_id': '222792849',
              'release_date': '20230423',
              'upload_date': '20230423',
              'timestamp': 1682276040.0,
              'release_timestamp': 1682276040.0,
              'track_number': 1,
          },
          'add_ie': ['Bandcamp'],
      }, {
          'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
          'md5': 'b9e344fdbce9f2852c668a97efefb105',
          'info_dict': {
              'id': '3l3vgf77uco2g',
              'ext': 'mp4',
              'uploader': 'Bluesky',
              'uploader_id': 'bsky.app',
              'uploader_url': 'https://bsky.app/profile/bsky.app',
              'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
              'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'Bluesky now has video! Update your app to versi...',
              'alt_title': 'Bluesky video feature announcement',
              'description': r're:(?s)Bluesky now has video! .{239}',
              'upload_date': '20240911',
              'timestamp': 1726074716,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
              'subtitles': {
                  'en': 'mincount:1',
              },
          },
      }, {
          'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f',
          'md5': '8775118b235cf9fa6b5ad30f95cda75c',
          'info_dict': {
              'id': '3l7rdfxhyds2f',
              'ext': 'mp4',
              'uploader': 'cinnamon',
              'uploader_id': 'alt.bun.how',
              'uploader_url': 'https://bsky.app/profile/alt.bun.how',
              'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
              'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'crazy that i look like this tbh',
              'description': 'crazy that i look like this tbh',
              'upload_date': '20241030',
              'timestamp': 1730332128,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': ['sexual'],
              'age_limit': 18,
          },
      }, {
          'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr',
          'md5': '71b0eb6d85d03145e6af6642c7fc6d78',
          'info_dict': {
              'id': '3l6zrz6zyl2dr',
              'ext': 'mp4',
              'uploader': 'mary🐇',
              'uploader_id': 'mary.my.id',
              'uploader_url': 'https://bsky.app/profile/mary.my.id',
              'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem',
              'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem',
              'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
              'title': 'Bluesky video #3l6zrz6zyl2dr',
              'upload_date': '20241021',
              'timestamp': 1729523172,
              'like_count': int,
              'repost_count': int,
              'comment_count': int,
              'tags': [],
          },
      }, {
          'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w',
          'info_dict': {
              'id': '3l7gv55dc2o2w',
          },
          'playlist': [{
              'info_dict': {
                  'id': '3l7gv55dc2o2w',
                  'ext': 'mp4',
                  'upload_date': '20241026',
                  'description': 'One of my favorite videos',
                  'comment_count': int,
                  'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social',
                  'uploader': 'Purple.Ice.Tea',
                  'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
                  'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx',
                  'like_count': int,
                  'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx',
                  'repost_count': int,
                  'timestamp': 1729973202,
                  'tags': [],
                  'uploader_id': 'purpleicetea.bsky.social',
                  'title': 'One of my favorite videos',
              },
          }, {
              'info_dict': {
                  'id': '3l77u64l7le2e',
                  'ext': 'mp4',
                  'title': 'hearing people on twitter say that bluesky isn\'...',
                  'like_count': int,
                  'uploader_id': 'thafnine.net',
                  'uploader_url': 'https://bsky.app/profile/thafnine.net',
                  'upload_date': '20241024',
                  'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj',
                  'description': r're:(?s)hearing people on twitter say that bluesky .{93}',
                  'tags': [],
                  'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e',
                  'uploader': 'T9',
                  'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj',
                  'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
                  'timestamp': 1729731642,
                  'comment_count': int,
                  'repost_count': int,
              },
          }],
      }]
      # Blob download URL template; the placeholder is filled with the service endpoint base URL
      _BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob'

      def _get_service_endpoint(self, did, video_id):
          """Return the base URL of the personal data server that hosts `did`.

          The DID document is fetched from the host's ``/.well-known/did.json``
          for ``did:web:`` identifiers and from ``plc.directory`` for everything
          else. The download is non-fatal: if it fails, or the document lists no
          valid ``AtprotoPersonalDataServer`` endpoint, ``https://bsky.social``
          is returned instead.
          """
          if did.startswith('did:web:'):
              # Everything after the 'did:web:' prefix (8 characters) is used as the host
              url = f'https://{did[8:]}/.well-known/did.json'
          else:
              url = f'https://plc.directory/{did}'
          services = self._download_json(
              url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False)
          return traverse_obj(
              services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer',
                         'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social'

      def _real_extract(self, url):
          """Extract the video(s) referenced by a single Bluesky post.

          Returns the lone info dict when exactly one entry is found, otherwise
          a playlist result containing every entry.

          Raises:
              ExtractorError: if the API response carries no post (e.g. the post
                  was not found or is blocked), or the post has no video.
          """
          handle, video_id = self._match_valid_url(url).group('handle', 'id')

          response = self._download_json(
              'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
              video_id, query={
                  'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
                  'depth': 0,
                  'parentHeight': 0,
              })
          thread = traverse_obj(response, ('thread', {dict})) or {}
          post = traverse_obj(thread, ('post', {dict}))
          if not post:
              # `thread` is a union type: instead of a regular thread view (which carries
              # `post`) the API may answer with a "not found" or "blocked" placeholder.
              # Report those explicitly rather than failing with a bare KeyError.
              if thread.get('notFound'):
                  raise ExtractorError('This post could not be found', expected=True)
              if thread.get('blocked'):
                  raise ExtractorError('This post is blocked and cannot be accessed', expected=True)
              raise ExtractorError('Unable to find post data in the API response')

          entries = []
          # app.bsky.embed.video.view/app.bsky.embed.external.view
          entries.extend(self._extract_videos(post, video_id))
          # app.bsky.embed.recordWithMedia.view
          entries.extend(self._extract_videos(
              post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media')))
          # app.bsky.embed.record.view
          if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)):
              entries.extend(self._extract_videos(
                  nested_post, video_id, embed_path=('embeds', 0), record_path='value'))

          if not entries:
              raise ExtractorError('No video could be found in this post', expected=True)
          if len(entries) == 1:
              return entries[0]
          return self.playlist_result(entries, video_id)

      @staticmethod
      def _build_profile_url(path):
          """Return the bsky.app profile URL for a handle or DID, or None if `path` is empty."""
          return format_field(path, None, 'https://bsky.app/profile/%s', default=None)

      def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'):
          """Collect the entries (external links and hosted video) found in one post object.

          Args:
              root: the post (or quoted-post) dict to inspect.
              video_id: ID used for download messages and as the fallback entry ID.
              embed_path: key or key path from `root` to the embed view.
              record_path: key or key path from `root` to the post record.
              record_subpath: key or key path, relative to the record, to its embed.

          Returns:
              A list of entries: a URL result for an external link, if any,
              followed by an info dict for the hosted video, if any.
          """
          # Normalise the path arguments to tuples so they can be star-unpacked into longer paths
          embed_path = variadic(embed_path, (str, bytes, dict, set))
          record_path = variadic(record_path, (str, bytes, dict, set))
          record_subpath = variadic(record_subpath, (str, bytes, dict, set))

          entries = []
          # An external link may be present on the record's embed or on the embed view;
          # hand it over to whichever extractor matches the URL
          if external_uri := traverse_obj(root, (
                  ((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)):
              entries.append(self.url_result(external_uri))

          if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})):
              formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                  playlist, video_id, 'mp4', m3u8_id='hls', fatal=False)
          else:
              # No HLS playlist means no hosted video; only the external link (if any) remains
              return entries

          # The blob CID is taken from the embed view, falling back to the record's video reference
          video_cid = traverse_obj(
              root, (*embed_path, 'cid', {str}),
              (*record_path, *record_subpath, 'video', 'ref', '$link', {str}))
          did = traverse_obj(root, ('author', 'did', {str}))

          if did and video_cid:
              endpoint = self._get_service_endpoint(did, video_id)

              # Offer the blob itself as an additional format next to the HLS variants
              formats.append({
                  'format_id': 'blob',
                  'url': update_url_query(
                      self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': video_cid}),
                  **traverse_obj(root, (*embed_path, 'aspectRatio', {
                      'width': ('width', {int_or_none}),
                      'height': ('height', {int_or_none}),
                  })),
                  **traverse_obj(root, (*record_path, *record_subpath, 'video', {
                      'filesize': ('size', {int_or_none}),
                      'ext': ('mimeType', {mimetype2ext}),
                  })),
              })

              # Caption files are blobs as well, so they need the same endpoint and DID
              for sub_data in traverse_obj(root, (
                      *record_path, *record_subpath, 'captions', lambda _, v: v['file']['ref']['$link'])):
                  subtitles.setdefault(sub_data.get('lang') or 'und', []).append({
                      'url': update_url_query(
                          self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': sub_data['file']['ref']['$link']}),
                      'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})),
                  })

          entries.append({
              # Fallback ID; replaced below by the basename of the post's own URI when present
              'id': video_id,
              'formats': formats,
              'subtitles': subtitles,
              **traverse_obj(root, {
                  'id': ('uri', {url_basename}),
                  'thumbnail': (*embed_path, 'thumbnail', {url_or_none}),
                  'alt_title': (*embed_path, 'alt', {str}, filter),
                  'uploader': ('author', 'displayName', {str}),
                  'uploader_id': ('author', 'handle', {str}),
                  'uploader_url': ('author', 'handle', {self._build_profile_url}),
                  'channel_id': ('author', 'did', {str}),
                  'channel_url': ('author', 'did', {self._build_profile_url}),
                  'like_count': ('likeCount', {int_or_none}),
                  'repost_count': ('repostCount', {int_or_none}),
                  'comment_count': ('replyCount', {int_or_none}),
                  'timestamp': ('indexedAt', {parse_iso8601}),
                  'tags': ('labels', ..., 'val', {str}, all, {orderedSet}),
                  # Any of these label values marks the post as adult content
                  'age_limit': (
                      'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any),
                  'description': (*record_path, 'text', {str}, filter),
                  # Single-line, shortened version of the post text
                  'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
              }),
          })
          return entries