logo

oasis-root

Compiled tree of Oasis Linux, based on own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

soundcloud.py (40811B)


  1. import functools
  2. import itertools
  3. import json
  4. import re
  5. from .common import InfoExtractor, SearchInfoExtractor
  6. from ..networking import HEADRequest
  7. from ..networking.exceptions import HTTPError
  8. from ..utils import (
  9. ExtractorError,
  10. float_or_none,
  11. int_or_none,
  12. join_nonempty,
  13. mimetype2ext,
  14. parse_qs,
  15. str_or_none,
  16. try_call,
  17. unified_timestamp,
  18. update_url_query,
  19. url_or_none,
  20. urlhandle_detect_ext,
  21. )
  22. from ..utils.traversal import traverse_obj
  23. class SoundcloudEmbedIE(InfoExtractor):
  24. _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
  25. _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1']
  26. _TEST = {
  27. # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
  28. 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
  29. 'only_matching': True,
  30. }
  31. def _real_extract(self, url):
  32. query = parse_qs(url)
  33. api_url = query['url'][0]
  34. secret_token = query.get('secret_token')
  35. if secret_token:
  36. api_url = update_url_query(api_url, {'secret_token': secret_token[0]})
  37. return self.url_result(api_url)
class SoundcloudBaseIE(InfoExtractor):
    """Common base for the SoundCloud extractors.

    Holds the API endpoints and the artwork-size table, manages the API
    ``client_id`` (loaded from the cache, refreshed from the site's scripts
    when the API answers 401/403), handles OAuth-token authentication and
    builds the info dict for a single track.
    """
    _NETRC_MACHINE = 'soundcloud'

    _API_V2_BASE = 'https://api-v2.soundcloud.com/'
    _BASE_URL = 'https://soundcloud.com/'
    # In this class, only referenced by the disabled password-login code kept
    # in _perform_login (sign() carries its own copy of the string)
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
    # In this class, only referenced by the disabled password-login code kept in _perform_login
    _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
    _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
    # Class-level dict that is mutated in place by _verify_oauth_token, so the
    # Authorization header it stores is visible to every subclass/instance
    _HEADERS = {}

    # Matches the "-<size id>.jpg" suffix of an image URL so that the size id
    # can be swapped for each entry of _ARTWORK_MAP
    _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'

    # Image size id -> edge length used for both width and height.
    # A value of 0 means no width/height is recorded for that size.
    _ARTWORK_MAP = {
        'mini': 16,
        'tiny': 20,
        'small': 32,
        'badge': 47,
        't67x67': 67,
        'large': 100,
        't300x300': 300,
        'crop': 400,
        't500x500': 500,
        'original': 0,
    }

    # "<protocol>_<preset base>" identifiers that the 'default' pattern expands to
    _DEFAULT_FORMATS = ['http_aac', 'hls_aac', 'http_opus', 'hls_opus', 'http_mp3', 'hls_mp3']

    @functools.cached_property
    def _is_requested(self):
        """Return a ``fullmatch`` callable testing whether a format identifier was requested.

        The patterns come from the 'formats' configuration argument (looked up
        with ie_key=SoundcloudIE, defaulting to ['default']). Each pattern is
        regex-escaped with '*' acting as a wildcard; the special pattern
        'default' expands to the entries of _DEFAULT_FORMATS.
        """
        return re.compile(r'|'.join(set(
            re.escape(pattern).replace(r'\*', r'.*') if pattern != 'default'
            else '|'.join(map(re.escape, self._DEFAULT_FORMATS))
            for pattern in self._configuration_arg('formats', ['default'], ie_key=SoundcloudIE)
        ))).fullmatch

    def _store_client_id(self, client_id):
        """Store ``client_id`` in the 'soundcloud' cache section (None is stored as-is)."""
        self.cache.store('soundcloud', 'client_id', client_id)

    def _update_client_id(self):
        """Scrape a fresh client id from the scripts referenced on the homepage.

        The script URLs are tried in reverse order of appearance; the first
        32-character alphanumeric ``client_id`` found is set on the instance
        and written to the cache.

        Raises ExtractorError if no script yields a client id.
        """
        webpage = self._download_webpage('https://soundcloud.com/', None)
        for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
            # A script that fails to download is skipped rather than aborting the search
            script = self._download_webpage(src, None, fatal=False)
            if script:
                client_id = self._search_regex(
                    r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
                    script, 'client id', default=None)
                if client_id:
                    self._CLIENT_ID = client_id
                    self._store_client_id(client_id)
                    return
        raise ExtractorError('Unable to extract client id')

    def _call_api(self, *args, **kwargs):
        """Call ``_download_json`` with the current client id added to the query.

        On an HTTP 401/403 the cached client id is cleared, a new one is
        scraped and the request is retried once. When called with
        ``fatal=False``, any other ExtractorError is reported as a warning and
        False is returned; otherwise it is re-raised.

        NOTE: if the retry also fails with 401/403, the loop simply ends and
        the method returns None implicitly, regardless of ``fatal``.
        """
        non_fatal = kwargs.get('fatal') is False
        if non_fatal:
            # The flag is handled here; _download_json is called without it
            # so that failures surface as ExtractorError below
            del kwargs['fatal']
        # Copy so that the caller's query dict is not mutated
        query = kwargs.get('query', {}).copy()
        for _ in range(2):
            # Re-read the client id on every attempt: it may have just been refreshed
            query['client_id'] = self._CLIENT_ID
            kwargs['query'] = query
            try:
                return self._download_json(*args, **kwargs)
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
                    self._store_client_id(None)
                    self._update_client_id()
                    continue
                elif non_fatal:
                    self.report_warning(str(e))
                    return False
                raise

    def _initialize_pre_login(self):
        """Load the cached client id, falling back to a hard-coded one."""
        self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'

    def _verify_oauth_token(self, token):
        """Check ``token`` against the session endpoint and install it on success.

        If the (non-fatal) verification request yields a response, the token is
        stored as the Authorization header in _HEADERS and the login is
        reported; otherwise a warning is issued and no header is set.
        """
        if self._request_webpage(
                self._API_VERIFY_AUTH_TOKEN % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
                None, note='Verifying login token...', fatal=False,
                data=json.dumps({'session': {'access_token': token}}).encode()):
            self._HEADERS['Authorization'] = f'OAuth {token}'
            self.report_login()
        else:
            self.report_warning('Provided authorization token is invalid. Continuing as guest')

    def _real_initialize(self):
        """Pick up an OAuth token from the 'oauth_token' cookie unless headers are already set."""
        if self._HEADERS:
            return
        # try_call wraps the cookie lookup, which fails when the cookie is absent
        if token := try_call(lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value):
            self._verify_oauth_token(token)

    def _perform_login(self, username, password):
        """Log in with an OAuth token passed as the password of user 'oauth'.

        Raises an expected ExtractorError for any other username. Does nothing
        if _HEADERS is already populated.
        """
        if username != 'oauth':
            raise ExtractorError(
                'Login using username and password is not currently supported. '
                'Use "--username oauth --password <oauth_token>" to login using an oauth token, '
                f'or else {self._login_hint(method="cookies")}', expected=True)
        if self._HEADERS:
            return
        self._verify_oauth_token(password)

        # The raw string below is a bare expression statement, i.e. it is never
        # executed: it holds the disabled username/password login flow.
        r'''
        def genDevId():
            def genNumBlock():
                return ''.join([str(random.randrange(10)) for i in range(6)])
            return '-'.join([genNumBlock() for i in range(4)])

        payload = {
            'client_id': self._CLIENT_ID,
            'recaptcha_pubkey': 'null',
            'recaptcha_response': 'null',
            'credentials': {
                'identifier': username,
                'password': password
            },
            'signature': self.sign(username, password, self._CLIENT_ID),
            'device_id': genDevId(),
            'user_agent': self._USER_AGENT
        }

        response = self._call_api(
            self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
            None, note='Verifying login token...', fatal=False,
            data=json.dumps(payload).encode())

        if token := traverse_obj(response, ('session', 'access_token', {str})):
            self._HEADERS['Authorization'] = f'OAuth {token}'
            self.report_login()
            return

        raise ExtractorError('Unable to get access token, login may have failed', expected=True)
        '''

    # signature generation
    def sign(self, user, pw, clid):
        """Return the '<rev>:<params>:<hex checksum>:<c>' signature string.

        Within this class it is only referenced by the disabled password-login
        code in _perform_login. ``pw`` is accepted but not used in the
        computation.
        """
        a = 33
        i = 1
        s = 440123
        w = 117
        u = 1800000
        l = 1042
        b = 37
        k = 37
        c = 5
        n = '0763ed7314c69015fd4a0dc16bbf4b90'  # _KEY
        y = '8'  # _REV
        r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'  # _USER_AGENT
        e = user  # _USERNAME
        t = clid  # _CLIENT_ID

        d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
        h = n + y + d + r + e + t + d + n
        m = 8011470

        for f in range(len(h)):
            # Rotate the 24-bit accumulator right by one bit, add the
            # character's code point, then mask back down to 24 bits
            m = (m >> 1) + ((1 & m) << 23)
            m += ord(h[f])
            m &= 16777215

        # c is not even needed
        return f'{y}:{d}:{m:x}:{c}'

    def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False):
        """Build the info dict for one track from its API metadata.

        info: track metadata mapping; must contain 'id'.
        full_title: not used by this method.
        secret_token: if given, added to the query of every API request made here.
        extract_flat: when true, no format requests are made and 'formats' is None.

        ExtractorErrors raised by _call_api for the per-format requests
        propagate to the caller (see the XXX note below).
        """
        track_id = str(info['id'])

        # URLs already turned into a format, used to skip duplicates
        format_urls = set()
        formats = []
        has_drm = False
        query = {'client_id': self._CLIENT_ID}
        if secret_token:
            query['secret_token'] = secret_token

        # Original uploaded file: only attempted when the metadata flags the
        # track as downloadable with downloads left
        if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
            try:
                # Do not use _call_api(); HTTP Error codes have different meanings for this request
                download_data = self._download_json(
                    f'{self._API_V2_BASE}tracks/{track_id}/download', track_id,
                    'Downloading original download format info JSON', query=query, headers=self._HEADERS)
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                    self.report_warning(
                        'Original download format is only available '
                        f'for registered users. {self._login_hint()}')
                elif isinstance(e.cause, HTTPError) and e.cause.status == 403:
                    self.write_debug('Original download format is not available for this client')
                else:
                    self.report_warning(e.msg)
                # Failure here is never fatal; extraction continues with the transcodings
                download_data = None

            if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})):
                # HEAD request: yields the final URL and headers (size, type) without the body
                urlh = self._request_webpage(
                    HEADRequest(redirect_url), track_id, 'Checking original download format availability',
                    'Original download format is not available', fatal=False)
                if urlh:
                    format_url = urlh.url
                    format_urls.add(format_url)
                    formats.append({
                        'format_id': 'download',
                        'ext': urlhandle_detect_ext(urlh, default='mp3'),
                        'filesize': int_or_none(urlh.headers.get('Content-Length')),
                        'url': format_url,
                        'quality': 10,
                        'format_note': 'Original',
                        'vcodec': 'none',
                    })

        def invalid_url(url):
            # Empty/None or already collected
            return not url or url in format_urls

        # New API
        for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']) and v['preset'])):
            # Flat extraction makes no per-format API requests
            if extract_flat:
                break
            format_url = t['url']
            preset = t['preset']
            preset_base = preset.partition('_')[0]

            protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http'
            # 'ctr-'/'cbc-' protocols are treated as DRM: remembered and skipped
            if protocol.startswith(('ctr-', 'cbc-')):
                has_drm = True
                continue
            if protocol == 'progressive':
                protocol = 'http'
            # The URL path overrides the declared protocol where they disagree
            if protocol != 'hls' and '/hls' in format_url:
                protocol = 'hls'
            if protocol == 'encrypted-hls' or '/encrypted-hls' in format_url:
                protocol = 'hls-aes'

            short_identifier = f'{protocol}_{preset_base}'
            if preset_base == 'abr':
                self.write_debug(f'Skipping broken "{short_identifier}" format')
                continue
            if not self._is_requested(short_identifier):
                self.write_debug(f'"{short_identifier}" is not a requested format, skipping')
                continue

            # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called
            stream_url = traverse_obj(self._call_api(
                format_url, track_id, f'Downloading {short_identifier} format info JSON',
                query=query, headers=self._HEADERS), ('url', {url_or_none}))
            if invalid_url(stream_url):
                continue
            format_urls.add(stream_url)

            mime_type = traverse_obj(t, ('format', 'mime_type', {str}))
            codec = self._search_regex(r'codecs="([^"]+)"', mime_type, 'codec', default=None)
            # Prefer an extension derived from the codec, then from the mime
            # type, and finally fall back to the preset's base name
            ext = {
                'mp4a': 'm4a',
                'opus': 'opus',
            }.get(codec[:4] if codec else None) or mimetype2ext(mime_type, default=None)
            if not ext or ext == 'm3u8':
                ext = preset_base

            is_premium = t.get('quality') == 'hq'
            # Bitrate: from the preset name, else from the stream URL, else
            # 256 for premium AAC presets
            abr = int_or_none(
                self._search_regex(r'(\d+)k$', preset, 'abr', default=None)
                or self._search_regex(r'\.(\d+)\.(?:opus|mp3)[/?]', stream_url, 'abr', default=None)
                or (256 if (is_premium and 'aac' in preset) else None))

            is_preview = (t.get('snipped')
                          or '/preview/' in format_url
                          or re.search(r'/(?:preview|playlist)/0/30/', stream_url))

            formats.append({
                'format_id': join_nonempty(protocol, preset, is_preview and 'preview', delim='_'),
                'url': stream_url,
                'ext': ext,
                'acodec': codec,
                'vcodec': 'none',
                'abr': abr,
                'protocol': 'm3u8_native' if protocol in ('hls', 'hls-aes') else 'http',
                'container': 'm4a_dash' if ext == 'm4a' else None,
                'quality': 5 if is_premium else 0 if (abr and abr >= 160) else -1,
                'format_note': 'Premium' if is_premium else None,
                'preference': -10 if is_preview else None,
            })

        # Only when nothing was extracted: report the likely reason
        if not formats:
            if has_drm:
                self.report_drm(track_id)
            if info.get('policy') == 'BLOCK':
                self.raise_geo_restricted(metadata_available=True)

        user = info.get('user') or {}

        thumbnails = []
        artwork_url = info.get('artwork_url')
        # Fall back to the uploader's avatar when the track has no artwork
        thumbnail = artwork_url or user.get('avatar_url')
        if isinstance(thumbnail, str):
            if re.search(self._IMAGE_REPL_RE, thumbnail):
                # Derive one thumbnail per known size by swapping the size id in the URL
                for image_id, size in self._ARTWORK_MAP.items():
                    i = {
                        'id': image_id,
                        'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.jpg', thumbnail),
                    }
                    # When falling back to the avatar, 'tiny' is recorded as 18 instead of 20
                    if image_id == 'tiny' and not artwork_url:
                        size = 18
                    elif image_id == 'original':
                        i['preference'] = 10
                    if size:
                        i.update({
                            'width': size,
                            'height': size,
                        })
                    thumbnails.append(i)
            else:
                # URL does not follow the "-<size>.jpg" scheme: use it as the only thumbnail
                thumbnails = [{'url': thumbnail}]

        def extract_count(key):
            # Reads info['<key>_count'] as an int (None if missing/invalid)
            return int_or_none(info.get(f'{key}_count'))

        return {
            'id': track_id,
            'uploader': user.get('username'),
            'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
            'uploader_url': user.get('permalink_url'),
            'timestamp': unified_timestamp(info.get('created_at')),
            'title': info.get('title'),
            'description': info.get('description'),
            'thumbnails': thumbnails,
            'duration': float_or_none(info.get('duration'), 1000),
            'webpage_url': info.get('permalink_url'),
            'license': info.get('license'),
            'view_count': extract_count('playback'),
            'like_count': extract_count('favoritings') or extract_count('likes'),
            'comment_count': extract_count('comment'),
            'repost_count': extract_count('reposts'),
            'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)),
            'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)),
            'formats': formats if not extract_flat else None,
        }

    @classmethod
    def _resolv_url(cls, url):
        """Return the API v2 'resolve' endpoint URL for ``url``.

        NOTE(review): ``url`` is appended verbatim (no percent-encoding) — confirm
        that every caller passes a URL that is safe to embed in a query string.
        """
        return cls._API_V2_BASE + 'resolve?url=' + url
  334. class SoundcloudIE(SoundcloudBaseIE):
  335. """Information extractor for soundcloud.com
  336. To access the media, the uid of the song and a stream token
  337. must be extracted from the page source and the script must make
  338. a request to media.soundcloud.com/crossdomain.xml. Then
  339. the media can be grabbed by requesting from an url composed
  340. of the stream token and uid
  341. """
  342. _VALID_URL = r'''(?x)^(?:https?://)?
  343. (?:(?:(?:www\.|m\.)?soundcloud\.com/
  344. (?!stations/track)
  345. (?P<uploader>[\w\d-]+)/
  346. (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
  347. (?P<title>[\w\d-]+)
  348. (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
  349. (?:[?].*)?$)
  350. |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
  351. (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
  352. )
  353. '''
  354. IE_NAME = 'soundcloud'
  355. _TESTS = [
  356. {
  357. 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
  358. 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
  359. 'info_dict': {
  360. 'id': '62986583',
  361. 'ext': 'opus',
  362. 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
  363. 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
  364. 'uploader': 'E.T. ExTerrestrial Music',
  365. 'uploader_id': '1571244',
  366. 'timestamp': 1349920598,
  367. 'upload_date': '20121011',
  368. 'duration': 143.216,
  369. 'license': 'all-rights-reserved',
  370. 'view_count': int,
  371. 'like_count': int,
  372. 'comment_count': int,
  373. 'repost_count': int,
  374. 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
  375. 'uploader_url': 'https://soundcloud.com/ethmusic',
  376. },
  377. },
  378. # geo-restricted
  379. {
  380. 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
  381. 'info_dict': {
  382. 'id': '47127627',
  383. 'ext': 'opus',
  384. 'title': 'Goldrushed',
  385. 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
  386. 'uploader': 'The Royal Concept',
  387. 'uploader_id': '9615865',
  388. 'timestamp': 1337635207,
  389. 'upload_date': '20120521',
  390. 'duration': 227.155,
  391. 'license': 'all-rights-reserved',
  392. 'view_count': int,
  393. 'like_count': int,
  394. 'comment_count': int,
  395. 'repost_count': int,
  396. 'uploader_url': 'https://soundcloud.com/the-concept-band',
  397. 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
  398. 'genres': ['Alternative'],
  399. 'artists': ['The Royal Concept'],
  400. },
  401. },
  402. # private link
  403. {
  404. 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
  405. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  406. 'info_dict': {
  407. 'id': '123998367',
  408. 'ext': 'mp3',
  409. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  410. 'description': 'test chars: "\'/\\ä↭',
  411. 'uploader': 'jaimeMF',
  412. 'uploader_id': '69767071',
  413. 'timestamp': 1386604920,
  414. 'upload_date': '20131209',
  415. 'duration': 9.927,
  416. 'license': 'all-rights-reserved',
  417. 'view_count': int,
  418. 'like_count': int,
  419. 'comment_count': int,
  420. 'repost_count': int,
  421. 'uploader_url': 'https://soundcloud.com/jaimemf',
  422. 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
  423. 'genres': ['youtubedl'],
  424. },
  425. },
  426. # private link (alt format)
  427. {
  428. 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
  429. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  430. 'info_dict': {
  431. 'id': '123998367',
  432. 'ext': 'mp3',
  433. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  434. 'description': 'test chars: "\'/\\ä↭',
  435. 'uploader': 'jaimeMF',
  436. 'uploader_id': '69767071',
  437. 'timestamp': 1386604920,
  438. 'upload_date': '20131209',
  439. 'duration': 9.927,
  440. 'license': 'all-rights-reserved',
  441. 'view_count': int,
  442. 'like_count': int,
  443. 'comment_count': int,
  444. 'repost_count': int,
  445. 'uploader_url': 'https://soundcloud.com/jaimemf',
  446. 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
  447. 'genres': ['youtubedl'],
  448. },
  449. },
  450. # downloadable song
  451. {
  452. 'url': 'https://soundcloud.com/the80m/the-following',
  453. 'md5': '9ffcddb08c87d74fb5808a3c183a1d04',
  454. 'info_dict': {
  455. 'id': '343609555',
  456. 'ext': 'wav',
  457. 'title': 'The Following',
  458. 'description': '',
  459. 'uploader': '80M',
  460. 'uploader_id': '312384765',
  461. 'uploader_url': 'https://soundcloud.com/the80m',
  462. 'upload_date': '20170922',
  463. 'timestamp': 1506120436,
  464. 'duration': 397.228,
  465. 'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
  466. 'license': 'all-rights-reserved',
  467. 'like_count': int,
  468. 'comment_count': int,
  469. 'repost_count': int,
  470. 'view_count': int,
  471. 'genres': ['Dance & EDM'],
  472. 'artists': ['80M'],
  473. },
  474. },
  475. # private link, downloadable format
  476. {
  477. 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
  478. 'md5': '64a60b16e617d41d0bef032b7f55441e',
  479. 'info_dict': {
  480. 'id': '340344461',
  481. 'ext': 'wav',
  482. 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
  483. 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
  484. 'uploader': 'Ori Uplift Music',
  485. 'uploader_id': '12563093',
  486. 'timestamp': 1504206263,
  487. 'upload_date': '20170831',
  488. 'duration': 7449.096,
  489. 'license': 'all-rights-reserved',
  490. 'view_count': int,
  491. 'like_count': int,
  492. 'comment_count': int,
  493. 'repost_count': int,
  494. 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
  495. 'uploader_url': 'https://soundcloud.com/oriuplift',
  496. 'genres': ['Trance'],
  497. 'artists': ['Ori Uplift'],
  498. },
  499. },
  500. # no album art, use avatar pic for thumbnail
  501. {
  502. 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
  503. 'md5': '59c7872bc44e5d99b7211891664760c2',
  504. 'info_dict': {
  505. 'id': '309699954',
  506. 'ext': 'mp3',
  507. 'title': 'Sideways (Prod. Mad Real)',
  508. 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
  509. 'uploader': 'garyvee',
  510. 'uploader_id': '2366352',
  511. 'timestamp': 1488152409,
  512. 'upload_date': '20170226',
  513. 'duration': 207.012,
  514. 'thumbnail': r're:https?://.*\.jpg',
  515. 'license': 'all-rights-reserved',
  516. 'view_count': int,
  517. 'like_count': int,
  518. 'comment_count': int,
  519. 'repost_count': int,
  520. 'uploader_url': 'https://soundcloud.com/garyvee',
  521. 'artists': ['MadReal'],
  522. },
  523. 'params': {
  524. 'skip_download': True,
  525. },
  526. },
  527. {
  528. 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
  529. 'md5': '8227c3473a4264df6b02ad7e5b7527ac',
  530. 'info_dict': {
  531. 'id': '583011102',
  532. 'ext': 'opus',
  533. 'title': 'Mezzo Valzer',
  534. 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
  535. 'uploader': 'Giovanni Sarani',
  536. 'uploader_id': '3352531',
  537. 'timestamp': 1551394171,
  538. 'upload_date': '20190228',
  539. 'duration': 180.157,
  540. 'thumbnail': r're:https?://.*\.jpg',
  541. 'license': 'all-rights-reserved',
  542. 'view_count': int,
  543. 'like_count': int,
  544. 'comment_count': int,
  545. 'repost_count': int,
  546. 'genres': ['Piano'],
  547. 'uploader_url': 'https://soundcloud.com/giovannisarani',
  548. },
  549. },
  550. {
  551. # AAC HQ format available (account with active subscription needed)
  552. 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
  553. 'only_matching': True,
  554. },
  555. {
  556. # Go+ (account with active subscription needed)
  557. 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do',
  558. 'only_matching': True,
  559. },
  560. ]
  561. def _real_extract(self, url):
  562. mobj = self._match_valid_url(url)
  563. track_id = mobj.group('track_id')
  564. query = {}
  565. if track_id:
  566. info_json_url = self._API_V2_BASE + 'tracks/' + track_id
  567. full_title = track_id
  568. token = mobj.group('secret_token')
  569. if token:
  570. query['secret_token'] = token
  571. else:
  572. full_title = resolve_title = '{}/{}'.format(*mobj.group('uploader', 'title'))
  573. token = mobj.group('token')
  574. if token:
  575. resolve_title += f'/{token}'
  576. info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
  577. info = self._call_api(
  578. info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
  579. for retry in self.RetryManager():
  580. try:
  581. return self._extract_info_dict(info, full_title, token)
  582. except ExtractorError as e:
  583. if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
  584. raise
  585. self.report_warning(
  586. 'You have reached the API rate limit, which is ~600 requests per '
  587. '10 minutes. Use the --extractor-retries and --retry-sleep options '
  588. 'to configure an appropriate retry count and wait time', only_once=True)
  589. retry.error = e.cause
  590. class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
  591. def _extract_set(self, playlist, token=None):
  592. playlist_id = str(playlist['id'])
  593. tracks = playlist.get('tracks') or []
  594. if not all(t.get('permalink_url') for t in tracks) and token:
  595. tracks = self._call_api(
  596. self._API_V2_BASE + 'tracks', playlist_id,
  597. 'Downloading tracks', query={
  598. 'ids': ','.join([str(t['id']) for t in tracks]),
  599. 'playlistId': playlist_id,
  600. 'playlistSecretToken': token,
  601. }, headers=self._HEADERS)
  602. entries = []
  603. for track in tracks:
  604. track_id = str_or_none(track.get('id'))
  605. url = track.get('permalink_url')
  606. if not url:
  607. if not track_id:
  608. continue
  609. url = self._API_V2_BASE + 'tracks/' + track_id
  610. if token:
  611. url += '?secret_token=' + token
  612. entries.append(self.url_result(
  613. url, SoundcloudIE.ie_key(), track_id))
  614. return self.playlist_result(
  615. entries, playlist_id,
  616. playlist.get('title'),
  617. playlist.get('description'))
  618. class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
  619. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?'
  620. IE_NAME = 'soundcloud:set'
  621. _TESTS = [{
  622. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
  623. 'info_dict': {
  624. 'id': '2284613',
  625. 'title': 'The Royal Concept EP',
  626. 'description': 'md5:71d07087c7a449e8941a70a29e34671e',
  627. },
  628. 'playlist_mincount': 5,
  629. }, {
  630. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
  631. 'only_matching': True,
  632. }, {
  633. 'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic',
  634. 'only_matching': True,
  635. }, {
  636. 'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de',
  637. 'only_matching': True,
  638. }, {
  639. 'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr',
  640. 'only_matching': True,
  641. }]
  642. def _real_extract(self, url):
  643. mobj = self._match_valid_url(url)
  644. full_title = '{}/sets/{}'.format(*mobj.group('uploader', 'slug_title'))
  645. token = mobj.group('token')
  646. if token:
  647. full_title += '/' + token
  648. info = self._call_api(self._resolv_url(
  649. self._BASE_URL + full_title), full_title, headers=self._HEADERS)
  650. if 'errors' in info:
  651. msgs = (str(err['error_message']) for err in info['errors'])
  652. raise ExtractorError('unable to download video webpage: {}'.format(','.join(msgs)))
  653. return self._extract_set(info, token)
  654. class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
  655. def _extract_playlist(self, base_url, playlist_id, playlist_title):
  656. return {
  657. '_type': 'playlist',
  658. 'id': playlist_id,
  659. 'title': playlist_title,
  660. 'entries': self._entries(base_url, playlist_id),
  661. }
  662. def _entries(self, url, playlist_id):
  663. # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
  664. # https://developers.soundcloud.com/blog/offset-pagination-deprecated
  665. query = {
  666. 'limit': 200,
  667. 'linked_partitioning': '1',
  668. 'offset': 0,
  669. }
  670. for i in itertools.count():
  671. for retry in self.RetryManager():
  672. try:
  673. response = self._call_api(
  674. url, playlist_id, query=query, headers=self._HEADERS,
  675. note=f'Downloading track page {i + 1}')
  676. break
  677. except ExtractorError as e:
  678. # Downloading page may result in intermittent 502 HTTP error
  679. # See https://github.com/yt-dlp/yt-dlp/issues/872
  680. if not isinstance(e.cause, HTTPError) or e.cause.status != 502:
  681. raise
  682. retry.error = e
  683. continue
  684. def resolve_entry(*candidates):
  685. for cand in candidates:
  686. if not isinstance(cand, dict):
  687. continue
  688. permalink_url = url_or_none(cand.get('permalink_url'))
  689. if permalink_url:
  690. return self.url_result(
  691. permalink_url,
  692. SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
  693. str_or_none(cand.get('id')), cand.get('title'))
  694. for e in response['collection'] or []:
  695. yield resolve_entry(e, e.get('track'), e.get('playlist'))
  696. url = response.get('next_href')
  697. if not url:
  698. break
  699. query.pop('offset', None)
  700. class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
  701. _VALID_URL = r'''(?x)
  702. https?://
  703. (?:(?:www|m)\.)?soundcloud\.com/
  704. (?P<user>[^/]+)
  705. (?:/
  706. (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
  707. )?
  708. /?(?:[?#].*)?$
  709. '''
  710. IE_NAME = 'soundcloud:user'
  711. _TESTS = [{
  712. 'url': 'https://soundcloud.com/soft-cell-official',
  713. 'info_dict': {
  714. 'id': '207965082',
  715. 'title': 'Soft Cell (All)',
  716. },
  717. 'playlist_mincount': 28,
  718. }, {
  719. 'url': 'https://soundcloud.com/soft-cell-official/tracks',
  720. 'info_dict': {
  721. 'id': '207965082',
  722. 'title': 'Soft Cell (Tracks)',
  723. },
  724. 'playlist_mincount': 27,
  725. }, {
  726. 'url': 'https://soundcloud.com/soft-cell-official/albums',
  727. 'info_dict': {
  728. 'id': '207965082',
  729. 'title': 'Soft Cell (Albums)',
  730. },
  731. 'playlist_mincount': 1,
  732. }, {
  733. 'url': 'https://soundcloud.com/jcv246/sets',
  734. 'info_dict': {
  735. 'id': '12982173',
  736. 'title': 'Jordi / cv (Sets)',
  737. },
  738. 'playlist_mincount': 2,
  739. }, {
  740. 'url': 'https://soundcloud.com/jcv246/reposts',
  741. 'info_dict': {
  742. 'id': '12982173',
  743. 'title': 'Jordi / cv (Reposts)',
  744. },
  745. 'playlist_mincount': 6,
  746. }, {
  747. 'url': 'https://soundcloud.com/clalberg/likes',
  748. 'info_dict': {
  749. 'id': '11817582',
  750. 'title': 'clalberg (Likes)',
  751. },
  752. 'playlist_mincount': 5,
  753. }, {
  754. 'url': 'https://soundcloud.com/grynpyret/spotlight',
  755. 'info_dict': {
  756. 'id': '7098329',
  757. 'title': 'Grynpyret (Spotlight)',
  758. },
  759. 'playlist_mincount': 1,
  760. }]
  761. _BASE_URL_MAP = {
  762. 'all': 'stream/users/%s',
  763. 'tracks': 'users/%s/tracks',
  764. 'albums': 'users/%s/albums',
  765. 'sets': 'users/%s/playlists',
  766. 'reposts': 'stream/users/%s/reposts',
  767. 'likes': 'users/%s/likes',
  768. 'spotlight': 'users/%s/spotlight',
  769. }
  770. def _real_extract(self, url):
  771. mobj = self._match_valid_url(url)
  772. uploader = mobj.group('user')
  773. user = self._call_api(
  774. self._resolv_url(self._BASE_URL + uploader),
  775. uploader, 'Downloading user info', headers=self._HEADERS)
  776. resource = mobj.group('rsrc') or 'all'
  777. return self._extract_playlist(
  778. self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
  779. str_or_none(user.get('id')),
  780. '{} ({})'.format(user['username'], resource.capitalize()))
  781. class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
  782. _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P<id>\d+)'
  783. IE_NAME = 'soundcloud:user:permalink'
  784. _TESTS = [{
  785. 'url': 'https://api.soundcloud.com/users/30909869',
  786. 'info_dict': {
  787. 'id': '30909869',
  788. 'title': 'neilcic',
  789. },
  790. 'playlist_mincount': 22,
  791. }]
  792. def _real_extract(self, url):
  793. user_id = self._match_id(url)
  794. user = self._call_api(
  795. self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)
  796. return self._extract_playlist(
  797. f'{self._API_V2_BASE}users/{user["id"]}/tracks', str(user['id']), user.get('username'))
  798. class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
  799. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
  800. IE_NAME = 'soundcloud:trackstation'
  801. _TESTS = [{
  802. 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
  803. 'info_dict': {
  804. 'id': '286017854',
  805. 'title': 'Track station: your text',
  806. },
  807. 'playlist_mincount': 47,
  808. }]
  809. def _real_extract(self, url):
  810. track_name = self._match_id(url)
  811. track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS)
  812. track_id = self._search_regex(
  813. r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
  814. return self._extract_playlist(
  815. self._API_V2_BASE + 'stations/{}/tracks'.format(track['id']),
  816. track_id, 'Track station: {}'.format(track['title']))
  817. class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
  818. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)'
  819. IE_NAME = 'soundcloud:related'
  820. _TESTS = [{
  821. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended',
  822. 'info_dict': {
  823. 'id': '1084577272',
  824. 'title': 'Sexapil - Pingers 5 (Recommended)',
  825. },
  826. 'playlist_mincount': 50,
  827. }, {
  828. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums',
  829. 'info_dict': {
  830. 'id': '1084577272',
  831. 'title': 'Sexapil - Pingers 5 (Albums)',
  832. },
  833. 'playlist_mincount': 1,
  834. }, {
  835. 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets',
  836. 'info_dict': {
  837. 'id': '1084577272',
  838. 'title': 'Sexapil - Pingers 5 (Sets)',
  839. },
  840. 'playlist_mincount': 4,
  841. }]
  842. _BASE_URL_MAP = {
  843. 'albums': 'tracks/%s/albums',
  844. 'sets': 'tracks/%s/playlists_without_albums',
  845. 'recommended': 'tracks/%s/related',
  846. }
  847. def _real_extract(self, url):
  848. slug, relation = self._match_valid_url(url).group('slug', 'relation')
  849. track = self._call_api(
  850. self._resolv_url(self._BASE_URL + slug),
  851. slug, 'Downloading track info', headers=self._HEADERS)
  852. if track.get('errors'):
  853. raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join(
  854. str(err['error_message']) for err in track['errors']), expected=True)
  855. return self._extract_playlist(
  856. self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']),
  857. '{} ({})'.format(track.get('title') or slug, relation.capitalize()))
  858. class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
  859. _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
  860. IE_NAME = 'soundcloud:playlist'
  861. _TESTS = [{
  862. 'url': 'https://api.soundcloud.com/playlists/4110309',
  863. 'info_dict': {
  864. 'id': '4110309',
  865. 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
  866. 'description': 're:.*?TILT Brass - Bowery Poetry Club',
  867. },
  868. 'playlist_count': 6,
  869. }]
  870. def _real_extract(self, url):
  871. mobj = self._match_valid_url(url)
  872. playlist_id = mobj.group('id')
  873. query = {}
  874. token = mobj.group('token')
  875. if token:
  876. query['secret_token'] = token
  877. data = self._call_api(
  878. self._API_V2_BASE + 'playlists/' + playlist_id,
  879. playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
  880. return self._extract_set(data, token)
  881. class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor):
  882. IE_NAME = 'soundcloud:search'
  883. IE_DESC = 'Soundcloud search'
  884. _SEARCH_KEY = 'scsearch'
  885. _TESTS = [{
  886. 'url': 'scsearch15:post-avant jazzcore',
  887. 'info_dict': {
  888. 'id': 'post-avant jazzcore',
  889. 'title': 'post-avant jazzcore',
  890. },
  891. 'playlist_count': 15,
  892. }]
  893. _MAX_RESULTS_PER_PAGE = 200
  894. _DEFAULT_RESULTS_PER_PAGE = 50
  895. def _get_collection(self, endpoint, collection_id, **query):
  896. limit = min(
  897. query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
  898. self._MAX_RESULTS_PER_PAGE)
  899. query.update({
  900. 'limit': limit,
  901. 'linked_partitioning': 1,
  902. 'offset': 0,
  903. })
  904. next_url = update_url_query(self._API_V2_BASE + endpoint, query)
  905. for i in itertools.count(1):
  906. response = self._call_api(
  907. next_url, collection_id, f'Downloading page {i}',
  908. 'Unable to download API page', headers=self._HEADERS)
  909. for item in response.get('collection') or []:
  910. if item:
  911. yield self.url_result(
  912. item['uri'], SoundcloudIE.ie_key(), **self._extract_info_dict(item, extract_flat=True))
  913. next_url = response.get('next_href')
  914. if not next_url:
  915. break
  916. def _get_n_results(self, query, n):
  917. return self.playlist_result(itertools.islice(
  918. self._get_collection('search/tracks', query, limit=n, q=query),
  919. 0, None if n == float('inf') else n), query, query)