logo

oasis-root

Compiled tree of Oasis Linux, based on the maintainer's own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

bandcamp.py (20263B)


  1. import json
  2. import random
  3. import re
  4. import time
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. KNOWN_EXTENSIONS,
  8. ExtractorError,
  9. extract_attributes,
  10. float_or_none,
  11. int_or_none,
  12. parse_filesize,
  13. str_or_none,
  14. try_get,
  15. unified_strdate,
  16. unified_timestamp,
  17. update_url_query,
  18. url_or_none,
  19. urljoin,
  20. )
  21. from ..utils.traversal import find_element, traverse_obj
  22. class BandcampIE(InfoExtractor):
  23. _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
  24. _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
  25. _TESTS = [{
  26. 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  27. 'md5': 'c557841d5e50261777a6585648adf439',
  28. 'info_dict': {
  29. 'id': '1812978515',
  30. 'ext': 'mp3',
  31. 'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
  32. 'duration': 9.8485,
  33. 'uploader': 'youtube-dl "\'/\\ä↭',
  34. 'upload_date': '20121129',
  35. 'timestamp': 1354224127,
  36. 'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
  37. 'album_artist': 'youtube-dl "\'/\\ä↭',
  38. 'track_id': '1812978515',
  39. 'artist': 'youtube-dl "\'/\\ä↭',
  40. 'uploader_url': 'https://youtube-dl.bandcamp.com',
  41. 'uploader_id': 'youtube-dl',
  42. 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
  43. 'artists': ['youtube-dl "\'/\\ä↭'],
  44. 'album_artists': ['youtube-dl "\'/\\ä↭'],
  45. },
  46. 'skip': 'There is a limit of 200 free downloads / month for the test song',
  47. }, {
  48. # free download
  49. 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
  50. 'info_dict': {
  51. 'id': '2650410135',
  52. 'ext': 'm4a',
  53. 'acodec': r're:[fa]lac',
  54. 'title': 'Ben Prunty - Lanius (Battle)',
  55. 'thumbnail': r're:^https?://.*\.jpg$',
  56. 'uploader': 'Ben Prunty',
  57. 'timestamp': 1396508491,
  58. 'upload_date': '20140403',
  59. 'release_timestamp': 1396483200,
  60. 'release_date': '20140403',
  61. 'duration': 260.877,
  62. 'track': 'Lanius (Battle)',
  63. 'track_number': 1,
  64. 'track_id': '2650410135',
  65. 'artist': 'Ben Prunty',
  66. 'album_artist': 'Ben Prunty',
  67. 'album': 'FTL: Advanced Edition Soundtrack',
  68. 'uploader_url': 'https://benprunty.bandcamp.com',
  69. 'uploader_id': 'benprunty',
  70. },
  71. }, {
  72. # no free download, mp3 128
  73. 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
  74. 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
  75. 'info_dict': {
  76. 'id': '2584466013',
  77. 'ext': 'mp3',
  78. 'title': 'Mastodon - Hail to Fire',
  79. 'thumbnail': r're:^https?://.*\.jpg$',
  80. 'uploader': 'Mastodon',
  81. 'timestamp': 1322005399,
  82. 'upload_date': '20111122',
  83. 'release_timestamp': 1076112000,
  84. 'release_date': '20040207',
  85. 'duration': 120.79,
  86. 'track': 'Hail to Fire',
  87. 'track_number': 5,
  88. 'track_id': '2584466013',
  89. 'artist': 'Mastodon',
  90. 'album_artist': 'Mastodon',
  91. 'album': 'Call of the Mastodon',
  92. 'uploader_url': 'https://relapsealumni.bandcamp.com',
  93. 'uploader_id': 'relapsealumni',
  94. },
  95. }, {
  96. # track from compilation album (artist/album_artist difference)
  97. 'url': 'https://diskotopia.bandcamp.com/track/safehouse',
  98. 'md5': '19c5337bca1428afa54129f86a2f6a69',
  99. 'info_dict': {
  100. 'id': '1978174799',
  101. 'ext': 'mp3',
  102. 'title': 'submerse - submerse - Safehouse',
  103. 'thumbnail': r're:^https?://.*\.jpg$',
  104. 'uploader': 'submerse',
  105. 'timestamp': 1480779297,
  106. 'upload_date': '20161203',
  107. 'release_timestamp': 1481068800,
  108. 'release_date': '20161207',
  109. 'duration': 154.066,
  110. 'track': 'submerse - Safehouse',
  111. 'track_number': 3,
  112. 'track_id': '1978174799',
  113. 'artist': 'submerse',
  114. 'album_artist': 'Diskotopia',
  115. 'album': 'DSK F/W 2016-2017 Free Compilation',
  116. 'uploader_url': 'https://diskotopia.bandcamp.com',
  117. 'uploader_id': 'diskotopia',
  118. },
  119. }]
  120. def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
  121. return self._parse_json(self._html_search_regex(
  122. rf'data-{attr}=(["\'])({{.+?}})\1', webpage,
  123. attr + ' data', group=2), video_id, fatal=fatal)
  124. def _real_extract(self, url):
  125. title, uploader = self._match_valid_url(url).group('id', 'uploader')
  126. webpage = self._download_webpage(url, title)
  127. tralbum = self._extract_data_attr(webpage, title)
  128. thumbnail = self._og_search_thumbnail(webpage)
  129. track_id = None
  130. track = None
  131. track_number = None
  132. duration = None
  133. formats = []
  134. track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
  135. if track_info:
  136. file_ = track_info.get('file')
  137. if isinstance(file_, dict):
  138. for format_id, format_url in file_.items():
  139. if not url_or_none(format_url):
  140. continue
  141. ext, abr_str = format_id.split('-', 1)
  142. formats.append({
  143. 'format_id': format_id,
  144. 'url': self._proto_relative_url(format_url, 'http:'),
  145. 'ext': ext,
  146. 'vcodec': 'none',
  147. 'acodec': ext,
  148. 'abr': int_or_none(abr_str),
  149. })
  150. track = track_info.get('title')
  151. track_id = str_or_none(
  152. track_info.get('track_id') or track_info.get('id'))
  153. track_number = int_or_none(track_info.get('track_num'))
  154. duration = float_or_none(track_info.get('duration'))
  155. embed = self._extract_data_attr(webpage, title, 'embed', False)
  156. current = tralbum.get('current') or {}
  157. artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
  158. album_artist = self._html_search_regex(
  159. r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
  160. webpage, 'album artist', fatal=False)
  161. timestamp = unified_timestamp(
  162. current.get('publish_date') or tralbum.get('album_publish_date'))
  163. download_link = tralbum.get('freeDownloadPage')
  164. if download_link:
  165. track_id = str(tralbum['id'])
  166. download_webpage = self._download_webpage(
  167. download_link, track_id, 'Downloading free downloads page')
  168. blob = self._extract_data_attr(download_webpage, track_id, 'blob')
  169. info = try_get(
  170. blob, (lambda x: x['digital_items'][0],
  171. lambda x: x['download_items'][0]), dict)
  172. if info:
  173. downloads = info.get('downloads')
  174. if isinstance(downloads, dict):
  175. if not track:
  176. track = info.get('title')
  177. if not artist:
  178. artist = info.get('artist')
  179. if not thumbnail:
  180. thumbnail = info.get('thumb_url')
  181. download_formats = {}
  182. download_formats_list = blob.get('download_formats')
  183. if isinstance(download_formats_list, list):
  184. for f in blob['download_formats']:
  185. name, ext = f.get('name'), f.get('file_extension')
  186. if all(isinstance(x, str) for x in (name, ext)):
  187. download_formats[name] = ext.strip('.')
  188. for format_id, f in downloads.items():
  189. format_url = f.get('url')
  190. if not format_url:
  191. continue
  192. # Stat URL generation algorithm is reverse engineered from
  193. # download_*_bundle_*.js
  194. stat_url = update_url_query(
  195. format_url.replace('/download/', '/statdownload/'), {
  196. '.rand': int(time.time() * 1000 * random.random()),
  197. })
  198. format_id = f.get('encoding_name') or format_id
  199. stat = self._download_json(
  200. stat_url, track_id, f'Downloading {format_id} JSON',
  201. transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
  202. fatal=False)
  203. if not stat:
  204. continue
  205. retry_url = url_or_none(stat.get('retry_url'))
  206. if not retry_url:
  207. continue
  208. formats.append({
  209. 'url': self._proto_relative_url(retry_url, 'http:'),
  210. 'ext': download_formats.get(format_id),
  211. 'format_id': format_id,
  212. 'format_note': f.get('description'),
  213. 'filesize': parse_filesize(f.get('size_mb')),
  214. 'vcodec': 'none',
  215. 'acodec': format_id.split('-')[0],
  216. })
  217. title = f'{artist} - {track}' if artist else track
  218. if not duration:
  219. duration = float_or_none(self._html_search_meta(
  220. 'duration', webpage, default=None))
  221. return {
  222. 'id': track_id,
  223. 'title': title,
  224. 'thumbnail': thumbnail,
  225. 'uploader': artist,
  226. 'uploader_id': uploader,
  227. 'uploader_url': f'https://{uploader}.bandcamp.com',
  228. 'timestamp': timestamp,
  229. 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
  230. 'duration': duration,
  231. 'track': track,
  232. 'track_number': track_number,
  233. 'track_id': track_id,
  234. 'artist': artist,
  235. 'album': embed.get('album_title'),
  236. 'album_artist': album_artist,
  237. 'formats': formats,
  238. }
  239. class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE
  240. IE_NAME = 'Bandcamp:album'
  241. _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'
  242. _TESTS = [{
  243. 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  244. 'playlist': [
  245. {
  246. 'md5': '39bc1eded3476e927c724321ddf116cf',
  247. 'info_dict': {
  248. 'id': '1353101989',
  249. 'ext': 'mp3',
  250. 'title': 'Blazo - Intro',
  251. 'timestamp': 1311756226,
  252. 'upload_date': '20110727',
  253. 'uploader': 'Blazo',
  254. 'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
  255. 'album_artists': ['Blazo'],
  256. 'uploader_url': 'https://blazo.bandcamp.com',
  257. 'release_date': '20110727',
  258. 'release_timestamp': 1311724800.0,
  259. 'track': 'Intro',
  260. 'uploader_id': 'blazo',
  261. 'track_number': 1,
  262. 'album': 'Jazz Format Mixtape vol.1',
  263. 'artists': ['Blazo'],
  264. 'duration': 19.335,
  265. 'track_id': '1353101989',
  266. },
  267. },
  268. {
  269. 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
  270. 'info_dict': {
  271. 'id': '38097443',
  272. 'ext': 'mp3',
  273. 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
  274. 'timestamp': 1311757238,
  275. 'upload_date': '20110727',
  276. 'uploader': 'Blazo',
  277. 'track': 'Kero One - Keep It Alive (Blazo remix)',
  278. 'release_date': '20110727',
  279. 'track_id': '38097443',
  280. 'track_number': 2,
  281. 'duration': 181.467,
  282. 'uploader_url': 'https://blazo.bandcamp.com',
  283. 'album': 'Jazz Format Mixtape vol.1',
  284. 'uploader_id': 'blazo',
  285. 'album_artists': ['Blazo'],
  286. 'artists': ['Blazo'],
  287. 'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
  288. 'release_timestamp': 1311724800.0,
  289. },
  290. },
  291. ],
  292. 'info_dict': {
  293. 'title': 'Jazz Format Mixtape vol.1',
  294. 'id': 'jazz-format-mixtape-vol-1',
  295. 'uploader_id': 'blazo',
  296. 'description': 'md5:38052a93217f3ffdc033cd5dbbce2989',
  297. },
  298. 'params': {
  299. 'playlistend': 2,
  300. },
  301. 'skip': 'Bandcamp imposes download limits.',
  302. }, {
  303. 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
  304. 'info_dict': {
  305. 'title': 'Hierophany of the Open Grave',
  306. 'uploader_id': 'nightbringer',
  307. 'id': 'hierophany-of-the-open-grave',
  308. },
  309. 'playlist_mincount': 9,
  310. }, {
  311. # with escaped quote in title
  312. 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
  313. 'info_dict': {
  314. 'title': '"Entropy" EP',
  315. 'uploader_id': 'jstrecords',
  316. 'id': 'entropy-ep',
  317. 'description': 'md5:0ff22959c943622972596062f2f366a5',
  318. },
  319. 'playlist_mincount': 3,
  320. }, {
  321. # not all tracks have songs
  322. 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
  323. 'info_dict': {
  324. 'id': 'we-are-the-plague',
  325. 'title': 'WE ARE THE PLAGUE',
  326. 'uploader_id': 'insulters',
  327. 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
  328. },
  329. 'playlist_count': 2,
  330. }]
  331. @classmethod
  332. def suitable(cls, url):
  333. return (False
  334. if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
  335. else super().suitable(url))
  336. def _real_extract(self, url):
  337. uploader_id, album_id = self._match_valid_url(url).groups()
  338. playlist_id = album_id or uploader_id
  339. webpage = self._download_webpage(url, playlist_id)
  340. tralbum = self._extract_data_attr(webpage, playlist_id)
  341. track_info = tralbum.get('trackinfo')
  342. if not track_info:
  343. raise ExtractorError('The page doesn\'t contain any tracks')
  344. # Only tracks with duration info have songs
  345. entries = [
  346. self.url_result(
  347. urljoin(url, t['title_link']), BandcampIE.ie_key(),
  348. str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
  349. for t in track_info
  350. if t.get('duration')]
  351. current = tralbum.get('current') or {}
  352. return {
  353. '_type': 'playlist',
  354. 'uploader_id': uploader_id,
  355. 'id': playlist_id,
  356. 'title': current.get('title'),
  357. 'description': current.get('about'),
  358. 'entries': entries,
  359. }
  360. class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE
  361. IE_NAME = 'Bandcamp:weekly'
  362. _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
  363. _TESTS = [{
  364. 'url': 'https://bandcamp.com/?show=224',
  365. 'md5': '61acc9a002bed93986b91168aa3ab433',
  366. 'info_dict': {
  367. 'id': '224',
  368. 'ext': 'mp3',
  369. 'title': 'BC Weekly April 4th 2017 - Magic Moments',
  370. 'description': 'md5:5d48150916e8e02d030623a48512c874',
  371. 'duration': 5829.77,
  372. 'release_date': '20170404',
  373. 'series': 'Bandcamp Weekly',
  374. 'episode': 'Magic Moments',
  375. 'episode_id': '224',
  376. },
  377. 'params': {
  378. 'format': 'mp3-128',
  379. },
  380. }, {
  381. 'url': 'https://bandcamp.com/?blah/blah@&show=228',
  382. 'only_matching': True,
  383. }]
  384. def _real_extract(self, url):
  385. show_id = self._match_id(url)
  386. webpage = self._download_webpage(url, show_id)
  387. blob = self._extract_data_attr(webpage, show_id, 'blob')
  388. show = blob['bcw_data'][show_id]
  389. formats = []
  390. for format_id, format_url in show['audio_stream'].items():
  391. if not url_or_none(format_url):
  392. continue
  393. for known_ext in KNOWN_EXTENSIONS:
  394. if known_ext in format_id:
  395. ext = known_ext
  396. break
  397. else:
  398. ext = None
  399. formats.append({
  400. 'format_id': format_id,
  401. 'url': format_url,
  402. 'ext': ext,
  403. 'vcodec': 'none',
  404. })
  405. title = show.get('audio_title') or 'Bandcamp Weekly'
  406. subtitle = show.get('subtitle')
  407. if subtitle:
  408. title += f' - {subtitle}'
  409. return {
  410. 'id': show_id,
  411. 'title': title,
  412. 'description': show.get('desc') or show.get('short_desc'),
  413. 'duration': float_or_none(show.get('audio_duration')),
  414. 'is_live': False,
  415. 'release_date': unified_strdate(show.get('published_date')),
  416. 'series': 'Bandcamp Weekly',
  417. 'episode': show.get('subtitle'),
  418. 'episode_id': show_id,
  419. 'formats': formats,
  420. }
  421. class BandcampUserIE(InfoExtractor):
  422. IE_NAME = 'Bandcamp:user'
  423. _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'
  424. _TESTS = [{
  425. # Type 1 Bandcamp user page.
  426. 'url': 'https://adrianvonziegler.bandcamp.com',
  427. 'info_dict': {
  428. 'id': 'adrianvonziegler',
  429. 'title': 'Discography of adrianvonziegler',
  430. },
  431. 'playlist_mincount': 23,
  432. }, {
  433. # Bandcamp user page with only one album
  434. 'url': 'http://dotscale.bandcamp.com',
  435. 'info_dict': {
  436. 'id': 'dotscale',
  437. 'title': 'Discography of dotscale',
  438. },
  439. 'playlist_count': 1,
  440. }, {
  441. # Type 2 Bandcamp user page.
  442. 'url': 'https://nightcallofficial.bandcamp.com',
  443. 'info_dict': {
  444. 'id': 'nightcallofficial',
  445. 'title': 'Discography of nightcallofficial',
  446. },
  447. 'playlist_count': 4,
  448. }, {
  449. 'url': 'https://steviasphere.bandcamp.com/music',
  450. 'playlist_mincount': 47,
  451. 'info_dict': {
  452. 'id': 'steviasphere',
  453. 'title': 'Discography of steviasphere',
  454. },
  455. }, {
  456. 'url': 'https://coldworldofficial.bandcamp.com/music',
  457. 'playlist_mincount': 7,
  458. 'info_dict': {
  459. 'id': 'coldworldofficial',
  460. 'title': 'Discography of coldworldofficial',
  461. },
  462. }, {
  463. 'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
  464. 'playlist_mincount': 399,
  465. 'info_dict': {
  466. 'id': 'nuclearwarnowproductions',
  467. 'title': 'Discography of nuclearwarnowproductions',
  468. },
  469. }]
  470. def _yield_items(self, webpage):
  471. yield from (
  472. re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
  473. or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
  474. yield from traverse_obj(webpage, (
  475. {find_element(id='music-grid', html=True)}, {extract_attributes},
  476. 'data-client-items', {json.loads}, ..., 'page_url', {str}))
  477. def _real_extract(self, url):
  478. uploader = self._match_id(url)
  479. webpage = self._download_webpage(url, uploader)
  480. return self.playlist_from_matches(
  481. self._yield_items(webpage), uploader, f'Discography of {uploader}',
  482. getter=urljoin(url))