logo

etc_portage

Unnamed repository; edit this file 'description' to name the repository. git clone https://hacktivis.me/git/etc_portage.git

youtube-dl-2020.09.20-bandcamp_url_quoted_data.patch (5269B)


  1. From f2ecf2b7e6e79d2003c8ef3ddf018430f62eb19f Mon Sep 17 00:00:00 2001
  2. From: Leonardo Taccari <iamleot@gmail.com>
  3. Date: Sun, 11 Oct 2020 14:47:25 +0200
  4. Subject: [PATCH] [bandcamp] Update to handle HTML quoted data
  5. Adjust the extractor to handle JSON data-* attributes by introducing a
  6. _json_data_extract() method to handle them (and existing
  7. patterns in the code).
  8. Based on Gilles Pietri #26684.
  9. ---
  10. youtube_dl/extractor/bandcamp.py | 48 ++++++++++++++------------------
  11. 1 file changed, 21 insertions(+), 27 deletions(-)
  12. diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
  13. index f14b407dc82c..7357b794b275 100644
  14. --- a/youtube_dl/extractor/bandcamp.py
  15. +++ b/youtube_dl/extractor/bandcamp.py
  16. @@ -79,6 +79,14 @@ class BandcampIE(InfoExtractor):
  17. },
  18. }]
  19. + def _json_data_extract(self, data_key, video_id, webpage):
  20. + return self._parse_json(
  21. + self._search_regex(
  22. + r'data-' + data_key + r'=(["\'])(?P<data>{.+?})\1',
  23. + webpage, 'JSON data {data_key}'.format(data_key=data_key),
  24. + group='data', default=None),
  25. + video_id, transform_source=unescapeHTML)
  26. +
  27. def _real_extract(self, url):
  28. mobj = re.match(self._VALID_URL, url)
  29. title = mobj.group('title')
  30. @@ -91,10 +99,9 @@ def _real_extract(self, url):
  31. duration = None
  32. formats = []
  33. - track_info = self._parse_json(
  34. - self._search_regex(
  35. - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
  36. - webpage, 'track info', default='{}'), title)
  37. + tralbum_data = self._json_data_extract('tralbum', title, webpage)
  38. + embed_data = self._json_data_extract('embed', title, webpage)
  39. + track_info = tralbum_data['trackinfo'][0]
  40. if track_info:
  41. file_ = track_info.get('file')
  42. if isinstance(file_, dict):
  43. @@ -116,9 +123,9 @@ def _real_extract(self, url):
  44. duration = float_or_none(track_info.get('duration'))
  45. def extract(key):
  46. - return self._search_regex(
  47. - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
  48. - webpage, key, default=None, group='value')
  49. + for data in tralbum_data['current'], embed_data, tralbum_data:
  50. + if key in data and data[key]:
  51. + return data[key]
  52. artist = extract('artist')
  53. album = extract('album_title')
  54. @@ -126,9 +133,7 @@ def extract(key):
  55. extract('publish_date') or extract('album_publish_date'))
  56. release_date = unified_strdate(extract('album_release_date'))
  57. - download_link = self._search_regex(
  58. - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
  59. - 'download link', default=None, group='url')
  60. + download_link = tralbum_data['freeDownloadPage']
  61. if download_link:
  62. track_id = self._search_regex(
  63. r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
  64. @@ -137,11 +142,7 @@ def extract(key):
  65. download_webpage = self._download_webpage(
  66. download_link, track_id, 'Downloading free downloads page')
  67. - blob = self._parse_json(
  68. - self._search_regex(
  69. - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
  70. - 'blob', group='blob'),
  71. - track_id, transform_source=unescapeHTML)
  72. + blob = self._json_data_extract('blob', track_id, download_webpage)
  73. info = try_get(
  74. blob, (lambda x: x['digital_items'][0],
  75. @@ -218,7 +219,7 @@ def extract(key):
  76. }
  77. -class BandcampAlbumIE(InfoExtractor):
  78. +class BandcampAlbumIE(BandcampIE):
  79. IE_NAME = 'Bandcamp:album'
  80. _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
  81. @@ -314,11 +315,8 @@ def _real_extract(self, url):
  82. for elem_content, t_path in track_elements
  83. if self._html_search_meta('duration', elem_content, default=None)]
  84. - title = self._html_search_regex(
  85. - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
  86. - webpage, 'title', fatal=False)
  87. - if title:
  88. - title = title.replace(r'\"', '"')
  89. + embed_data = self._json_data_extract('embed', album_id, webpage)
  90. + title = embed_data.get('album_title')
  91. return {
  92. '_type': 'playlist',
  93. 'uploader_id': uploader_id,
  94. @@ -328,7 +326,7 @@ def _real_extract(self, url):
  95. }
  96. -class BandcampWeeklyIE(InfoExtractor):
  97. +class BandcampWeeklyIE(BandcampIE):
  98. IE_NAME = 'Bandcamp:weekly'
  99. _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
  100. _TESTS = [{
  101. @@ -355,11 +353,7 @@ def _real_extract(self, url):
  102. video_id = self._match_id(url)
  103. webpage = self._download_webpage(url, video_id)
  104. - blob = self._parse_json(
  105. - self._search_regex(
  106. - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
  107. - 'blob', group='blob'),
  108. - video_id, transform_source=unescapeHTML)
  109. + blob = self._json_data_extract('blob', video_id, webpage)
  110. show = blob['bcw_show']