commit: e452345fc5cee5e79d2cad6be575da563987a4ff
parent bf45295c5387d0d90b97ca34d377cdaa07c71bcb
Author: Remita Amine <remitamine@gmail.com>
Date: Mon, 4 Nov 2019 15:43:52 +0100
[jamendo] improve extraction
- fix album extraction (closes #18564)
- improve metadata extraction (closes #18565, closes #21379)
Diffstat:
1 file changed, 103 insertions(+), 67 deletions(-)
diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py
@@ -1,38 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
+import hashlib
+import random
-from ..compat import compat_urlparse
+from ..compat import compat_str
from .common import InfoExtractor
-from ..utils import parse_duration
-
-
-class JamendoBaseIE(InfoExtractor):
- def _extract_meta(self, webpage, fatal=True):
- title = self._og_search_title(
- webpage, default=None) or self._search_regex(
- r'<title>([^<]+)', webpage,
- 'title', default=None)
- if title:
- title = self._search_regex(
- r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None)
- if not title:
- title = self._html_search_meta(
- 'name', webpage, 'title', fatal=fatal)
- mobj = re.search(r'(.+) - (.+)', title or '')
- artist, second = mobj.groups() if mobj else [None] * 2
- return title, artist, second
-
-
-class JamendoIE(JamendoBaseIE):
+from ..utils import (
+ clean_html,
+ int_or_none,
+ try_get,
+)
+
+
+class JamendoIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
licensing\.jamendo\.com/[^/]+|
(?:www\.)?jamendo\.com
)
- /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)
+ /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?
'''
_TESTS = [{
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
@@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE):
'artist': 'Maya Filipič',
'track': 'Stories from Emona I',
'duration': 210,
- 'thumbnail': r're:^https?://.*\.jpg'
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1217438117,
+ 'upload_date': '20080730',
}
}, {
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
@@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE):
}]
def _real_extract(self, url):
- mobj = self._VALID_URL_RE.match(url)
- track_id = mobj.group('id')
- display_id = mobj.group('display_id')
-
- webpage = self._download_webpage(
- 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id),
- display_id)
-
- title, artist, track = self._extract_meta(webpage)
+ track_id, display_id = self._VALID_URL_RE.match(url).groups()
+ webpage = self._download_webpage(url, track_id)
+ models = self._parse_json(self._html_search_regex(
+ r"data-bundled-models='([^']+)",
+ webpage, 'bundled models'), track_id)
+ track = models['track']['models'][0]
+ title = track_name = track['name']
+ get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+ artist = get_model('artist')
+ artist_name = artist.get('name')
+ if artist_name:
+ title = '%s - %s' % (artist_name, title)
+ album = get_model('album')
formats = [{
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE):
))]
self._sort_formats(formats)
- thumbnail = self._html_search_meta(
- 'image', webpage, 'thumbnail', fatal=False)
- duration = parse_duration(self._search_regex(
- r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']',
- webpage, 'duration', fatal=False))
+ urls = []
+ thumbnails = []
+ for _, covers in track.get('cover', {}).items():
+ for cover_id, cover_url in covers.items():
+ if not cover_url or cover_url in urls:
+ continue
+ urls.append(cover_url)
+ size = int_or_none(cover_id.lstrip('size'))
+ thumbnails.append({
+ 'id': cover_id,
+ 'url': cover_url,
+ 'width': size,
+ 'height': size,
+ })
+
+ tags = []
+ for tag in track.get('tags', []):
+ tag_name = tag.get('name')
+ if not tag_name:
+ continue
+ tags.append(tag_name)
+
+ stats = track.get('stats') or {}
return {
'id': track_id,
'display_id': display_id,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'title': title,
- 'duration': duration,
- 'artist': artist,
- 'track': track,
- 'formats': formats
+ 'description': track.get('description'),
+ 'duration': int_or_none(track.get('duration')),
+ 'artist': artist_name,
+ 'track': track_name,
+ 'album': album.get('name'),
+ 'formats': formats,
+ 'license': '-'.join(track.get('licenseCC', [])) or None,
+ 'timestamp': int_or_none(track.get('dateCreated')),
+ 'view_count': int_or_none(stats.get('listenedAll')),
+ 'like_count': int_or_none(stats.get('favorited')),
+ 'average_rating': int_or_none(stats.get('averageNote')),
+ 'tags': tags,
}
-class JamendoAlbumIE(JamendoBaseIE):
- _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)'
+class JamendoAlbumIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
'info_dict': {
'id': '121486',
- 'title': 'Shearer - Duck On Cover'
+ 'title': 'Duck On Cover',
+ 'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',
},
'playlist': [{
'md5': 'e1a2fcb42bda30dfac990212924149a8',
@@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Warmachine',
'artist': 'Shearer',
'track': 'Warmachine',
+ 'timestamp': 1368089771,
+ 'upload_date': '20130509',
}
}, {
'md5': '1f358d7b2f98edfe90fd55dac0799d50',
@@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Without Your Ghost',
'artist': 'Shearer',
'track': 'Without Your Ghost',
+ 'timestamp': 1368089771,
+ 'upload_date': '20130509',
}
}],
'params': {
@@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE):
}
}
+ def _call_api(self, resource, resource_id):
+ path = '/api/%ss' % resource
+ rand = compat_str(random.random())
+ return self._download_json(
+ 'https://www.jamendo.com' + path, resource_id, query={
+ 'id[]': resource_id,
+ }, headers={
+ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+ })[0]
+
def _real_extract(self, url):
- mobj = self._VALID_URL_RE.match(url)
- album_id = mobj.group('id')
-
- webpage = self._download_webpage(url, mobj.group('display_id'))
-
- title, artist, album = self._extract_meta(webpage, fatal=False)
-
- entries = [{
- '_type': 'url_transparent',
- 'url': compat_urlparse.urljoin(url, m.group('path')),
- 'ie_key': JamendoIE.ie_key(),
- 'id': self._search_regex(
- r'/track/(\d+)', m.group('path'), 'track id', default=None),
- 'artist': artist,
- 'album': album,
- } for m in re.finditer(
- r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link',
- webpage)]
-
- return self.playlist_result(entries, album_id, title)
+ album_id = self._match_id(url)
+ album = self._call_api('album', album_id)
+ album_name = album.get('name')
+
+ entries = []
+ for track in album.get('tracks', []):
+ track_id = track.get('id')
+ if not track_id:
+ continue
+ track_id = compat_str(track_id)
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': 'https://www.jamendo.com/track/' + track_id,
+ 'ie_key': JamendoIE.ie_key(),
+ 'id': track_id,
+ 'album': album_name,
+ })
+
+ return self.playlist_result(
+ entries, album_id, album_name,
+ clean_html(try_get(album, lambda x: x['description']['en'], compat_str)))