commit: ecae54a98d2a8d9300142bf3d586f31e8144ccd6
parent f318882955b90bead8206ee411641e65037b1011
Author: Sergey M․ <dstftw@gmail.com>
Date: Tue, 5 Jan 2021 07:40:06 +0700
[motherless] Fix review issues and improve extraction (closes #26495, closes #27450)
Diffstat:
1 file changed, 34 insertions(+), 18 deletions(-)
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
@@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
# no keywords
'url': 'http://motherless.com/8B4BBC1',
'only_matching': True,
+ }, {
+ # see https://motherless.com/videos/recent for recent videos with
+ # uploaded date in "ago" format
+ 'url': 'https://motherless.com/3C3E2CF',
+ 'info_dict': {
+ 'id': '3C3E2CF',
+ 'ext': 'mp4',
+ 'title': 'a/ Hot Teens',
+ 'categories': list,
+ 'upload_date': '20210104',
+ 'uploader_id': 'yonbiw',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor):
or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
age_limit = self._rta_search(webpage)
view_count = str_to_int(self._html_search_regex(
- (r'>([\d,.]+)\s+Views<', # 1,234,567 Views
- r'<strong>Views</strong>\s+([^<]+)<'),
+ (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex(
- (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites
+ (r'>([\d,.]+)\s+Favorites<',
r'<strong>Favorited</strong>\s+([^<]+)<'),
webpage, 'like count', fatal=False))
- upload_date = self._html_search_regex(
- (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
- r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago
- r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
- relative = re.match(r'(\d+)([hd])$', upload_date)
- if relative:
- delta = int(relative.group(1))
- unit = relative.group(2)
- if unit == 'h':
- delta_t = datetime.timedelta(hours=delta)
- else: # unit == 'd'
- delta_t = datetime.timedelta(days=delta)
- upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d')
- else:
- upload_date = unified_strdate(upload_date)
+ upload_date = unified_strdate(self._search_regex(
+ r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+ 'upload date', default=None))
+ if not upload_date:
+ uploaded_ago = self._search_regex(
+ r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+ default=None)
+ if uploaded_ago:
+ delta = int(uploaded_ago[:-1])
+ _AGO_UNITS = {
+ 'h': 'hours',
+ 'd': 'days',
+ }
+ kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+ upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = webpage.count('class="media-comment-contents"')
uploader_id = self._html_search_regex(