logo

youtube-dl

[mirror] Download/Watch videos from video hostersgit clone https://hacktivis.me/git/mirror/youtube-dl.git
commit: 7bbd5b13d4c6cfc3e24f56413ff1a1eace8472b8
parent c91cbf60729af93c4677864aa6c8b74b576146ca
Author: dirkf <fieldhouse@gmx.net>
Date:   Wed, 12 Oct 2022 01:09:55 +0100

[Motherless] Pull from yt-dlp, etc

* use username field
* loosen regexes
* warn on page count 0 in group
* avoid reloading group page 1
Closes #29626

Diffstat:

Myoutube_dl/extractor/motherless.py33++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py @@ -126,9 +126,10 @@ class MotherlessIE(InfoExtractor): kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - comment_count = webpage.count('class="media-comment-contents"') + comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( - r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''', + (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage, default=None) @@ -171,6 +172,17 @@ class MotherlessGroupIE(InfoExtractor): 'any kind!' }, 'playlist_mincount': 0, + 'expected_warnings': [ + 'This group has no videos.', + ] + }, { + 'url': 'https://motherless.com/g/beautiful_cock', + 'info_dict': { + 'id': 'beautiful_cock', + 'title': 'Beautiful Cock', + 'description': 'Group for lovely cocks yours, mine, a friends anything human', + }, + 'playlist_mincount': 2500, }] @classmethod @@ -211,14 +223,21 @@ class MotherlessGroupIE(InfoExtractor): 'description', webpage, fatal=False) page_count = str_to_int(self._search_regex( r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', - webpage, 'page_count', default='1')) + webpage, 'page_count', default=0)) + if not page_count: + message = self._search_regex( + r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', + webpage, 'error_msg', default=None) or 'This group has no videos.' + self.report_warning(message, group_id) + page_count = 1 PAGE_SIZE = 80 def _get_page(idx): - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) + if idx > 0: + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) for entry in self._extract_entries(webpage, url): yield entry