commit: b2ba24bb026904f3503db71f65d2b1627f08edf1
parent a190b559640ce1b5fe67e5a4843dc58328503f3c
Author: dirkf <fieldhouse@gmx.net>
Date: Wed, 19 Jul 2023 14:14:50 +0100
[InfoExtractor] Add `_match_valid_url()` class method and refactor
* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
Diffstat:
3 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
@@ -4,6 +4,7 @@ from inspect import getsource
import io
import os
from os.path import dirname as dirn
+import re
import sys
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
with open('devscripts/lazy_load_template.py', 'rt') as f:
module_template = f.read()
+
+def get_source(m):
+ return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
+
+
module_contents = [
- module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
+ module_template,
+ get_source(InfoExtractor.suitable),
+ get_source(InfoExtractor._match_valid_url) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780)
- 'from youtube_dl.utils import parse_qs\n',
+ 'from youtube_dl.utils import parse_qs, variadic\n',
]
ie_template = '''
@@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
valid_url=valid_url,
module=ie.__module__)
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
- s += '\n' + getsource(ie.suitable)
+ s += '\n' + get_source(ie.suitable)
if hasattr(ie, '_make_valid_url'):
# search extractors
s += make_valid_template.format(valid_url=ie._make_valid_url())
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
@@ -83,6 +83,7 @@ from ..utils import (
urljoin,
url_basename,
url_or_none,
+ variadic,
xpath_element,
xpath_text,
xpath_with_ns,
@@ -371,9 +372,22 @@ class InfoExtractor(object):
title, description etc.
- Subclasses of this one should re-define the _real_initialize() and
- _real_extract() methods and define a _VALID_URL regexp.
- Probably, they should also be added to the list of extractors.
+ A subclass of InfoExtractor must be defined to handle each specific site (or
+ several sites). Such a concrete subclass should be added to the list of
+ extractors. It should also:
+ * define its _VALID_URL attribute as a regexp, or a Sequence of alternative
+ regexps (this is optional in the case described further below)
+ * re-define the _real_extract() method
+ * optionally re-define the _real_initialize() method.
+
+ An extractor subclass may also override suitable() if necessary, but the
+ function signature must be preserved and the function must import everything
+ it needs (except other extractors), so that lazy_extractors works correctly.
+ If the subclass's suitable() and _real_extract() functions avoid using
+ _VALID_URL, the subclass need not set that class attribute.
+
+ An abstract subclass of InfoExtractor may be used to simplify implementation
+ within an extractor module; it should not be added to the list of extractors.
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
@@ -409,21 +423,32 @@ class InfoExtractor(object):
self.set_downloader(downloader)
@classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
-
+ def __match_valid_url(cls, url):
# This does not use has/getattr intentionally - we want to know whether
- # we have cached the regexp for *this* class, whereas getattr would also
- # match the superclass
+ # we have cached the regexp for cls, whereas getattr would also
+ # match its superclass
if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url) is not None
+ # _VALID_URL can now be a list/tuple of patterns
+ cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+ # the explicit loop below is 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in Python 2.7
+ for p in cls._VALID_URL_RE:
+ p = p.match(url)
+ if p:
+ return p
+
+ # The public alias can safely be overridden, as in some back-ports
+ _match_valid_url = __match_valid_url
+
+ @classmethod
+ def suitable(cls, url):
+ """Receives a URL and returns True if suitable for this IE."""
+ # This function must import everything it needs (except other extractors),
+ # so that lazy_extractors works correctly
+ return cls.__match_valid_url(url) is not None
@classmethod
def _match_id(cls, url):
- if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- m = cls._VALID_URL_RE.match(url)
+ m = cls.__match_valid_url(url)
assert m
return compat_str(m.group('id'))
diff --git a/youtube_dl/extractor/globalplayer.py b/youtube_dl/extractor/globalplayer.py
@@ -18,12 +18,6 @@ from ..utils import (
class GlobalPlayerBaseIE(InfoExtractor):
- import re
-
- @classmethod
- def _match_valid_url(cls, url):
- return cls.re.match(cls._VALID_URL, url)
-
def _get_page_props(self, url, video_id):
webpage = self._download_webpage(url, video_id)
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']