commit: 71a1f61700789fb0d61fc6ad9681b6f0899d2f51
parent 6510a3aa971c00525969040ad654249c0c73f125
Author: Sergey M․ <dstftw@gmail.com>
Date: Wed, 23 Jan 2019 04:12:06 +0700
[pornhub] Apply scrape detection bypass for all extractors
Diffstat:
1 file changed, 24 insertions(+), 22 deletions(-)
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
@@ -24,7 +24,29 @@ from ..utils import (
)
-class PornHubIE(InfoExtractor):
+class PornHubBaseIE(InfoExtractor):
+ def _download_webpage_handle(self, *args, **kwargs):
+ def dl(*args, **kwargs):
+ return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
+
+ webpage, urlh = dl(*args, **kwargs)
+
+ if any(re.search(p, webpage) for p in (
+ r'<body\b[^>]+\bonload=["\']go\(\)',
+ r'document\.cookie\s*=\s*["\']RNKEY=',
+ r'document\.location\.reload\(true\)')):
+ url_or_request = args[0]
+ url = (url_or_request.get_full_url()
+ if isinstance(url_or_request, compat_urllib_request.Request)
+ else url_or_request)
+ phantom = PhantomJSwrapper(self, required_version='2.0')
+ phantom.get(url, html=webpage)
+ webpage, urlh = dl(*args, **kwargs)
+
+ return webpage, urlh
+
+
+class PornHubIE(PornHubBaseIE):
IE_DESC = 'PornHub and Thumbzilla'
_VALID_URL = r'''(?x)
https?://
@@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor):
'only_matching': True,
}]
- def _download_webpage_handle(self, *args, **kwargs):
- def dl(*args, **kwargs):
- return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)
-
- webpage, urlh = dl(*args, **kwargs)
-
- if any(re.search(p, webpage) for p in (
- r'<body\b[^>]+\bonload=["\']go\(\)',
- r'document\.cookie\s*=\s*["\']RNKEY=',
- r'document\.location\.reload\(true\)')):
- url_or_request = args[0]
- url = (url_or_request.get_full_url()
- if isinstance(url_or_request, compat_urllib_request.Request)
- else url_or_request)
- phantom = PhantomJSwrapper(self, required_version='2.0')
- phantom.get(url, html=webpage)
- webpage, urlh = dl(*args, **kwargs)
-
- return webpage, urlh
-
@staticmethod
def _extract_urls(webpage):
return re.findall(
@@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor):
}
-class PornHubPlaylistBaseIE(InfoExtractor):
+class PornHubPlaylistBaseIE(PornHubBaseIE):
def _extract_entries(self, webpage, host):
# Only process container div with main playlist content skipping
# drop-down menu that uses similar pattern for videos (see