
youtube-dl

[mirror] Download/Watch videos from video hosters

git clone https://hacktivis.me/git/mirror/youtube-dl.git

googledrive.py (10892B)


from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import (
    determine_ext,
    ExtractorError,
    get_element_by_class,
    int_or_none,
    lowercase_escape,
    try_get,
    update_url_query,
)


class GoogleDriveIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
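    # Stream format IDs (as they appear in fmt_stream_map/fmt_list) mapped to
    # their container extensions.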
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    _caption_formats_ext = []
    _captions_xml = None

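    # Finds a Google Drive player embedded via <iframe> in an arbitrary webpage
    # and returns the canonical file URL for it.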
    @staticmethod
    def _extract_url(webpage):
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

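    # Downloads the caption listing XML once and caches it, collecting the
    # available caption format codes along the way.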
    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        if self._captions_xml:
            return
        self._captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        if self._captions_xml:
            for f in self._captions_xml.findall('format'):
                if f.attrib.get('fmt_code') and not f.attrib.get('default'):
                    self._caption_formats_ext.append(f.attrib['fmt_code'])

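    # Builds a {lang_code: [{'url': ..., 'ext': ...}, ...]} mapping for either
    # manual subtitles or automatic captions (translated from origin_lang_code).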
    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        if not subtitles_id or not caption_type:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if not self._captions_xml:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if not self._captions_xml:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

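    # Main extraction: read stream data from get_video_info, optionally add the
    # original 'source' file from the direct-download endpoint, then attach
    # thumbnail, duration and caption tracks.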
    def _real_extract(self, url):
        video_id = self._match_id(url)
        video_info = compat_parse_qs(self._download_webpage(
            'https://drive.google.com/get_video_info',
            video_id, query={'docid': video_id}))

        def get_value(key):
            return try_get(video_info, lambda x: x[key][0])

        reason = get_value('reason')
        title = get_value('title')
        if not title and reason:
            raise ExtractorError(reason, expected=True)

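        # Streaming formats: fmt_stream_map holds 'format_id|url' pairs, while
        # fmt_list carries 'format_id/WIDTHxHEIGHT/...' entries with the
        # corresponding resolutions.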
        formats = []
        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
        fmt_list = (get_value('fmt_list') or '').split(',')
        if fmt_stream_map and fmt_list:
            resolutions = {}
            for fmt in fmt_list:
                mobj = re.search(
                    r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
                if mobj:
                    resolutions[mobj.group('format_id')] = (
                        int(mobj.group('width')), int(mobj.group('height')))

            for fmt_stream in fmt_stream_map:
                fmt_stream_split = fmt_stream.split('|')
                if len(fmt_stream_split) < 2:
                    continue
                format_id, format_url = fmt_stream_split[:2]
                f = {
                    'url': lowercase_escape(format_url),
                    'format_id': format_id,
                    'ext': self._FORMATS_EXT[format_id],
                }
                resolution = resolutions.get(format_id)
                if resolution:
                    f.update({
                        'width': resolution[0],
                        'height': resolution[1],
                    })
                formats.append(f)

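        # Besides the transcoded streams, try to fetch the originally uploaded
        # file via the direct-download endpoint; when Drive answers with a
        # confirmation page instead of the file, extract the confirm token and
        # retry.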
        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })

        def request_source_file(source_url, kind):
            return self._request_webpage(
                source_url, video_id, note='Requesting %s file' % kind,
                errnote='Unable to request %s file' % kind, fatal=False)
        urlh = request_source_file(source_url, 'source')
        if urlh:
            def add_source_format(urlh):
                formats.append({
                    # Use redirect URLs as download URLs in order to calculate
                    # correct cookies in _calc_cookies.
                    # Using original URLs may result in redirect loop due to
                    # google.com's cookies mistakenly used for googleusercontent.com
                    # redirect URLs (see #23919).
                    'url': urlh.geturl(),
                    'ext': determine_ext(title, 'mp4').lower(),
                    'format_id': 'source',
                    'quality': 1,
                })
            if urlh.headers.get('Content-Disposition'):
                add_source_format(urlh)
            else:
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirm = self._search_regex(
                        r'confirm=([^&"\']+)', confirmation_webpage,
                        'confirmation code', default=None)
                    if confirm:
                        confirmed_source_url = update_url_query(source_url, {
                            'confirm': confirm,
                        })
                        urlh = request_source_file(
                            confirmed_source_url, 'confirmed source')
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)
                    else:
                        self.report_warning(
                            get_element_by_class('uc-error-subcaption', confirmation_webpage)
                            or get_element_by_class('uc-error-caption', confirmation_webpage)
                            or 'unable to extract confirmation code')

        if not formats and reason:
            raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = get_value('hl')
        subtitles_id = None
        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }