logo

youtube-dl

[mirror] Download/Watch videos from video hosters

git clone https://hacktivis.me/git/mirror/youtube-dl.git

tagesschau.py (12208B)


  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. determine_ext,
  7. js_to_json,
  8. parse_iso8601,
  9. parse_filesize,
  10. )
  11. class TagesschauPlayerIE(InfoExtractor):
  12. IE_NAME = 'tagesschau:player'
  13. _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
  14. _TESTS = [{
  15. 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
  16. 'md5': '8d09548d5c15debad38bee3a4d15ca21',
  17. 'info_dict': {
  18. 'id': '179517',
  19. 'ext': 'mp4',
  20. 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
  21. 'thumbnail': r're:^https?:.*\.jpg$',
  22. 'formats': 'mincount:6',
  23. },
  24. }, {
  25. 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
  26. 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
  27. 'info_dict': {
  28. 'id': '29417',
  29. 'ext': 'mp3',
  30. 'title': 'Trabi - Bye, bye Rennpappe',
  31. 'thumbnail': r're:^https?:.*\.jpg$',
  32. 'formats': 'mincount:2',
  33. },
  34. }, {
  35. 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
  36. 'only_matching': True,
  37. }]
  38. _FORMATS = {
  39. 'xs': {'quality': 0},
  40. 's': {'width': 320, 'height': 180, 'quality': 1},
  41. 'm': {'width': 512, 'height': 288, 'quality': 2},
  42. 'l': {'width': 960, 'height': 540, 'quality': 3},
  43. 'xl': {'width': 1280, 'height': 720, 'quality': 4},
  44. 'xxl': {'quality': 5},
  45. }
  46. def _extract_via_api(self, kind, video_id):
  47. info = self._download_json(
  48. 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
  49. video_id)
  50. title = info['headline']
  51. formats = []
  52. for media in info['mediadata']:
  53. for format_id, format_url in media.items():
  54. if determine_ext(format_url) == 'm3u8':
  55. formats.extend(self._extract_m3u8_formats(
  56. format_url, video_id, 'mp4',
  57. entry_protocol='m3u8_native', m3u8_id='hls'))
  58. else:
  59. formats.append({
  60. 'url': format_url,
  61. 'format_id': format_id,
  62. 'vcodec': 'none' if kind == 'audio' else None,
  63. })
  64. self._sort_formats(formats)
  65. timestamp = parse_iso8601(info.get('date'))
  66. return {
  67. 'id': video_id,
  68. 'title': title,
  69. 'timestamp': timestamp,
  70. 'formats': formats,
  71. }
  72. def _real_extract(self, url):
  73. mobj = re.match(self._VALID_URL, url)
  74. video_id = mobj.group('id')
  75. # kind = mobj.group('kind').lower()
  76. # if kind == 'video':
  77. # return self._extract_via_api(kind, video_id)
  78. # JSON api does not provide some audio formats (e.g. ogg) thus
  79. # extracting audio via webpage
  80. webpage = self._download_webpage(url, video_id)
  81. title = self._og_search_title(webpage).strip()
  82. formats = []
  83. for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
  84. media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
  85. if not media:
  86. continue
  87. src = media.get('src')
  88. if not src:
  89. return
  90. quality = media.get('quality')
  91. kind = media.get('type', '').split('/')[0]
  92. ext = determine_ext(src)
  93. f = {
  94. 'url': src,
  95. 'format_id': '%s_%s' % (quality, ext) if quality else ext,
  96. 'ext': ext,
  97. 'vcodec': 'none' if kind == 'audio' else None,
  98. }
  99. f.update(self._FORMATS.get(quality, {}))
  100. formats.append(f)
  101. self._sort_formats(formats)
  102. thumbnail = self._og_search_thumbnail(webpage)
  103. return {
  104. 'id': video_id,
  105. 'title': title,
  106. 'thumbnail': thumbnail,
  107. 'formats': formats,
  108. }
  109. class TagesschauIE(InfoExtractor):
  110. _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
  111. _TESTS = [{
  112. 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
  113. 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
  114. 'info_dict': {
  115. 'id': 'video-102143',
  116. 'ext': 'mp4',
  117. 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
  118. 'description': '18.07.2015 20:10 Uhr',
  119. 'thumbnail': r're:^https?:.*\.jpg$',
  120. },
  121. }, {
  122. 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
  123. 'md5': '3c54c1f6243d279b706bde660ceec633',
  124. 'info_dict': {
  125. 'id': 'ts-5727',
  126. 'ext': 'mp4',
  127. 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
  128. 'description': 'md5:695c01bfd98b7e313c501386327aea59',
  129. 'thumbnail': r're:^https?:.*\.jpg$',
  130. },
  131. }, {
  132. # exclusive audio
  133. 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
  134. 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
  135. 'info_dict': {
  136. 'id': 'audio-29417',
  137. 'ext': 'mp3',
  138. 'title': 'Trabi - Bye, bye Rennpappe',
  139. 'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
  140. 'thumbnail': r're:^https?:.*\.jpg$',
  141. },
  142. }, {
  143. # audio in article
  144. 'url': 'http://www.tagesschau.de/inland/bnd-303.html',
  145. 'md5': 'e0916c623e85fc1d2b26b78f299d3958',
  146. 'info_dict': {
  147. 'id': 'bnd-303',
  148. 'ext': 'mp3',
  149. 'title': 'Viele Baustellen für neuen BND-Chef',
  150. 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
  151. 'thumbnail': r're:^https?:.*\.jpg$',
  152. },
  153. }, {
  154. 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
  155. 'info_dict': {
  156. 'id': 'afd-parteitag-135',
  157. 'title': 'Möchtegern-Underdog mit Machtanspruch',
  158. },
  159. 'playlist_count': 2,
  160. }, {
  161. 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
  162. 'only_matching': True,
  163. }, {
  164. 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
  165. 'only_matching': True,
  166. }, {
  167. 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
  168. 'only_matching': True,
  169. }, {
  170. 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
  171. 'only_matching': True,
  172. }, {
  173. 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
  174. 'only_matching': True,
  175. }, {
  176. 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
  177. 'only_matching': True,
  178. }, {
  179. 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
  180. 'only_matching': True,
  181. }, {
  182. 'url': 'http://www.tagesschau.de/100sekunden/index.html',
  183. 'only_matching': True,
  184. }, {
  185. # playlist article with collapsing sections
  186. 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
  187. 'only_matching': True,
  188. }]
  189. @classmethod
  190. def suitable(cls, url):
  191. return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
  192. def _extract_formats(self, download_text, media_kind):
  193. links = re.finditer(
  194. r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
  195. download_text)
  196. formats = []
  197. for l in links:
  198. link_url = l.group('url')
  199. if not link_url:
  200. continue
  201. format_id = self._search_regex(
  202. r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
  203. default=determine_ext(link_url))
  204. format = {
  205. 'format_id': format_id,
  206. 'url': l.group('url'),
  207. 'format_name': l.group('name'),
  208. }
  209. title = l.group('title')
  210. if title:
  211. if media_kind.lower() == 'video':
  212. m = re.match(
  213. r'''(?x)
  214. Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
  215. (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
  216. (?P<vbr>[0-9]+)kbps&\#10;
  217. Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
  218. Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
  219. title)
  220. if m:
  221. format.update({
  222. 'format_note': m.group('audio_desc'),
  223. 'vcodec': m.group('vcodec'),
  224. 'width': int(m.group('width')),
  225. 'height': int(m.group('height')),
  226. 'abr': int(m.group('abr')),
  227. 'vbr': int(m.group('vbr')),
  228. 'filesize_approx': parse_filesize(m.group('filesize_approx')),
  229. })
  230. else:
  231. m = re.match(
  232. r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
  233. title)
  234. if m:
  235. format.update({
  236. 'format_note': '%s, %s' % (m.group('format'), m.group('note')),
  237. 'vcodec': 'none',
  238. 'abr': int(m.group('abr')),
  239. })
  240. formats.append(format)
  241. self._sort_formats(formats)
  242. return formats
  243. def _real_extract(self, url):
  244. mobj = re.match(self._VALID_URL, url)
  245. video_id = mobj.group('id') or mobj.group('path')
  246. display_id = video_id.lstrip('-')
  247. webpage = self._download_webpage(url, display_id)
  248. title = self._html_search_regex(
  249. r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
  250. webpage, 'title', default=None) or self._og_search_title(webpage)
  251. DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
  252. webpage_type = self._og_search_property('type', webpage, default=None)
  253. if webpage_type == 'website': # Article
  254. entries = []
  255. for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
  256. r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
  257. webpage), 1):
  258. entries.append({
  259. 'id': '%s-%d' % (display_id, num),
  260. 'title': '%s' % entry_title,
  261. 'formats': self._extract_formats(download_text, media_kind),
  262. })
  263. if len(entries) > 1:
  264. return self.playlist_result(entries, display_id, title)
  265. formats = entries[0]['formats']
  266. else: # Assume single video
  267. download_text = self._search_regex(
  268. DOWNLOAD_REGEX, webpage, 'download links', group='links')
  269. media_kind = self._search_regex(
  270. DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
  271. formats = self._extract_formats(download_text, media_kind)
  272. thumbnail = self._og_search_thumbnail(webpage)
  273. description = self._html_search_regex(
  274. r'(?s)<p class="teasertext">(.*?)</p>',
  275. webpage, 'description', default=None)
  276. self._sort_formats(formats)
  277. return {
  278. 'id': display_id,
  279. 'title': title,
  280. 'thumbnail': thumbnail,
  281. 'formats': formats,
  282. 'description': description,
  283. }