logo

oasis-root

Compiled tree of Oasis Linux, based on the author's own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

japandiet.py (10390B)


  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. ExtractorError,
  5. clean_html,
  6. int_or_none,
  7. join_nonempty,
  8. parse_qs,
  9. smuggle_url,
  10. traverse_obj,
  11. try_call,
  12. unsmuggle_url,
  13. )
  14. def _parse_japanese_date(text):
  15. if not text:
  16. return None
  17. ERA_TABLE = {
  18. '明治': 1868,
  19. '大正': 1912,
  20. '昭和': 1926,
  21. '平成': 1989,
  22. '令和': 2019,
  23. }
  24. ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
  25. mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
  26. if not mobj:
  27. return None
  28. era, year, month, day = mobj.groups()
  29. year, month, day = map(int, (year, month, day))
  30. if era:
  31. # example input: 令和5年3月34日
  32. # even though each era have their end, don't check here
  33. year += ERA_TABLE[era]
  34. return '%04d%02d%02d' % (year, month, day)
  35. def _parse_japanese_duration(text):
  36. mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
  37. if not mobj:
  38. return
  39. days, hours, mins, secs = (int_or_none(x, default=0) for x in mobj.groups())
  40. return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
  41. class ShugiinItvBaseIE(InfoExtractor):
  42. _INDEX_ROOMS = None
  43. @classmethod
  44. def _find_rooms(cls, webpage):
  45. return [{
  46. '_type': 'url',
  47. 'id': x.group(1),
  48. 'title': clean_html(x.group(2)).strip(),
  49. 'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}),
  50. 'ie_key': ShugiinItvLiveIE.ie_key(),
  51. } for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)]
  52. def _fetch_rooms(self):
  53. if not self._INDEX_ROOMS:
  54. webpage = self._download_webpage(
  55. 'https://www.shugiintv.go.jp/jp/index.php', None,
  56. encoding='euc-jp', note='Downloading proceedings info')
  57. ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
  58. return self._INDEX_ROOMS
  59. class ShugiinItvLiveIE(ShugiinItvBaseIE):
  60. _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
  61. IE_DESC = '衆議院インターネット審議中継'
  62. _TESTS = [{
  63. 'url': 'https://www.shugiintv.go.jp/jp/index.php',
  64. 'info_dict': {
  65. '_type': 'playlist',
  66. 'title': 'All proceedings for today',
  67. },
  68. # expect at least one proceedings is running
  69. 'playlist_mincount': 1,
  70. }]
  71. @classmethod
  72. def suitable(cls, url):
  73. return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE))
  74. def _real_extract(self, url):
  75. self.to_screen(
  76. 'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
  77. return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today')
  78. class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
  79. _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
  80. IE_DESC = '衆議院インターネット審議中継 (中継)'
  81. _TESTS = [{
  82. 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
  83. 'info_dict': {
  84. 'id': 'room01',
  85. 'title': '内閣委員会',
  86. },
  87. 'skip': 'this runs for a time and not every day',
  88. }, {
  89. 'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
  90. 'info_dict': {
  91. 'id': 'room11',
  92. 'title': '外務委員会',
  93. },
  94. 'skip': 'this runs for a time and not every day',
  95. }]
  96. def _real_extract(self, url):
  97. url, smug = unsmuggle_url(url, default={})
  98. if smug.get('g'):
  99. room_id, title = smug['g']
  100. else:
  101. room_id = self._match_id(url)
  102. title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)
  103. formats, subtitles = self._extract_m3u8_formats_and_subtitles(
  104. f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
  105. room_id, ext='mp4')
  106. return {
  107. 'id': room_id,
  108. 'title': title,
  109. 'formats': formats,
  110. 'subtitles': subtitles,
  111. 'is_live': True,
  112. }
  113. class ShugiinItvVodIE(ShugiinItvBaseIE):
  114. _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
  115. IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
  116. _TESTS = [{
  117. 'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
  118. 'info_dict': {
  119. 'id': '53846',
  120. 'title': 'ウクライナ大統領国会演説(オンライン)',
  121. 'release_date': '20220323',
  122. 'chapters': 'count:4',
  123. },
  124. }, {
  125. 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
  126. 'only_matching': True,
  127. }]
  128. def _real_extract(self, url):
  129. video_id = self._match_id(url)
  130. webpage = self._download_webpage(
  131. f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
  132. encoding='euc-jp')
  133. m3u8_url = self._search_regex(
  134. r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
  135. m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
  136. formats, subtitles = self._extract_m3u8_formats_and_subtitles(
  137. m3u8_url, video_id, ext='mp4')
  138. title = self._html_search_regex(
  139. (r'<td\s+align="left">(.+)\s*\(\d+分\)',
  140. r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)
  141. release_date = _parse_japanese_date(self._html_search_regex(
  142. r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
  143. webpage, 'title', fatal=False))
  144. chapters = []
  145. for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
  146. chapters.append({
  147. 'title': clean_html(chp.group(2)).strip(),
  148. 'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
  149. })
  150. # NOTE: there are blanks at the first and the end of the videos,
  151. # so getting/providing the video duration is not possible
  152. # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
  153. last_tr = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)[-1]
  154. if last_tr and chapters:
  155. last_td = re.findall(r'<TD.+?</TD>', last_tr)[-1]
  156. if last_td:
  157. chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td))
  158. return {
  159. 'id': video_id,
  160. 'title': title,
  161. 'release_date': release_date,
  162. 'chapters': chapters,
  163. 'formats': formats,
  164. 'subtitles': subtitles,
  165. }
  166. class SangiinInstructionIE(InfoExtractor):
  167. _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
  168. IE_DESC = False # this shouldn't be listed as a supported site
  169. def _real_extract(self, url):
  170. raise ExtractorError(
  171. 'Copy the link from the button below the video description/player '
  172. 'and use that link to download. If there is no button in the frame, '
  173. 'get the URL of the frame showing the video.', expected=True)
  174. class SangiinIE(InfoExtractor):
  175. _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
  176. IE_DESC = '参議院インターネット審議中継 (archive)'
  177. _TESTS = [{
  178. 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
  179. 'info_dict': {
  180. 'id': '7052',
  181. 'title': '2022年10月7日 本会議',
  182. 'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
  183. 'upload_date': '20221007',
  184. 'ext': 'mp4',
  185. },
  186. }, {
  187. 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
  188. 'info_dict': {
  189. 'id': '7037',
  190. 'title': '2022年10月3日 開会式',
  191. 'upload_date': '20221003',
  192. 'ext': 'mp4',
  193. },
  194. }, {
  195. 'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
  196. 'info_dict': {
  197. 'id': '7076',
  198. 'title': '2022年10月27日 法務委員会',
  199. 'upload_date': '20221027',
  200. 'ext': 'mp4',
  201. 'is_live': True,
  202. },
  203. 'skip': 'this live is turned into archive after it ends',
  204. }]
  205. def _real_extract(self, url):
  206. video_id = self._match_id(url)
  207. webpage = self._download_webpage(url, video_id)
  208. date = self._html_search_regex(
  209. r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
  210. 'date', fatal=False)
  211. upload_date = _parse_japanese_date(date)
  212. title = self._html_search_regex(
  213. r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
  214. 'date', fatal=False)
  215. # some videos don't have the elements, so assume it's missing
  216. description = self._html_search_regex(
  217. r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
  218. 'description', default=None)
  219. # this row appears only when it's livestream
  220. is_live = bool(self._html_search_regex(
  221. r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
  222. 'is_live', default=None))
  223. m3u8_url = self._search_regex(
  224. r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
  225. 'm3u8 url', group=2)
  226. formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
  227. return {
  228. 'id': video_id,
  229. 'title': join_nonempty(date, title, delim=' '),
  230. 'description': description,
  231. 'upload_date': upload_date,
  232. 'formats': formats,
  233. 'subtitles': subs,
  234. 'is_live': is_live,
  235. }