From bf6f1048b2662dd316321fa54fc9af319bc45e3d Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Sun, 27 Sep 2020 18:30:01 +0900 Subject: [PATCH] [tver] Significantly improved extraction logic and added 'creator' and 'uploader' keys to extraction information --- youtube_dl/extractor/tver.py | 186 +++++++++++++++++------------------ 1 file changed, 88 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 6f51ead85..52b714b8b 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -4,63 +4,31 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils import ( + js_to_json, +) -class TVerIE(BrightcoveNewIE): +class TVerIE(InfoExtractor): - _TESTS = [ - { - 'url': 'https://tver.jp/feature/f0057485', # 'feature' - 'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': 'f0057485', # TVer ID - 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID - 'ext': 'mp4', - 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', - 'description': '大和田(香川照之)の不正を糾弾し、子会社へ出向を命じられた半沢直樹(堺雅人)は、東京セントラル証券営業企画部長に。ある日1500億円超の買収案件が舞い込むが…。', - 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4031511847001/37b5f176-3989-48d9-81d1-4688e80c5531/main/1920x1080/34m10s16ms/match/image.jpg', - 'duration': 4100.032, - 'timestamp': 1600308623, - 'upload_date': '20200917', - 'uploader_id': '4031511847001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + _TEST = { + 'url': 'https://tver.jp/feature/f0057485', # In addition to 'feature', there are also categories such as 'corner' and 'episode'. + 'md5': '4ae1bc00e6d55af8f7e2b2c17029f1a3', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0057485', # TVer ID + 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID + 'ext': 'mp4', + 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', + 'description': 'md5:92ce839312ee1e9b162de73fa08b6374', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 4100.032, + 'timestamp': 1600308623, + 'upload_date': '20200917', + 'uploader_id': '4031511847001', }, - { - 'url': 'https://tver.jp/corner/f0056997', # 'corner' - 'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': 'f0056997', # TVer ID - 'display_id': 'ref:kanokari_10', # Brightcove ID - 'ext': 'mp4', - 'title': '彼女、お借りします 第10話「友達の彼女」-トモカノ-', - 'description': 'バイトの初任給を何に使おうか考える和也だったが、ふと栗林のことが脳裏をよぎる。最近栗林の様子がおかしいと、木部から話を聞いていたのだ。ボーッとしていたり、女性不信のつぶやきをしているという。和也は意を決して、栗林を呼び出すことに。翌日、栗林が和也を待っていると──「駿君、だよね?」。待ち合わせ場所にやって来たのは、千鶴だった……!', - 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/5102072605001/900216cc-2e97-4c19-93bb-1a531de358d6/main/1920x1080/12m18s37ms/match/image.jpg', - 'duration': 1476.075, - 'timestamp': 1599554409, - 'upload_date': '20200908', - 'uploader_id': '5102072605001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', - }, - { - 'url': 'https://tver.jp/episode/76799350', # 'episode' - 'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': '76799350', # TVer ID - 'display_id': '2366_2365_4533', # Brightcove ID - 'ext': 'mp4', - 'title': '港時間 #49 神奈川県/リビエラシーボニアマリーナ 9月18日(金)放送分', - 'description': '【毎週金曜 よる12時15分から放送】\n\n日本のヨット文化 を育んできた三浦半島の西海岸、小網代湾にあるリビエラシーボニアマリーナ。昨年から始まったSailGPの日本チームを率いるヨット界のレジェンドに会いました。', - 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4394098883001/904361ca-40d3-4028-8478-8916b9a0ff49/main/1920x1080/58s80ms/match/image.jpg', - 'duration': 116.16, - 'timestamp': 1600052421, - 'upload_date': '20200914', - 'uploader_id': '4394098883001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', - }, - ] + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + } IE_NAME = 'TVer' IE_DESC = 'TVer' @@ -68,62 +36,84 @@ class TVerIE(BrightcoveNewIE): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?Pf?[0-9]+)' _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=ref:%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + # TODO: FOD対応 def _real_extract(self, url): - # extract video id video_id = self._match_id(url) - - # download webpage webpage = self._download_webpage(url, video_id) - # extract video information - video_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'video information', flags=re.DOTALL).strip() - video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' - video_info = video_info_csv.split(',') + # extract tver information + tver_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'tver information', flags=re.DOTALL).strip() + tver_info_csv = tver_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' + tver_info = tver_info_csv.split(',') - # extract brightcove account id - brightcove_account_id = video_info[3] - - # extract brightcove video id - brightcove_video_id = video_info[4] - - # brightcove url + # extract brightcove information + brightcove_account_id = tver_info[3] + brightcove_video_id = 'ref:' + tver_info[4] brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) + brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') - # debug output - if self._downloader.params.get('verbose', False): - self.to_screen('Video Information: %s' % video_info) - self.to_screen('Brightcove Account ID: %s' % brightcove_account_id) - self.to_screen('Brightcove Video ID: %s' % brightcove_video_id) - self.to_screen('Brightcove URL: %s' % brightcove_url) - - # evacuate _VALID_URL - _VALID_URL = self._VALID_URL - - # temporarily replace _VALID_URL - # prevent _VALID_URL from being the URL of Tver when executing the parent class's _real_extract () method - self._VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*(?Pvideo|playlist)Id=(?P\d+|ref:[^&]+)' - - # get video information - info_dict = super(TVerIE, self)._real_extract(brightcove_url) - - # get video description + # extract tver description description = \ - self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) or \ + self._html_search_meta(['og:description', 'twitter:description'], webpage, 'description', default=None) or \ self._html_search_regex(r']+class="description"[^>]*>(?P.*?)', webpage, 'description', default=None, flags=re.DOTALL) - # undo _VALID_URL - self._VALID_URL = _VALID_URL - - # TVer ID - info_dict['id'] = video_id - # Brightcove ID - info_dict['display_id'] = brightcove_video_id - # select large thumbnail - info_dict['thumbnail'] = info_dict.get('thumbnail').replace('160x90', '1920x1080') - # desctiption - info_dict['description'] = description + # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, + # while also making TverIE's own acquired entities such as description available. + info_dict = { + '_type': 'url_transparent', + 'url': brightcove_url, + 'ie_key': BrightcoveNewIE.ie_key(), + 'id': video_id, # Tver ID + 'display_id': brightcove_video_id, # Brightcove ID + 'title': brightcove_info.get('name'), + 'description': description, + 'thumbnail': re.sub(r'/[0-9]+x[0-9]+/', r'/1920x1080/', brightcove_info.get('poster')), # select large thumbnail + 'creator': tver_info[7], # Broadcaster name e.g. 'tbs', 'ntv' + 'uploader': tver_info[8], # Delivery platform name e.g. 'TBS FREE', '日テレ無料' + } return info_dict + + def _extract_brightcove_info(self, url, referrer): + + valid_url = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*(?Pvideo|playlist)Id=(?P\d+|ref:[^&]+)' + + account_id, player_id, embed, content_type, video_id = re.match(valid_url, url).groups() + + def extract_policy_key(): + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + + return policy_key + + # brightcove api url + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) + + # set header + headers = { + 'Accept': 'application/json;pk=%s' % extract_policy_key(), + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + 'Referer': referrer, + } + + # return brightcove api info + return self._download_json(api_url, video_id, headers=headers)