From dec0b636b93e61a1f5d9b9bbc83f8885c558bc74 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Mon, 13 Apr 2020 11:33:03 +0200 Subject: [PATCH] [nexx] Download using correct protection tokens (fixes #24309) --- youtube_dl/extractor/funk.py | 18 +++++++++++ youtube_dl/extractor/nexx.py | 59 +++++++++++++++++++----------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index 81d1949fd..22c581d55 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -25,6 +25,24 @@ class FunkIE(InfoExtractor): 'upload_date': '20171229', }, + }, { + # protection tokens (available until 2020-07-29) + 'url': 'https://www.funk.net/channel/the-job-lot-1199/unter-druck-185103/the-job-lot-staffel-1-1444', + 'md5': '91330fc2708e3ef3fc7e259a341bf4d5', + 'info_dict': { + 'id': '185103', + 'ext': 'mp4', + 'title': 'Unter Druck', + 'duration': 1459, + 'upload_date': '20170419', + 'timestamp': 1492618074, + 'description': 'md5:0e752a4151c9a340db26f1455b735fa4', + }, + 'params': { + 'format': 'azure-dash-1_V_video_7', + }, + # really just want to test that it does not return a 400 Bad Request + 'file_minsize': None, }, { 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', 'only_matching': True, diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 586c1b7eb..61d152254 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -37,7 +37,7 @@ class NexxIE(InfoExtractor): 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)$', 'duration': 2509, 'timestamp': 1384264416, 'upload_date': '20131112', @@ -51,7 +51,7 @@ class NexxIE(InfoExtractor): 'title': 'Return of the Golden Child (OV)', 'description': 'md5:5d969537509a92b733de21bae249dc63', 'release_year': 2017, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)$', 'duration': 1397, 'timestamp': 1495033267, 'upload_date': '20170517', @@ -69,28 +69,14 @@ class NexxIE(InfoExtractor): 'info_dict': { 'id': '1269984', 'ext': 'mp4', - 'title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': '1 TAG ohne KLO... wortwörtlich! ?', + 'alt_title': '1 TAG ohne KLO... wortwörtlich! ?', + 'description': 'md5:1cf562de6653f76fbc589ecbdf985727', + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)$', 'duration': 607, 'timestamp': 1518614955, 'upload_date': '20180214', }, - }, { - # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html - 'url': 'nexx:747:1533779', - 'md5': '6bf6883912b82b7069fb86c2297e9893', - 'info_dict': { - 'id': '1533779', - 'ext': 'mp4', - 'title': 'Aufregung um ausgebrochene Raubtiere', - 'alt_title': 'Eifel-Zoo', - 'description': 'md5:f21375c91c74ad741dcb164c427999d2', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 111, - 'timestamp': 1527874460, - 'upload_date': '20180601', - }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -220,6 +206,9 @@ class NexxIE(InfoExtractor): return formats + def _protection_tokens(self, video): + return try_get(video, lambda x: x['protectiondata'], dict) + def _extract_azure_formats(self, video, video_id): stream_data = video['streamdata'] cdn = stream_data['cdnType'] @@ -247,24 +236,39 @@ class NexxIE(InfoExtractor): azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + def add_token(url, token, key=None): + return url + '?%s=%s' % (key or 'hdnts', token) + azure_manifest_urls = {} + + protection_data = self._protection_tokens(video) + # TODO: is this kind of token still in use? (perhaps add a test) protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) + protection_data, lambda x: x['token'], compat_str) if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token + azure_manifest_url = add_token(azure_manifest_url, protection_token) + elif protection_data: + token_key = try_get(protection_data, lambda x: x['tokenReference'], compat_str) + for key in protection_data: + value = protection_data[key] + if isinstance(value, compat_str) and key.startswith('token') and len(key) > 5: + azure_manifest_urls[key[5:]] = add_token(azure_manifest_url, value, key=token_key) formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', + (try_get(azure_manifest_urls, lambda x: x['HLS'], compat_str) or azure_manifest_url) % '(format=m3u8-aapl)', video_id, 'mp4', 'm3u8_native', m3u8_id='%s-hls' % cdn, fatal=False) formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', + (try_get(azure_manifest_urls, lambda x: x['DASH'], compat_str) or azure_manifest_url) % '(format=mpd-time-csf)', video_id, mpd_id='%s-dash' % cdn, fatal=False)) formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + (try_get(azure_manifest_urls, lambda x: x['DASH'], compat_str) or azure_manifest_url) % '', + video_id, ism_id='%s-mss' % cdn, fatal=False)) azure_progressive_base = get_cdn_shield_base('Prog', True) azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: + # TODO: this download method does not work when tokenHLS/tokenDASH is present, + # (simply adding one of the tokens to the url does not fix the 400 Bad Request) + if azure_file_distribution and len(azure_manifest_urls) == 0: fds = azure_file_distribution.split(',') if fds: for fd in fds: @@ -314,7 +318,8 @@ class NexxIE(InfoExtractor): video = find_video(result) # not all videos work via arc, e.g. nexx:741:1269984 - if not video: + # some videos require extra protection tokens + if not video or self._protection_tokens(video): # Reverse engineered from JS code (see getDeviceID function) device_id = '%d:%d:%d%d' % ( random.randint(1, 4), int(time.time()),