From 511ad55d78235cce7c0a9a0a2fe81c7658b4b767 Mon Sep 17 00:00:00 2001 From: thezero Date: Thu, 9 Apr 2020 01:09:00 +0200 Subject: [PATCH] [sproutvideo] improve HLS download, fix video detection --- youtube_dl/downloader/hls.py | 18 ++++++++++++----- youtube_dl/extractor/sproutvideo.py | 31 ++++++++++++++--------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 80d006fb6..56b173309 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -64,7 +64,7 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): - if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): + if info_dict.get('extra_param_to_segment_url') or info_dict.get('extra_param_to_key_url'): self.report_error('pycrypto not found. Please install it.') return False self.report_warning( @@ -115,13 +115,17 @@ class HlsFD(FragmentFD): extra_segment_query = None extra_key_query = None + extra_key_url = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + extra_param_to_key_url = info_dict.get('extra_param_to_key_url') if extra_param_to_segment_url: extra_segment_query = compat_urlparse.parse_qs(extra_param_to_segment_url) extra_key_query = compat_urlparse.parse_qs(extra_param_to_segment_url) - extra_param_to_key_url = info_dict.get('extra_param_to_key_url') if extra_param_to_key_url: - extra_key_query = compat_urlparse.parse_qs(extra_param_to_key_url) + if extra_param_to_key_url.startswith('http'): + extra_key_url = extra_param_to_key_url + else: + extra_key_query = compat_urlparse.parse_qs(extra_param_to_key_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -174,8 +178,10 @@ class HlsFD(FragmentFD): if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or 
self.ydl.urlopen( - self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - # We don't decrypt fragments during the test + self._prepare_url(info_dict, decrypt_info['URI'])).read() + # Since "self._TEST_FILE_SIZE" is set to 10241 bytes, only those will be downloaded for the first fragment + # In case a fragment is bigger than 10241 bytes, the fragment will be cropped so AES-CBC decryption will fail. + # For this reason we can't decrypt fragments during the tests. if not test: frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) @@ -196,6 +202,8 @@ class HlsFD(FragmentFD): man_url, decrypt_info['URI']) if extra_key_query: decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_key_query) + elif extra_key_url: + decrypt_info['URI'] = extra_key_url if decrypt_url != decrypt_info['URI']: decrypt_info['KEY'] = None elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): diff --git a/youtube_dl/extractor/sproutvideo.py b/youtube_dl/extractor/sproutvideo.py index 85f58d3ba..4e805ccde 100644 --- a/youtube_dl/extractor/sproutvideo.py +++ b/youtube_dl/extractor/sproutvideo.py @@ -12,7 +12,7 @@ from ..compat import ( class SproutVideoIE(InfoExtractor): - _NOSCHEMA_URL = r'//videos.sproutvideo.com/embed/(?P<id>[a-f0-9]+)/[a-f0-9]+' + _NOSCHEMA_URL = r'//videos\.sproutvideo\.com/embed/(?P<id>[a-f0-9]+)/[a-f0-9]+' _VALID_URL = r'https?:%s' % _NOSCHEMA_URL _TEST = { 'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3', @@ -28,14 +28,14 @@ class SproutVideoIE(InfoExtractor): def _extract_urls(webpage): # Fix the video URL if the iframe doesn't have a defined schema return [sprout.group('url') for sprout in re.finditer( - r'(?:(?:https?:|)%s[^\'\"]+)[\'\"]' % SproutVideoIE._NOSCHEMA_URL, + r'<iframe[^>]+src=[\'"](?P<url>(?:https?:|)%s[^\'"]+)[\'"]' % SproutVideoIE._NOSCHEMA_URL, webpage)] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - 
data = self._search_regex(r'<script[^>]+>var dat = \'([^\']+)\';', webpage, 'data') + data = self._search_regex(r"var\s+dat\s+=\s+'([^']+)';", webpage, 'data') data_decoded = compat_b64decode(data).decode('utf-8') parsed_data = self._parse_json(data_decoded, video_id) @@ -43,18 +43,15 @@ class SproutVideoIE(InfoExtractor): # signature->m for manifests # signature->k for keys # signature->t for segments - m_sign = self._policy_to_qs(parsed_data, 'm') - k_sign = self._policy_to_qs(parsed_data, 'k') - t_sign = self._policy_to_qs(parsed_data, 't') + m_sign = SproutVideoIE._policy_to_qs(parsed_data, 'm') + k_sign = SproutVideoIE._policy_to_qs(parsed_data, 'k') + t_sign = SproutVideoIE._policy_to_qs(parsed_data, 't') - resource_url = 'https://{0}.videos.sproutvideo.com/{1}/{2}/video/index.m3u8?{3}' - resource_url = resource_url.format(parsed_data['base'], - parsed_data['s3_user_hash'], - parsed_data['s3_video_hash'], - m_sign) + resource_url = 'https://{0}.videos.sproutvideo.com/{1}/{2}/video/index.m3u8?{3}'.format( + parsed_data['base'], parsed_data['s3_user_hash'], parsed_data['s3_video_hash'], m_sign) - formats = self._extract_m3u8_formats(resource_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) + formats = self._extract_m3u8_formats( + resource_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) self._sort_formats(formats) for entry in formats: @@ -70,13 +67,15 @@ class SproutVideoIE(InfoExtractor): 'formats': formats, } - def _format_qsdata(self, qs_data): + @staticmethod + def _format_qsdata(qs_data): parsed_dict = dict() for key in qs_data: parsed_dict[key.replace('CloudFront-', '')] = qs_data[key] return parsed_dict - def _policy_to_qs(self, policy, key): - sign = self._format_qsdata(policy['signatures'][key]) + @staticmethod + def _policy_to_qs(policy, key): + sign = SproutVideoIE._format_qsdata(policy['signatures'][key]) sign['sessionID'] = policy['sessionID'] return compat_urllib_parse_urlencode(sign, 
doseq=True)