From fbfd54d0b8cea0cfdda4da5d0aa31f68eb8b2b91 Mon Sep 17 00:00:00 2001 From: Lukas Anzinger Date: Wed, 1 Apr 2020 22:47:46 +0200 Subject: [PATCH] Add support for extracting subtitles from MPD manifests --- test/test_InfoExtractor.py | 112 ++++++++++++++++++++++++++++++-- test/testdata/mpd/subtitles.mpd | 31 +++++++++ youtube_dl/extractor/common.py | 48 ++++++++------ youtube_dl/extractor/generic.py | 7 +- 4 files changed, 170 insertions(+), 28 deletions(-) create mode 100644 test/testdata/mpd/subtitles.mpd diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608fe..3827e275e 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -745,7 +745,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) - def test_parse_mpd_formats(self): + def test_parse_mpd_formats_subtitles(self): _TEST_CASES = [ ( # https://github.com/ytdl-org/youtube-dl/issues/13919 @@ -829,7 +829,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 5997.485, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/pull/14844 'urls_only', @@ -912,7 +913,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 4400, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/20346 # Media considered unfragmented even though it contains @@ -958,18 +960,116 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 360, 'height': 360, 'fps': 30, - }] + }], + {}, + ), ( + 'subtitles', + 'https://example.com/streams/1/playlist/playlist.mpd', # mpd_url + 'https://example.com/streams/1/playlist', # mpd_base_url + [{'acodec': 'mp4a.40.2', + 'asr': 48000, + 'container': 'm4a_dash', + 'ext': 'm4a', + 'filesize': None, + 'format_id': '131kbps', + 'format_note': 'DASH audio', + 'fps': None, + 'fragment_base_url': 'https://example.com/streams/1/playlist/', + 'fragments': [{'path': '../audio/1_stereo_131072/dash/init.mp4'}, + {'duration': 3989.0, + 'path': '../audio/1_stereo_131072/dash/segment_0.m4s'}, + {'duration': 3989.0, + 'path': '../audio/1_stereo_131072/dash/segment_1.m4s'}], + 'height': None, + 'language': 'de', + 'manifest_url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'protocol': 'http_dash_segments', + 'tbr': 131.072, + 'url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'vcodec': 'none', + 'width': None}, + {'acodec': 'mp4a.40.2', + 'asr': 48000, + 'container': 'm4a_dash', + 'ext': 'm4a', + 'filesize': None, + 'format_id': '196kbps', + 'format_note': 'DASH audio', + 'fps': None, + 'fragment_base_url': 'https://example.com/streams/1/playlist/', + 'fragments': [{'path': '../audio/1_stereo_196608/dash/init.mp4'}, + {'duration': 3989.0, + 'path': '../audio/1_stereo_196608/dash/segment_0.m4s'}, + {'duration': 3989.0, + 'path': '../audio/1_stereo_196608/dash/segment_1.m4s'}], + 'height': None, + 'language': 'de', + 'manifest_url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'protocol': 'http_dash_segments', + 'tbr': 196.608, + 'url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'vcodec': 'none', + 'width': None}, + {'acodec': 'none', + 'asr': None, + 'container': 'mp4_dash', + 'ext': 'mp4', + 'filesize': None, + 'format_id': '720p 1712kbps', + 'format_note': 'DASH video', + 'fps': 25, + 'fragment_base_url': 'https://example.com/streams/1/playlist/', + 'fragments': [{'path': '../video/720_1712128/dash/init.mp4'}, + {'duration': 4000.0, + 'path': '../video/720_1712128/dash/segment_0.m4s'}, + {'duration': 4000.0, + 'path': '../video/720_1712128/dash/segment_1.m4s'}], + 'height': 720, + 'language': None, + 'manifest_url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'protocol': 'http_dash_segments', + 'tbr': 1712.128, + 'url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'vcodec': 'avc1.42c00d', + 'width': 1280}, + {'acodec': 'none', + 'asr': None, + 'container': 'mp4_dash', + 'ext': 'mp4', + 'filesize': None, + 'format_id': '1080p 4669kbps', + 'format_note': 'DASH video', + 'fps': 25, + 'fragment_base_url': 'https://example.com/streams/1/playlist/', + 'fragments': [{'path': '../video/1080_4669440/dash/init.mp4'}, + {'duration': 4000.0, + 'path': '../video/1080_4669440/dash/segment_0.m4s'}, + {'duration': 4000.0, + 'path': '../video/1080_4669440/dash/segment_1.m4s'}], + 'height': 1080, + 'language': None, + 'manifest_url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'protocol': 'http_dash_segments', + 'tbr': 4669.44, + 'url': 'https://example.com/streams/1/playlist/playlist.mpd', + 'vcodec': 'avc1.42c00d', + 'width': 1920}], + {'en': [{'ext': 'vtt', + 'url': 'https://example.com/streams/1/subtitles/sub_en.vtt'}], + 'fr': [{'ext': 'vtt', + 'url': 'https://example.com/streams/1/subtitles/sub_fr.vtt'}]}, ) ] - for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: + for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES: with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_mpd_formats( + formats, subtitles = self.ie._parse_mpd_formats_subtitles( compat_etree_fromstring(f.read().encode('utf-8')), mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) def test_parse_f4m_formats(self): _TEST_CASES = [ diff --git a/test/testdata/mpd/subtitles.mpd b/test/testdata/mpd/subtitles.mpd new file mode 100644 index 000000000..e7d973499 --- /dev/null +++ b/test/testdata/mpd/subtitles.mpd @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + ../subtitles/sub_fr.vtt + + + + + ../subtitles/sub_en.vtt + + + + \ No newline at end of file diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..3099302cc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2023,7 +2023,10 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): + def _extract_mpd_formats(self, *args, **kwargs): + return self._extract_mpd_formats_subtitles(*args, **kwargs)[0] + + def _extract_mpd_formats_subtitles(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', @@ -2036,13 +2039,16 @@ class InfoExtractor(object): return [] mpd_base_url = base_url(urlh.geturl()) - return self._parse_mpd_formats( + return self._parse_mpd_formats_subtitles( mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, formats_dict=formats_dict, mpd_url=mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): + def _parse_mpd_formats(self, *args, **kwargs): + return self._parse_mpd_formats_subtitles(*args, **kwargs)[0] + + def _parse_mpd_formats_subtitles(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): """ - Parse formats from MPD manifest. + Parse formats and subtitles from MPD manifest. References: 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip @@ -2119,6 +2125,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] + subtitles = {} for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2134,26 +2141,27 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) + + base_url = '' + for element in (representation, adaptation_set, period, mpd_doc): + base_url_e = element.find(_add_ns('BaseURL')) + if base_url_e is not None: + base_url = compat_urlparse.urljoin(base_url_e.text, base_url) + if re.match(r'^https?://', base_url): + break + if mpd_base_url and not re.match(r'^https?://', base_url): + if not mpd_base_url.endswith('/') and not base_url.startswith('/'): + mpd_base_url += '/' + base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + representation_id = representation_attrib.get('id') + lang = representation_attrib.get('lang') + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': - # TODO implement WebVTT downloading - pass + subtitles[lang] = [{'ext': mimetype2ext(mime_type), 'url': base_url}] elif content_type in ('video', 'audio'): - base_url = '' - for element in (representation, adaptation_set, period, mpd_doc): - base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: - base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): - break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): - mpd_base_url += '/' - base_url = mpd_base_url + base_url - representation_id = representation_attrib.get('id') - lang = representation_attrib.get('lang') url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) bandwidth = int_or_none(representation_attrib.get('bandwidth')) @@ -2329,7 +2337,7 @@ class InfoExtractor(object): formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats + return formats, subtitles def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a495ee15a..0fab2cad2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2390,11 +2390,14 @@ class GenericIE(InfoExtractor): xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( + formats, subtitles = self._parse_mpd_formats_subtitles( doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) - self._sort_formats(info_dict['formats']) + self._sort_formats(formats) + info_dict['formats'] = formats + if subtitles: + info_dict['subtitles'] = subtitles return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)