Merge 05743aa624 into d65d89183f

2020-10-03 08:36:10 +08:00 · 2020-10-03 08:36:10 +08:00 · 623d90dc22
parent d65d89183f 05743aa624
commit 623d90dc22
2 changed files with 56 additions and 1 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1814,6 +1814,38 @@ class InfoExtractor(object):
                last_stream_inf = {}
        return formats

+    def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
+        """
+        Collect subtitle tracks from the #EXT-X-MEDIA tags of an m3u8
+        playlist; pass an already downloaded m3u8_doc to avoid fetching it
+        twice. Relative URIs are resolved against m3u8_url. Returns a dict
+        mapping LANGUAGE ('und' when absent) to lists of {'url': ...}.
+        """
+        def format_url(u):
+            if re.match(r'^https?://', u):
+                return u
+            return compat_urlparse.urljoin(m3u8_url, u)
+
+        subtitles = {}
+        for line in m3u8_doc.splitlines():
+            if not line.startswith('#EXT-X-MEDIA:'):
+                continue
+            media = parse_m3u8_attributes(line)
+            # TYPE, GROUP-ID and NAME are REQUIRED (RFC 8216, 4.3.4.1)
+            if not (media.get('GROUP-ID') and media.get('NAME')):
+                continue
+            # Compare with != : `not in ('SUBTITLES')` was a substring test
+            if media.get('TYPE') != 'SUBTITLES':
+                continue
+            subtitle_url = media.get('URI')
+            if not subtitle_url:
+                continue
+            # LANGUAGE is optional; avoid a None key in the result
+            subtitles.setdefault(media.get('LANGUAGE') or 'und', []).append({
+                'url': format_url(subtitle_url),
+            })
+        return subtitles
+
    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@ -107,11 +107,34 @@ class TV4IE(InfoExtractor):

        self._sort_formats(formats)

+        # Download manifest and extract subtitles. Extracting formats
+        # using this result resulted in an error. This means the
+        # manifest is currently being downloaded twice, which is not
+        # great.
+        res = self._download_webpage_handle(
+            manifest_url, video_id,
+            note='Downloading subtitle information',
+            errnote='Failed to download subtitle information',
+            fatal=True, data=None, headers={}, query={})
+
+        if res:
+            m3u8_doc, urlh = res
+            subtitles = self._parse_m3u8_subtitles(m3u8_doc, manifest_url)
+            # Hardcode webvtt for now
+            for item in subtitles:
+                # List inside dictionary
+                # Modify extension
+                url = subtitles[item][0]['url'].replace('m3u8', 'webvtt')
+                subtitles[item][0]['url'] = url
+                subtitles[item][0]['ext'] = 'vtt'
+        else:
+            subtitles = {}
+
        return {
            'id': video_id,
            'title': title,
            'formats': formats,
-            # 'subtitles': subtitles,
+            'subtitles': subtitles,
            'description': info.get('description'),
            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
            'duration': int_or_none(info.get('duration')),