This commit is contained in:
swedebugia 2020-10-03 08:36:10 +08:00 committed by GitHub
commit 623d90dc22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 1 deletion

View File

@ -1814,6 +1814,38 @@ class InfoExtractor(object):
last_stream_inf = {}
return formats
def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
    """
    Extract subtitle tracks from the text of an m3u8 playlist.

    Scans m3u8_doc line by line for #EXT-X-MEDIA tags whose TYPE is
    SUBTITLES and which carry a URI attribute. Relative URIs are
    resolved against m3u8_url.

    Returns a dict mapping the tag's LANGUAGE attribute ('und' when the
    tag has none) to a list of {'url': ...} dicts in document order.
    The dict is empty when no subtitle track is declared.

    This method downloads nothing itself: pass in a manifest that has
    already been fetched to avoid downloading the m3u8 twice.
    """
    def format_url(u):
        # Absolute http(s) URLs are used verbatim; anything else is
        # treated as relative to the playlist URL.
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    subtitles = {}

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per the HLS spec (RFC 8216, 4.3.4.1) TYPE, GROUP-ID and
        # NAME are REQUIRED, so skip tags missing any of them.
        media_type = media.get('TYPE')
        group_id = media.get('GROUP-ID')
        name = media.get('NAME')
        if not (media_type and group_id and name):
            return
        # Only subtitle renditions are of interest here. Compare for
        # equality: `in ('SUBTITLES')` would be a substring test against
        # a plain string and would also accept e.g. 'SUB' or 'TITLES'.
        if media_type != 'SUBTITLES':
            return
        subtitle_url = media.get('URI')
        if not subtitle_url:
            return
        # LANGUAGE is optional; fall back to 'und' (ISO 639-2
        # "undetermined") instead of using None as a dictionary key.
        lang = media.get('LANGUAGE') or 'und'
        subtitles.setdefault(lang, []).append({
            'url': format_url(subtitle_url),
        })

    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)
    return subtitles
@staticmethod
def _xpath_ns(path, namespace=None):
if not namespace:

View File

@ -107,11 +107,34 @@ class TV4IE(InfoExtractor):
self._sort_formats(formats)
# Download the manifest and extract subtitles from it. Reusing this
# response for format extraction caused an error, so the manifest is
# currently downloaded twice, which is wasteful.
res = self._download_webpage_handle(
manifest_url, video_id,
note='Downloading subtitle information',
errnote='Failed to download subtitle information',
fatal=True, data=None, headers={}, query={})
if res:
m3u8_doc, urlh = res
subtitles = self._parse_m3u8_subtitles(m3u8_doc, manifest_url)
# Hardcode webvtt for now
for item in subtitles:
# List inside dictionary
# Modify extension
url = subtitles[item][0]['url'].replace('m3u8', 'webvtt')
subtitles[item][0]['url'] = url
subtitles[item][0]['ext'] = 'vtt'
else:
subtitles = {}
return {
'id': video_id,
'title': title,
'formats': formats,
# 'subtitles': subtitles,
'subtitles': subtitles,
'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')),