From 08d06c8f07b6531b3e41ffb51b8731873aac2ee6 Mon Sep 17 00:00:00 2001 From: swedebugia Date: Sun, 19 Apr 2020 23:49:40 +0200 Subject: [PATCH 1/5] Add hardcoded Swedish subs to tv4 extractor If anyone knows how to extract it dynamically then feel free to give me a tip or improve it before merging it in. # The subtitles are defined in the manifest_url like this: # # SUBTITLES groups # #EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="textstream",NAME="Swedish",LANGUAGE="sv",AUTOSELECT=YES,DEFAULT=YES,URI="bmetgl4z0mr(12579349_ISMUSP)-textstream_swe=3000.m3u8" # but I don't know yet how to extract it dynamically from there so they are hardcoded as a start. --- youtube_dl/extractor/tv4.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b0191..4fc6572c2 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -107,11 +107,21 @@ class TV4IE(InfoExtractor): self._sort_formats(formats) + # The subtitles are defined in the manifest_url like this: + # # SUBTITLES groups + # #EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="textstream",NAME="Swedish",LANGUAGE="sv",AUTOSELECT=YES,DEFAULT=YES,URI="bmetgl4z0mr(12579349_ISMUSP)-textstream_swe=3000.m3u8" + # but I don't know yet how to extract it dynamically from there so they are hardcoded as a start. 
+ hardcoded_swedish_subs_url = manifest_url[:-5] + "-textstream_swe=3000.webvtt" + subtitles = {} + subtitles.setdefault('sv', []).append({ + 'url': hardcoded_swedish_subs_url, + 'ext': 'vtt'}) + return { 'id': video_id, 'title': title, 'formats': formats, - # 'subtitles': subtitles, + 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), From 3acf1dbfd53e180a16806c0855299a1e9ba16c12 Mon Sep 17 00:00:00 2001 From: swedebugia Date: Wed, 22 Apr 2020 14:31:44 +0200 Subject: [PATCH 2/5] remove whitespace --- youtube_dl/extractor/tv4.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 4fc6572c2..f1974fc9e 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -115,8 +115,9 @@ class TV4IE(InfoExtractor): subtitles = {} subtitles.setdefault('sv', []).append({ 'url': hardcoded_swedish_subs_url, - 'ext': 'vtt'}) - + 'ext': 'vtt' + }) + return { 'id': video_id, 'title': title, From eedd717032a7beaeffdf56ec20e15a4376d1f3d9 Mon Sep 17 00:00:00 2001 From: swedebugia Date: Wed, 22 Apr 2020 18:16:36 +0200 Subject: [PATCH 3/5] Add method for parsing of subtitles from m3u8 This is needed for TV4 subtitles --- youtube_dl/extractor/common.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c51a3a07d..bd7c87715 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1782,6 +1782,38 @@ class InfoExtractor(object): last_stream_inf = {} return formats + def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url): + """ + Parse subtitles from m3u8 file. + Please avoid downloading the m3u8 twice. 
+ """ + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + subtitles = {} + + def extract_media(x_media_line): + media = parse_m3u8_attributes(x_media_line) + # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED + media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') + if not (media_type and group_id and name): + return + # Check for subtitles + if media_type not in ('SUBTITLES'): + return + subtitle_url = media.get('URI') + if subtitle_url: + subtitles.setdefault(media.get('LANGUAGE'), []).append({ + 'url': format_url(subtitle_url), + }) + + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + + return subtitles + @staticmethod def _xpath_ns(path, namespace=None): if not namespace: From 46357967787a47cc349994208819a37b75aa651e Mon Sep 17 00:00:00 2001 From: swedebugia Date: Wed, 22 Apr 2020 18:22:45 +0200 Subject: [PATCH 4/5] Download and parse subtitle information from m3u8 The format is hardcoded to vtt for now and works with mpv out of the box. Test with e.g. youtube-dl https://www.tv4play.se/program/scandinavian-star/12515629 --all-subs (and open with mpv with: "mpv --audio-file-auto=fuzzy") --- youtube_dl/extractor/tv4.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index f1974fc9e..e316cdc43 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -107,16 +107,29 @@ class TV4IE(InfoExtractor): self._sort_formats(formats) - # The subtitles are defined in the manifest_url like this: - # # SUBTITLES groups - # #EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="textstream",NAME="Swedish",LANGUAGE="sv",AUTOSELECT=YES,DEFAULT=YES,URI="bmetgl4z0mr(12579349_ISMUSP)-textstream_swe=3000.m3u8" - # but I don't know yet how to extract it dynamically from there so they are hardcoded as a start. 
- hardcoded_swedish_subs_url = manifest_url[:-5] + "-textstream_swe=3000.webvtt" - subtitles = {} - subtitles.setdefault('sv', []).append({ - 'url': hardcoded_swedish_subs_url, - 'ext': 'vtt' - }) + # Download manifest and extract subtitles. Extracting formats + # using this result resulted in an error. This means the + # manifest is currently being downloaded twice, which is not + # great. + res = self._download_webpage_handle( + manifest_url, video_id, + note='Downloading subtitle information', + errnote='Failed to download subtitle information', + fatal=True, data=None, headers={}, query={}) + + if res: + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() + subtitles = self._parse_m3u8_subtitles(m3u8_doc, manifest_url) + # Hardcode webvtt for now + for item in subtitles: + # List inside dictionary + # Modify extension + url = subtitles[item][0]['url'].replace('m3u8', 'webvtt') + subtitles[item][0]['url'] = url + subtitles[item][0]['ext'] = 'vtt' + else: + subtitles = {} return { 'id': video_id, From 05743aa6249b086d79311a88e6cfce1d8622aba7 Mon Sep 17 00:00:00 2001 From: swedebugia Date: Thu, 23 Apr 2020 14:12:54 +0200 Subject: [PATCH 5/5] Fix 2 flake8-warnings --- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/tv4.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bd7c87715..ad6a93e02 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1784,7 +1784,7 @@ class InfoExtractor(object): def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url): """ - Parse subtitles from m3u8 file. + Parse subtitles from m3u8 file. Please avoid downloading the m3u8 twice. 
""" format_url = lambda u: ( diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index e316cdc43..a727eb697 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -119,7 +119,6 @@ class TV4IE(InfoExtractor): if res: m3u8_doc, urlh = res - m3u8_url = urlh.geturl() subtitles = self._parse_m3u8_subtitles(m3u8_doc, manifest_url) # Hardcode webvtt for now for item in subtitles: