From 99fd3bf6ad6306d67c146ec51a551abaf1995564 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Tue, 7 Apr 2020 22:05:09 +0200 Subject: [PATCH 01/10] [nebula] Add basic support for Nebula (refs #21258) --- AUTHORS | 1 + docs/supportedsites.md | 1 + youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/nebula.py | 132 +++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/nebula.py diff --git a/AUTHORS b/AUTHORS index b507cb8df..64ac71249 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Henrik Heimbuerger diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 174b83bf3..164b1e47e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -542,6 +542,7 @@ - **ndr:embed** - **ndr:embed:base** - **NDTV** + - **Nebula** - **NerdCubedFeed** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e407ab3d9..0e8087d7c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -692,8 +692,9 @@ from .ndr import ( NJoyEmbedIE, ) from .ndtv import NDTVIE -from .netzkino import NetzkinoIE +from .nebula import NebulaIE from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE from .neteasemusic import ( NetEaseMusicIE, NetEaseMusicAlbumIE, diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py new file mode 100644 index 000000000..e22a4b088 --- /dev/null +++ b/youtube_dl/extractor/nebula.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..utils import parse_iso8601 + +COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests + + +class 
NebulaIE(InfoExtractor): + """ + Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos + off-YouTube from a small hand-picked group of creators. + + All videos require a subscription to watch. There are no known freely available videos. So the test case is + disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription). + + Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off + video extraction to the Zype extractor. + + This description has been last updated on 2020-04-07. + """ + + _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id() + _TEST = { + 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + #'uploader': 'Lindsay Ellis', # TODO: removed because unreliable/sometimes incorrect + } + } + _WORKING = False # this is set to False because the test won't pass without an auth cookie for a (paid) subscription + + def _extract_state_object(self, webpage, display_id): + """ + As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script + tag. This function is extracting this script tag, parsing it as JSON. 
+ """ + initial_state_object = self._search_regex(r'', webpage, 'initial_state') + metadata = self._parse_json(initial_state_object, video_id=display_id) # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? + + return metadata + + def _extract_video_metadata(self, state_object, display_id): + """ + The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the + video ID, we can then extract a dictionary with various meta data about the video itself. + """ + video_id = state_object['videos']['byURL'][display_id] + video_meta = state_object['videos']['byID'][video_id] + + return video_id, video_meta + + def _extract_video_url(self, webpage, state_object, video_id): + """ + To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a + bit more stable to extract the iframe source that links to the video. + """ + iframe = self._search_regex(r'', webpage, 'iframe', fatal=False) + video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None + + # fallback: reconstruct using video ID and access token from state object + if not video_url: + access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken'] + video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token) + + return video_url + + def _extract_uploader(self, video_meta): + """ + Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized + more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so + I'll go with this for now. 
+ """ + return video_meta['categories'][0]['value'][0] + + def _real_extract(self, url): + # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests + if COOKIE_NEBULA_AUTH: + self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH) + + # extract the video's display ID from the URL (we'll retrieve the video ID later) + display_id = self._match_id(url) + + # download the page + webpage = self._download_webpage(url, video_id=display_id) # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead... + + # extract the state object from the webpage, and then retrieve video meta data from it + state_object = self._extract_state_object(webpage, display_id) + video_id, video_meta = self._extract_video_metadata(state_object, display_id) + + # extract the video URL from the webpage + video_url = self._extract_video_url(webpage, state_object, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + + # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is + # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than + # whatever the Zype extractor is able to identify + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': video_url, + + # the meta data we were able to extract from Nebula + 'title': video_meta['title'], + 'description': video_meta['description'], + 'timestamp': parse_iso8601(video_meta['published_at']), + #'uploader': self._extract_uploader(video_meta), # TODO: removed because unreliable/sometimes incorrect + 'thumbnails': [ + { + 'id': tn['name'], # this appears to be null in all cases I've seen + 'url': tn['url'], + 'width': tn['width'], + 'height': tn['height'], + } for tn in video_meta['thumbnails'] + ], + 'duration': video_meta['duration'], + # TODO: uploader_url: the video page clearly links to this (in the example 
case: /lindsayellis), but I cannot figure out where it gets it from! + # TODO: channel + # TODO: channel_id + # TODO: channel_url + } From 1cfcf0b79a03cdf5e7ca81829daebad1a4131e8d Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Thu, 16 Apr 2020 04:34:17 +0200 Subject: [PATCH 02/10] [nebula] Add additional test cases and improve cookie envvar handling --- youtube_dl/extractor/nebula.py | 61 +++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index e22a4b088..6aa4e1da4 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -23,21 +23,52 @@ class NebulaIE(InfoExtractor): This description has been last updated on 2020-04-07. """ - _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id() - _TEST = { - 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', - 'info_dict': { - 'id': '5c271b40b13fd613090034fd', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - #'uploader': 'Lindsay Ellis', # TODO: removed because unreliable/sometimes incorrect - } - } - _WORKING = False # this is set to False because the test won't pass without an auth cookie for a (paid) subscription + _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() + _TESTS = [ + { + 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'uploader': 'Lindsay Ellis', + } + }, + { + 'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': 'b0b171504d67e2822179149ccd6787db', + 'info_dict': { + 'id': '5e7e78171aaf320001fbd6be', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'The Logistics of D-Day', + 'uploader': 'The Logistics of D-Day', + } + }, + { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'md5': '98e96346caa3b303fec4493c5d49dcb5', + 'info_dict': { + 'id': '5e779ebdd157bc0001d1c75a', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r're:^There’s free money on offer… if the players can all work 
together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'uploader': 'Tom Scott Presents: Money', + } + }, + ] + _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? def _extract_state_object(self, webpage, display_id): """ From 6daa352a7dea08fecfb227965ee304799f9da0c7 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Thu, 16 Apr 2020 04:35:05 +0200 Subject: [PATCH 03/10] [nebula] Add better channel title extraction (refs #21258) --- youtube_dl/extractor/nebula.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 6aa4e1da4..828ea1c6f 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -105,13 +105,32 @@ class NebulaIE(InfoExtractor): return video_url - def _extract_uploader(self, video_meta): + def _extract_channel(self, video_meta): """ - Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized - more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so - I'll go with this for now. + Extract the channel title, by going through the list of categories and finding the first value of the + first category that has a value. + + I know this look like a terrible approach. But actually, it's just reproducing the behavior of the + React code the Nebula frontend uses (as of 2020-04-07): + + let channel; + if (video && video.categories && video.categories.length) { + const channelTitle = video.categories.map((category) => (category.value[0])) + .filter((title) => (!!title))[0]; + channel = getChannelByTitle(state, { title: channelTitle }); + } + + Basically, it finds the first (truthy) value in the category list and that's assumed to be the + channel title. 
And then the channel details (e.g. the URL) are looked up by title (!) (not by any + kind of ID) via an additional API call. + + TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL """ - return video_meta['categories'][0]['value'][0] + categories = video_meta['categories'] + for category in categories: + if category['value']: + return category['value'][0] + return None def _real_extract(self, url): # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -127,6 +146,7 @@ class NebulaIE(InfoExtractor): # extract the state object from the webpage, and then retrieve video meta data from it state_object = self._extract_state_object(webpage, display_id) video_id, video_meta = self._extract_video_metadata(state_object, display_id) + channel_title = self._extract_channel(video_meta) # extract the video URL from the webpage video_url = self._extract_video_url(webpage, state_object, video_id) @@ -146,7 +166,6 @@ class NebulaIE(InfoExtractor): 'title': video_meta['title'], 'description': video_meta['description'], 'timestamp': parse_iso8601(video_meta['published_at']), - #'uploader': self._extract_uploader(video_meta), # TODO: removed because unreliable/sometimes incorrect 'thumbnails': [ { 'id': tn['name'], # this appears to be null in all cases I've seen @@ -156,8 +175,9 @@ class NebulaIE(InfoExtractor): } for tn in video_meta['thumbnails'] ], 'duration': video_meta['duration'], + 'channel': channel_title, + 'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! 
- # TODO: channel # TODO: channel_id # TODO: channel_url } From 6c35cac4325d0325d1cb47609b20b97570d331ba Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 18 Apr 2020 06:15:03 +0200 Subject: [PATCH 04/10] [nebula] Relax meta data lookups --- youtube_dl/extractor/nebula.py | 48 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 828ea1c6f..038863348 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -4,7 +4,8 @@ from __future__ import unicode_literals import os from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..compat import compat_str +from ..utils import parse_iso8601, try_get COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -74,9 +75,13 @@ class NebulaIE(InfoExtractor): """ As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script tag. This function is extracting this script tag, parsing it as JSON. + + May return None if no state object could be found or it didn't contain valid JSON. """ - initial_state_object = self._search_regex(r'', webpage, 'initial_state') - metadata = self._parse_json(initial_state_object, video_id=display_id) # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? + initial_state_object = self._search_regex( + r']*id="initial-app-state"[^>]*>(.+?)', webpage, + 'initial_state', fatal=False, default=None) + metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? 
return metadata @@ -84,9 +89,12 @@ class NebulaIE(InfoExtractor): """ The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the video ID, we can then extract a dictionary with various meta data about the video itself. + + May return (None, {}) if no state object was given or it didn't contain the expected lookup table or + meta data. """ - video_id = state_object['videos']['byURL'][display_id] - video_meta = state_object['videos']['byID'][video_id] + video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str) + video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {} return video_id, video_meta @@ -100,8 +108,10 @@ class NebulaIE(InfoExtractor): # fallback: reconstruct using video ID and access token from state object if not video_url: - access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken'] - video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token) + access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'], + compat_str) + video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( + video_id=video_id, access_token=access_token) return video_url @@ -125,12 +135,13 @@ class NebulaIE(InfoExtractor): kind of ID) via an additional API call. TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL + + May return None of no category list could be found or no category had a label ('value'). 
""" - categories = video_meta['categories'] + categories = video_meta.get('categories', []) if video_meta else [] for category in categories: - if category['value']: + if category.get('value'): # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well! return category['value'][0] - return None def _real_extract(self, url): # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -163,18 +174,17 @@ class NebulaIE(InfoExtractor): 'url': video_url, # the meta data we were able to extract from Nebula - 'title': video_meta['title'], - 'description': video_meta['description'], - 'timestamp': parse_iso8601(video_meta['published_at']), + 'title': video_meta.get('title'), + 'description': video_meta.get('description'), + 'timestamp': parse_iso8601(video_meta.get('published_at')), 'thumbnails': [ { - 'id': tn['name'], # this appears to be null in all cases I've seen + 'id': tn.get('name'), # this appears to be null in all cases I've seen 'url': tn['url'], - 'width': tn['width'], - 'height': tn['height'], - } for tn in video_meta['thumbnails'] - ], - 'duration': video_meta['duration'], + 'width': tn.get('width'), + 'height': tn.get('height'), + } for tn in video_meta.get('thumbnails', [])], + 'duration': video_meta.get('duration'), 'channel': channel_title, 'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! 
From 606409a2043a88daa3d21c0d07ece29704cd5cd3 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Mon, 11 May 2020 05:58:14 +0200 Subject: [PATCH 05/10] [nebula] Rewrite extractor to new frontend (refs #21258) --- youtube_dl/extractor/nebula.py | 129 ++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 038863348..9a6ddf6f3 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -4,10 +4,8 @@ from __future__ import unicode_literals import os from .common import InfoExtractor -from ..compat import compat_str -from ..utils import parse_iso8601, try_get - -COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests +from ..compat import compat_urllib_parse_unquote, compat_str +from ..utils import parse_iso8601, ExtractorError, try_get class NebulaIE(InfoExtractor): @@ -15,13 +13,13 @@ class NebulaIE(InfoExtractor): Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos off-YouTube from a small hand-picked group of creators. - All videos require a subscription to watch. There are no known freely available videos. So the test case is - disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription). + All videos require a subscription to watch. There are no known freely available videos. An authentication token to + an account with a valid subscription can be specified in multiple ways. Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off video extraction to the Zype extractor. - This description has been last updated on 2020-04-07. + This description has been last updated on 2020-05-11. 
""" _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() @@ -61,7 +59,7 @@ class NebulaIE(InfoExtractor): 'id': '5e779ebdd157bc0001d1c75a', 'ext': 'mp4', 'title': 'Episode 1: The Draw', - 'description': r're:^There’s free money on offer… if the players can all work together.', + 'description': r'contains:There’s free money on offer… if the players can all work together.', 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', @@ -71,49 +69,76 @@ class NebulaIE(InfoExtractor): ] _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? - def _extract_state_object(self, webpage, display_id): + def _retrieve_nebula_auth(self, video_id): """ - As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script - tag. This function is extracting this script tag, parsing it as JSON. + Attempt to find a Nebula API token. Makes multiple attempts in the following order: + a) the --video-password command line argument + b) the --cookies supplied cookie jar + c) the NEBULA_TOKEN environment variable + If none of these are successful, an end user-intended error message is returned, listing some solutions. - May return None if no state object could be found or it didn't contain valid JSON. + # TODO: are these authentication methods, in this order, the best practice for youtube-dl? """ - initial_state_object = self._search_regex( - r']*id="initial-app-state"[^>]*>(.+?)', webpage, - 'initial_state', fatal=False, default=None) - metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? 
+ nebula_token = self._downloader.params.get('videopassword') + if not nebula_token: + # TODO: is there a helper to do all this cookie extraction? + nebula_cookies = self._get_cookies('https://watchnebula.com') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) + nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') + if not nebula_token and 'NEBULA_TOKEN' in os.environ: + nebula_token = os.environ.get('NEBULA_TOKEN') + if not nebula_token: + raise ExtractorError('Nebula requires an account with an active subscription. ' + 'You can supply a corresponding token by either ' + 'a) finding your nebula-auth cookie and then specifying it via --video-password, or ' + 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or ' + 'c) setting the environment variable NEBULA_TOKEN.') + return nebula_token - return metadata - - def _extract_video_metadata(self, state_object, display_id): + def _call_zype_api(self, path, params, video_id, api_key): """ - The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the - video ID, we can then extract a dictionary with various meta data about the video itself. - - May return (None, {}) if no state object was given or it didn't contain the expected lookup table or - meta data. + A helper for making calls to the Zype API. 
""" - video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str) - video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {} + query = {'api_key': api_key, 'per_page': 1} + query.update(params) + return self._download_json('https://api.zype.com' + path, video_id, query=query) - return video_id, video_meta - - def _extract_video_url(self, webpage, state_object, video_id): + def _fetch_zype_video_data(self, display_id, api_key): """ - To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a - bit more stable to extract the iframe source that links to the video. + Fetch video meta data from the Zype API. """ - iframe = self._search_regex(r'', webpage, 'iframe', fatal=False) - video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None + response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key) + if 'response' not in response or len(response['response']) != 1: + raise ExtractorError('Unable to find video on Zype API') + return response['response'][0] - # fallback: reconstruct using video ID and access token from state object - if not video_url: - access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'], - compat_str) - video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( - video_id=video_id, access_token=access_token) + def _call_nebula_api(self, path, video_id, access_token): + """ + A helper for making calls to the Nebula API. + """ + return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ + 'Authorization': 'Token {access_token}'.format(access_token=access_token) + }) - return video_url + def _fetch_zype_access_token(self, video_id, nebula_token): + """ + Requests a Zype access token from the Nebula API. 
+ """ + user_object = self._call_nebula_api('/auth/user', video_id, nebula_token) + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) + if not access_token: + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_url(self, video_id, zype_access_token): + """ + Construct a Zype video URL (as supported by the Zype extractor), given a Zype video ID and a Zype access token. + """ + return 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( + video_id=video_id, + access_token=zype_access_token) def _extract_channel(self, video_meta): """ @@ -144,23 +169,23 @@ class NebulaIE(InfoExtractor): return category['value'][0] def _real_extract(self, url): - # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests - if COOKIE_NEBULA_AUTH: - self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH) - # extract the video's display ID from the URL (we'll retrieve the video ID later) display_id = self._match_id(url) - # download the page - webpage = self._download_webpage(url, video_id=display_id) # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead... 
+ # retrieve Nebula authentication information + nebula_token = self._retrieve_nebula_auth(display_id) - # extract the state object from the webpage, and then retrieve video meta data from it - state_object = self._extract_state_object(webpage, display_id) - video_id, video_meta = self._extract_video_metadata(state_object, display_id) + # fetch video meta data from the Nebula API + api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF' # FIXME: extract from main chunk at runtime + video_meta = self._fetch_zype_video_data(display_id, api_key) + video_id = video_meta['_id'] + + # extract additional info channel_title = self._extract_channel(video_meta) - # extract the video URL from the webpage - video_url = self._extract_video_url(webpage, state_object, video_id) + # fetch the access token for Zype, then construct the video URL + zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token) + video_url = self._build_video_url(video_id, zype_access_token) return { 'id': video_id, @@ -179,7 +204,7 @@ class NebulaIE(InfoExtractor): 'timestamp': parse_iso8601(video_meta.get('published_at')), 'thumbnails': [ { - 'id': tn.get('name'), # this appears to be null in all cases I've seen + 'id': tn.get('name'), # this appears to be null in all cases I've encountered 'url': tn['url'], 'width': tn.get('width'), 'height': tn.get('height'), From b27ab412eed214368242082116f2477562d32759 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Fri, 15 May 2020 06:14:08 +0200 Subject: [PATCH 06/10] [nebula] Implement Zype API key retrieval from JS chunk --- youtube_dl/extractor/nebula.py | 63 +++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 9a6ddf6f3..0ce229ad5 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -5,7 +5,7 @@ import os from .common import InfoExtractor from ..compat import 
compat_urllib_parse_unquote, compat_str -from ..utils import parse_iso8601, ExtractorError, try_get +from ..utils import parse_iso8601, ExtractorError, try_get, urljoin class NebulaIE(InfoExtractor): @@ -97,36 +97,81 @@ class NebulaIE(InfoExtractor): 'c) setting the environment variable NEBULA_TOKEN.') return nebula_token - def _call_zype_api(self, path, params, video_id, api_key): + def _retrieve_zype_api_key(self, page_url, display_id): + """ + Retrieves the Zype API key required to make calls to the Zype API. + + Unfortunately, the Nebula frontend stores this as a JS object literal in one of its JS chunks, + looking somewhat like this (but minified): + + return { + NODE_ENV: "production", + REACT_APP_NAME: "Nebula", + REACT_APP_NEBULA_API: "https://api.watchnebula.com/api/v1/", + REACT_APP_ZYPE_API: "https://api.zype.com/", + REACT_APP_ZYPE_API_KEY: "", + REACT_APP_ZYPE_APP_KEY: "", + // ... + } + + So we have to find the reference to the chunk in the video page (as it is hashed and the hash will + change when they do a new release), then download the chunk and extract the API key from there, + hoping they won't rename the constant. + + Alternatively, it is currently hardcoded and shared among all users. We haven't seen it + change so far, so we could also just hardcode it in the extractor as a fallback. 
+ """ + # fetch the video page + webpage = self._download_webpage(page_url, video_id=display_id) + + # find the script tag with a file named 'main..chunk.js' in there + main_script_relpath = self._search_regex( + r']*src="(?P[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, + group='script_relpath', name='script relative path', fatal=True) + + # fetch the JS chunk + main_script_abspath = urljoin(page_url, main_script_relpath) + main_script = self._download_webpage(main_script_abspath, video_id=display_id, + note='Retrieving Zype API key') + + # find the API key named 'REACT_APP_ZYPE_API_KEY' in there + api_key = self._search_regex( + r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P[\w-]*)"', main_script, + group='api_key', name='API key', fatal=True) + + return api_key + + def _call_zype_api(self, path, params, video_id, api_key, note): """ A helper for making calls to the Zype API. """ query = {'api_key': api_key, 'per_page': 1} query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query) + return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) def _fetch_zype_video_data(self, display_id, api_key): """ Fetch video meta data from the Zype API. """ - response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key) + response = self._call_zype_api('/videos', {'friendly_title': display_id}, + display_id, api_key, note='Retrieving metadata from Zype') if 'response' not in response or len(response['response']) != 1: raise ExtractorError('Unable to find video on Zype API') return response['response'][0] - def _call_nebula_api(self, path, video_id, access_token): + def _call_nebula_api(self, path, video_id, access_token, note): """ A helper for making calls to the Nebula API. 
""" return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }) + }, note=note) def _fetch_zype_access_token(self, video_id, nebula_token): """ Requests a Zype access token from the Nebula API. """ - user_object = self._call_nebula_api('/auth/user', video_id, nebula_token) + user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token') access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) if not access_token: raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') @@ -176,7 +221,7 @@ class NebulaIE(InfoExtractor): nebula_token = self._retrieve_nebula_auth(display_id) # fetch video meta data from the Nebula API - api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF' # FIXME: extract from main chunk at runtime + api_key = self._retrieve_zype_api_key(url, display_id) video_meta = self._fetch_zype_video_data(display_id, api_key) video_id = video_meta['_id'] @@ -184,7 +229,7 @@ class NebulaIE(InfoExtractor): channel_title = self._extract_channel(video_meta) # fetch the access token for Zype, then construct the video URL - zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token) + zype_access_token = self._fetch_zype_access_token(display_id, nebula_token=nebula_token) video_url = self._build_video_url(video_id, zype_access_token) return { From 97fe7cd386e2dff88a927a7fd28f3bd144248b79 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Tue, 2 Jun 2020 04:57:37 +0200 Subject: [PATCH 07/10] [nebula] Improve performance by avoiding redirect --- youtube_dl/extractor/nebula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 0ce229ad5..9a2828e19 100644 --- a/youtube_dl/extractor/nebula.py +++ 
b/youtube_dl/extractor/nebula.py @@ -171,7 +171,7 @@ class NebulaIE(InfoExtractor): """ Requests a Zype access token from the Nebula API. """ - user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token') + user_object = self._call_nebula_api('/auth/user/', video_id, nebula_token, note='Retrieving Zype access token') access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) if not access_token: raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') From 4411fcec81352f1bfb5b74e014dde0eb8489e3d7 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 3 Oct 2020 05:41:27 +0200 Subject: [PATCH 08/10] [nebula] Update test video checksums --- youtube_dl/extractor/nebula.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 9a2828e19..4b8cca8fd 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -40,7 +40,7 @@ class NebulaIE(InfoExtractor): }, { 'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': 'b0b171504d67e2822179149ccd6787db', + 'md5': '6d4edd14ce65720fa63aba5c583fb328', 'info_dict': { 'id': '5e7e78171aaf320001fbd6be', 'ext': 'mp4', @@ -54,7 +54,7 @@ class NebulaIE(InfoExtractor): }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'md5': '98e96346caa3b303fec4493c5d49dcb5', + 'md5': '8c7d272910eea320f6f8e6d3084eecf5', 'info_dict': { 'id': '5e779ebdd157bc0001d1c75a', 'ext': 'mp4', From 8e3842e156ab562d7e73650bca718d91ee4b5046 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 3 Oct 2020 06:08:30 +0200 Subject: [PATCH 09/10] [nebula] Implement PoC of netrc authentication --- youtube_dl/extractor/nebula.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py 
b/youtube_dl/extractor/nebula.py index 4b8cca8fd..6566dc2d2 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import json import os from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_str -from ..utils import parse_iso8601, ExtractorError, try_get, urljoin +from ..utils import parse_iso8601, ExtractorError, try_get, urljoin, sanitized_Request class NebulaIE(InfoExtractor): @@ -68,6 +69,22 @@ class NebulaIE(InfoExtractor): }, ] _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? + _NETRC_MACHINE = 'watchnebula' + + def _perform_login(self, username, password, video_id): + """ + Perform login to Nebula. + + Takes a username (email address) and password. Returns a Nebula token. + """ + data = json.dumps({'email': username, 'password': password}).encode('utf8') + request = sanitized_Request(method='POST', + url='https://api.watchnebula.com/api/v1/auth/login/', + data=data, + headers={'content-type': 'application/json'}) + response = self._download_json(request, fatal=True, video_id=video_id, + note='Logging in to Nebula') + return response['key'] def _retrieve_nebula_auth(self, video_id): """ @@ -79,6 +96,11 @@ class NebulaIE(InfoExtractor): # TODO: are these authentication methods, in this order, the best practice for youtube-dl? """ + + username, password = self._get_login_info() + nebula_token = self._perform_login(username, password, video_id) + return nebula_token + nebula_token = self._downloader.params.get('videopassword') if not nebula_token: # TODO: is there a helper to do all this cookie extraction? 
From aeda0f33563d779bec9fa82bf0015d29923c6679 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Fri, 23 Oct 2020 05:43:55 +0200 Subject: [PATCH 10/10] [nebula] Clean up credentials-based authentication --- youtube_dl/extractor/nebula.py | 51 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 6566dc2d2..e5e2b4048 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -15,12 +15,15 @@ class NebulaIE(InfoExtractor): off-YouTube from a small hand-picked group of creators. All videos require a subscription to watch. There are no known freely available videos. An authentication token to - an account with a valid subscription can be specified in multiple ways. + an account with a valid subscription can be specified in multiple ways, including credentials in .netrc or a cookie + jar. + As neither of these parameters appear to be supported by the unit test runner, it's recommended to set the envvar + NEBULA_TOKEN to execute the test runs. Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off video extraction to the Zype extractor. - This description has been last updated on 2020-05-11. + This description has been last updated on 2020-10-22. """ _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() @@ -73,35 +76,44 @@ class NebulaIE(InfoExtractor): def _perform_login(self, username, password, video_id): """ - Perform login to Nebula. + Log in to Nebula, authenticating using a given username and password. - Takes a username (email address) and password. Returns a Nebula token. + Returns a Nebula token, as the frontend would store it in the + nebula-auth cookie. Or False, if authentication fails. 
""" data = json.dumps({'email': username, 'password': password}).encode('utf8') request = sanitized_Request(method='POST', url='https://api.watchnebula.com/api/v1/auth/login/', data=data, headers={'content-type': 'application/json'}) - response = self._download_json(request, fatal=True, video_id=video_id, - note='Logging in to Nebula') + response = self._download_json(request, fatal=False, video_id=video_id, + note='Authenticating to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or 'key' not in response: + return False return response['key'] def _retrieve_nebula_auth(self, video_id): """ - Attempt to find a Nebula API token. Makes multiple attempts in the following order: - a) the --video-password command line argument + Attempt to find a Nebula API token. Makes multiple attempts in the + following order: + a) login credentials used to authenticate to the Nebula login endpoint, + either from .netrc or specified using --username/--password b) the --cookies supplied cookie jar c) the NEBULA_TOKEN environment variable - If none of these are successful, an end user-intended error message is returned, listing some solutions. - - # TODO: are these authentication methods, in this order, the best practice for youtube-dl? + d) the --video-password command line argument (this isn't documented in + the error message, because probably highly unpopular) + If none of these are successful, an end user-intended error message is + raised, listing some solutions. 
""" + nebula_token = None + # option #1: login credentials via .netrc or --username and --password username, password = self._get_login_info() - nebula_token = self._perform_login(username, password, video_id) - return nebula_token + if username and password: + nebula_token = self._perform_login(username, password, video_id) - nebula_token = self._downloader.params.get('videopassword') + # option #2: nebula token via cookie jar if not nebula_token: # TODO: is there a helper to do all this cookie extraction? nebula_cookies = self._get_cookies('https://watchnebula.com') @@ -109,12 +121,19 @@ class NebulaIE(InfoExtractor): if nebula_cookie: nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') + + # option #3: nebula token via environment variable if not nebula_token and 'NEBULA_TOKEN' in os.environ: nebula_token = os.environ.get('NEBULA_TOKEN') + + # option #4: nebula token via --videopassword + if not nebula_token: + nebula_token = self._downloader.params.get('videopassword') + if not nebula_token: raise ExtractorError('Nebula requires an account with an active subscription. ' - 'You can supply a corresponding token by either ' - 'a) finding your nebula-auth cookie and then specifying it via --video-password, or ' + 'You can supply your authentication information by either ' + 'a) storing your credentials in .netrc or supplying them via --username and --password, or ' 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or ' 'c) setting the environment variable NEBULA_TOKEN.') return nebula_token