From 99fd3bf6ad6306d67c146ec51a551abaf1995564 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Tue, 7 Apr 2020 22:05:09 +0200
Subject: [PATCH 01/10] [nebula] Add basic support for Nebula (refs #21258)

---
 AUTHORS                            |   1 +
 docs/supportedsites.md             |   1 +
 youtube_dl/extractor/extractors.py |   3 +-
 youtube_dl/extractor/nebula.py     | 132 +++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 youtube_dl/extractor/nebula.py
diff --git a/AUTHORS b/AUTHORS
index b507cb8df..64ac71249 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -246,3 +246,4 @@ Enes Solak
 Nathan Rossi
 Thomas van der Berg
 Luca Cherubin
+Henrik Heimbuerger
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 174b83bf3..164b1e47e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -542,6 +542,7 @@
  - **ndr:embed**
  - **ndr:embed:base**
  - **NDTV**
+ - **Nebula**
  - **NerdCubedFeed**
  - **netease:album**: 网易云音乐 - 专辑
  - **netease:djradio**: 网易云音乐 - 电台
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e407ab3d9..0e8087d7c 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -692,8 +692,9 @@ from .ndr import (
     NJoyEmbedIE,
 )
 from .ndtv import NDTVIE
-from .netzkino import NetzkinoIE
+from .nebula import NebulaIE
 from .nerdcubed import NerdCubedFeedIE
+from .netzkino import NetzkinoIE
 from .neteasemusic import (
     NetEaseMusicIE,
     NetEaseMusicAlbumIE,
diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
new file mode 100644
index 000000000..e22a4b088
--- /dev/null
+++ b/youtube_dl/extractor/nebula.py
@@ -0,0 +1,132 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')   # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
+
+
+class NebulaIE(InfoExtractor):
+    """
+    Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos
+    off-YouTube from a small hand-picked group of creators.
+
+    All videos require a subscription to watch. There are no known freely available videos. So the test case is
+    disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription).
+
+    Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
+    video extraction to the Zype extractor.
+
+    This description has been last updated on 2020-04-07.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'   # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id()
+    _TEST = {
+        'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
+        'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
+        'info_dict': {
+            'id': '5c271b40b13fd613090034fd',
+            'ext': 'mp4',
+            'title': 'That Time Disney Remade Beauty and the Beast',
+            'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
+            'upload_date': '20180731',
+            'timestamp': 1533009600,
+            #'uploader': 'Lindsay Ellis',   # TODO: removed because unreliable/sometimes incorrect
+        }
+    }
+    _WORKING = False   # this is set to False because the test won't pass without an auth cookie for a (paid) subscription
+
+    def _extract_state_object(self, webpage, display_id):
+        """
+        As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
+        tag. This function is extracting this script tag, parsing it as JSON.
+        """
+        initial_state_object = self._search_regex(r'<script id="initial-app-state" type="application/json">(.+?)</script>', webpage, 'initial_state')
+        metadata = self._parse_json(initial_state_object, video_id=display_id)   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+
+        return metadata
+
+    def _extract_video_metadata(self, state_object, display_id):
+        """
+        The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
+        video ID, we can then extract a dictionary with various meta data about the video itself.
+        """
+        video_id = state_object['videos']['byURL'][display_id]
+        video_meta = state_object['videos']['byID'][video_id]
+
+        return video_id, video_meta
+
+    def _extract_video_url(self, webpage, state_object, video_id):
+        """
+        To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a
+        bit more stable to extract the iframe source that links to the video.
+        """
+        iframe = self._search_regex(r'<iframe(.+?)</iframe>', webpage, 'iframe', fatal=False)
+        video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None
+
+        # fallback: reconstruct using video ID and access token from state object
+        if not video_url:
+            access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
+            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
+
+        return video_url
+
+    def _extract_uploader(self, video_meta):
+        """
+        Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized
+        more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so
+        I'll go with this for now.
+        """
+        return video_meta['categories'][0]['value'][0]
+
+    def _real_extract(self, url):
+        # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
+        if COOKIE_NEBULA_AUTH:
+            self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH)
+
+        # extract the video's display ID from the URL (we'll retrieve the video ID later)
+        display_id = self._match_id(url)
+
+        # download the page
+        webpage = self._download_webpage(url, video_id=display_id)    # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead...
+
+        # extract the state object from the webpage, and then retrieve video meta data from it
+        state_object = self._extract_state_object(webpage, display_id)
+        video_id, video_meta = self._extract_video_metadata(state_object, display_id)
+
+        # extract the video URL from the webpage
+        video_url = self._extract_video_url(webpage, state_object, video_id)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+
+            # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is
+            # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than
+            # whatever the Zype extractor is able to identify
+            '_type': 'url_transparent',
+            'ie_key': 'Zype',
+            'url': video_url,
+
+            # the meta data we were able to extract from Nebula
+            'title': video_meta['title'],
+            'description': video_meta['description'],
+            'timestamp': parse_iso8601(video_meta['published_at']),
+            #'uploader': self._extract_uploader(video_meta),   # TODO: removed because unreliable/sometimes incorrect
+            'thumbnails': [
+                {
+                    'id': tn['name'],   # this appears to be null in all cases I've seen
+                    'url': tn['url'],
+                    'width': tn['width'],
+                    'height': tn['height'],
+                } for tn in video_meta['thumbnails']
+            ],
+            'duration': video_meta['duration'],
+            # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
+            # TODO: channel
+            # TODO: channel_id
+            # TODO: channel_url
+        }

From 1cfcf0b79a03cdf5e7ca81829daebad1a4131e8d Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Thu, 16 Apr 2020 04:34:17 +0200
Subject: [PATCH 02/10] [nebula] Add additional test cases and improve cookie
 envvar handling

---
 youtube_dl/extractor/nebula.py | 61 +++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index e22a4b088..6aa4e1da4 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -23,21 +23,52 @@ class NebulaIE(InfoExtractor):
     This description has been last updated on 2020-04-07.
     """
 
-    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'   # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id()
-    _TEST = {
-        'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
-        'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
-        'info_dict': {
-            'id': '5c271b40b13fd613090034fd',
-            'ext': 'mp4',
-            'title': 'That Time Disney Remade Beauty and the Beast',
-            'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
-            'upload_date': '20180731',
-            'timestamp': 1533009600,
-            #'uploader': 'Lindsay Ellis',   # TODO: removed because unreliable/sometimes incorrect
-        }
-    }
-    _WORKING = False   # this is set to False because the test won't pass without an auth cookie for a (paid) subscription
+    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'   # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id()
+    _TESTS = [
+        {
+            'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
+            'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
+            'info_dict': {
+                'id': '5c271b40b13fd613090034fd',
+                'ext': 'mp4',
+                'title': 'That Time Disney Remade Beauty and the Beast',
+                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
+                'upload_date': '20180731',
+                'timestamp': 1533009600,
+                'channel': 'Lindsay Ellis',
+                'uploader': 'Lindsay Ellis',
+            }
+        },
+        {
+            'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
+            'md5': 'b0b171504d67e2822179149ccd6787db',
+            'info_dict': {
+                'id': '5e7e78171aaf320001fbd6be',
+                'ext': 'mp4',
+                'title': 'Landing Craft - How The Allies Got Ashore',
+                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
+                'upload_date': '20200327',
+                'timestamp': 1585348140,
+                'channel': 'The Logistics of D-Day',
+                'uploader': 'The Logistics of D-Day',
+            }
+        },
+        {
+            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
+            'md5': '98e96346caa3b303fec4493c5d49dcb5',
+            'info_dict': {
+                'id': '5e779ebdd157bc0001d1c75a',
+                'ext': 'mp4',
+                'title': 'Episode 1: The Draw',
+                'description': r're:^There’s free money on offer… if the players can all work together.',
+                'upload_date': '20200323',
+                'timestamp': 1584980400,
+                'channel': 'Tom Scott Presents: Money',
+                'uploader': 'Tom Scott Presents: Money',
+            }
+        },
+    ]
+    _WORKING = True   # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription?
 
     def _extract_state_object(self, webpage, display_id):
         """

From 6daa352a7dea08fecfb227965ee304799f9da0c7 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Thu, 16 Apr 2020 04:35:05 +0200
Subject: [PATCH 03/10] [nebula] Add better channel title extraction (refs
 #21258)

---
 youtube_dl/extractor/nebula.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 6aa4e1da4..828ea1c6f 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -105,13 +105,32 @@ class NebulaIE(InfoExtractor):
 
         return video_url
 
-    def _extract_uploader(self, video_meta):
+    def _extract_channel(self, video_meta):
         """
-        Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized
-        more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so
-        I'll go with this for now.
+        Extract the channel title, by going through the list of categories and finding the first value of the
+        first category that has a value.
+
+        I know this looks like a terrible approach. But actually, it's just reproducing the behavior of the
+        React code the Nebula frontend uses (as of 2020-04-07):
+
+            let channel;
+            if (video && video.categories && video.categories.length) {
+                const channelTitle = video.categories.map((category) => (category.value[0]))
+                                                     .filter((title) => (!!title))[0];
+                channel = getChannelByTitle(state, { title: channelTitle });
+            }
+
+        Basically, it finds the first (truthy) value in the category list and that's assumed to be the
+        channel title. And then the channel details (e.g. the URL) are looked up by title (!) (not by any
+        kind of ID) via an additional API call.
+
+        TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
         """
-        return video_meta['categories'][0]['value'][0]
+        categories = video_meta['categories']
+        for category in categories:
+            if category['value']:
+                return category['value'][0]
+        return None
 
     def _real_extract(self, url):
         # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
@@ -127,6 +146,7 @@ class NebulaIE(InfoExtractor):
         # extract the state object from the webpage, and then retrieve video meta data from it
         state_object = self._extract_state_object(webpage, display_id)
         video_id, video_meta = self._extract_video_metadata(state_object, display_id)
+        channel_title = self._extract_channel(video_meta)
 
         # extract the video URL from the webpage
         video_url = self._extract_video_url(webpage, state_object, video_id)
@@ -146,7 +166,6 @@ class NebulaIE(InfoExtractor):
             'title': video_meta['title'],
             'description': video_meta['description'],
             'timestamp': parse_iso8601(video_meta['published_at']),
-            #'uploader': self._extract_uploader(video_meta),   # TODO: removed because unreliable/sometimes incorrect
             'thumbnails': [
                 {
                     'id': tn['name'],   # this appears to be null in all cases I've seen
@@ -156,8 +175,9 @@ class NebulaIE(InfoExtractor):
                 } for tn in video_meta['thumbnails']
             ],
             'duration': video_meta['duration'],
+            'channel': channel_title,
+            'uploader': channel_title,   # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
             # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
-            # TODO: channel
             # TODO: channel_id
             # TODO: channel_url
         }

From 6c35cac4325d0325d1cb47609b20b97570d331ba Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Sat, 18 Apr 2020 06:15:03 +0200
Subject: [PATCH 04/10] [nebula] Relax meta data lookups

---
 youtube_dl/extractor/nebula.py | 48 ++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 828ea1c6f..038863348 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -4,7 +4,8 @@ from __future__ import unicode_literals
 import os
 
 from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..compat import compat_str
+from ..utils import parse_iso8601, try_get
 
 COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')   # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
 
@@ -74,9 +75,13 @@ class NebulaIE(InfoExtractor):
         """
         As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
         tag. This function is extracting this script tag, parsing it as JSON.
+
+        May return None if no state object could be found or it didn't contain valid JSON.
         """
-        initial_state_object = self._search_regex(r'<script id="initial-app-state" type="application/json">(.+?)</script>', webpage, 'initial_state')
-        metadata = self._parse_json(initial_state_object, video_id=display_id)   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+        initial_state_object = self._search_regex(
+            r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
+            'initial_state', fatal=False, default=None)
+        metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
 
         return metadata
 
@@ -84,9 +89,12 @@ class NebulaIE(InfoExtractor):
         """
         The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
         video ID, we can then extract a dictionary with various meta data about the video itself.
+
+        May return (None, {}) if no state object was given or it didn't contain the expected lookup table or
+        meta data.
         """
-        video_id = state_object['videos']['byURL'][display_id]
-        video_meta = state_object['videos']['byID'][video_id]
+        video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
+        video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
 
         return video_id, video_meta
 
@@ -100,8 +108,10 @@ class NebulaIE(InfoExtractor):
 
         # fallback: reconstruct using video ID and access token from state object
         if not video_url:
-            access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
-            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
+            access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'],
+                                   compat_str)
+            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
+                video_id=video_id, access_token=access_token)
 
         return video_url
 
@@ -125,12 +135,13 @@ class NebulaIE(InfoExtractor):
         kind of ID) via an additional API call.
 
         TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
+
+        May return None if no category list could be found or no category had a label ('value').
         """
-        categories = video_meta['categories']
+        categories = video_meta.get('categories', []) if video_meta else []
         for category in categories:
-            if category['value']:
+            if category.get('value'):   # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well!
                 return category['value'][0]
-        return None
 
     def _real_extract(self, url):
         # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
@@ -163,18 +174,17 @@ class NebulaIE(InfoExtractor):
             'url': video_url,
 
             # the meta data we were able to extract from Nebula
-            'title': video_meta['title'],
-            'description': video_meta['description'],
-            'timestamp': parse_iso8601(video_meta['published_at']),
+            'title': video_meta.get('title'),
+            'description': video_meta.get('description'),
+            'timestamp': parse_iso8601(video_meta.get('published_at')),
             'thumbnails': [
                 {
-                    'id': tn['name'],   # this appears to be null in all cases I've seen
+                    'id': tn.get('name'),   # this appears to be null in all cases I've seen
                     'url': tn['url'],
-                    'width': tn['width'],
-                    'height': tn['height'],
-                } for tn in video_meta['thumbnails']
-            ],
-            'duration': video_meta['duration'],
+                    'width': tn.get('width'),
+                    'height': tn.get('height'),
+                } for tn in video_meta.get('thumbnails', [])],
+            'duration': video_meta.get('duration'),
             'channel': channel_title,
             'uploader': channel_title,   # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
             # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!

From 606409a2043a88daa3d21c0d07ece29704cd5cd3 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Mon, 11 May 2020 05:58:14 +0200
Subject: [PATCH 05/10] [nebula] Rewrite extractor to new frontend (refs
 #21258)

---
 youtube_dl/extractor/nebula.py | 129 ++++++++++++++++++++-------------
 1 file changed, 77 insertions(+), 52 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 038863348..9a6ddf6f3 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -4,10 +4,8 @@ from __future__ import unicode_literals
 import os
 
 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import parse_iso8601, try_get
-
-COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')   # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
+from ..compat import compat_urllib_parse_unquote, compat_str
+from ..utils import parse_iso8601, ExtractorError, try_get
 
 
 class NebulaIE(InfoExtractor):
@@ -15,13 +13,13 @@ class NebulaIE(InfoExtractor):
     Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos
     off-YouTube from a small hand-picked group of creators.
 
-    All videos require a subscription to watch. There are no known freely available videos. So the test case is
-    disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription).
+    All videos require a subscription to watch. There are no known freely available videos. An authentication token to
+    an account with a valid subscription can be specified in multiple ways.
 
     Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
     video extraction to the Zype extractor.
 
-    This description has been last updated on 2020-04-07.
+    This description has been last updated on 2020-05-11.
     """
 
     _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'   # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id()
@@ -61,7 +59,7 @@ class NebulaIE(InfoExtractor):
                 'id': '5e779ebdd157bc0001d1c75a',
                 'ext': 'mp4',
                 'title': 'Episode 1: The Draw',
-                'description': r're:^There’s free money on offer… if the players can all work together.',
+                'description': r'contains:There’s free money on offer… if the players can all work together.',
                 'upload_date': '20200323',
                 'timestamp': 1584980400,
                 'channel': 'Tom Scott Presents: Money',
@@ -71,49 +69,76 @@ class NebulaIE(InfoExtractor):
     ]
     _WORKING = True   # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription?
 
-    def _extract_state_object(self, webpage, display_id):
+    def _retrieve_nebula_auth(self, video_id):
         """
-        As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
-        tag. This function is extracting this script tag, parsing it as JSON.
+        Attempt to find a Nebula API token. Makes multiple attempts in the following order:
+        a) the --video-password command line argument
+        b) the --cookies supplied cookie jar
+        c) the NEBULA_TOKEN environment variable
+        If none of these are successful, an ExtractorError with an end-user-facing message is raised, listing some solutions.
 
-        May return None if no state object could be found or it didn't contain valid JSON.
+        # TODO: are these authentication methods, in this order, the best practice for youtube-dl?
         """
-        initial_state_object = self._search_regex(
-            r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
-            'initial_state', fatal=False, default=None)
-        metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+        nebula_token = self._downloader.params.get('videopassword')
+        if not nebula_token:
+            # TODO: is there a helper to do all this cookie extraction?
+            nebula_cookies = self._get_cookies('https://watchnebula.com')
+            nebula_cookie = nebula_cookies.get('nebula-auth')
+            if nebula_cookie:
+                nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
+                nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken')
+        if not nebula_token and 'NEBULA_TOKEN' in os.environ:
+            nebula_token = os.environ.get('NEBULA_TOKEN')
+        if not nebula_token:
+            raise ExtractorError('Nebula requires an account with an active subscription. '
+                                 'You can supply a corresponding token by either '
+                                 'a) finding your nebula-auth cookie and then specifying it via --video-password, or '
+                                 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or '
+                                 'c) setting the environment variable NEBULA_TOKEN.')
+        return nebula_token
 
-        return metadata
-
-    def _extract_video_metadata(self, state_object, display_id):
+    def _call_zype_api(self, path, params, video_id, api_key):
         """
-        The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
-        video ID, we can then extract a dictionary with various meta data about the video itself.
-
-        May return (None, {}) if no state object was given or it didn't contain the expected lookup table or
-        meta data.
+        A helper for making calls to the Zype API.
         """
-        video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
-        video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
+        query = {'api_key': api_key, 'per_page': 1}
+        query.update(params)
+        return self._download_json('https://api.zype.com' + path, video_id, query=query)
 
-        return video_id, video_meta
-
-    def _extract_video_url(self, webpage, state_object, video_id):
+    def _fetch_zype_video_data(self, display_id, api_key):
         """
-        To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a
-        bit more stable to extract the iframe source that links to the video.
+        Fetch video meta data from the Zype API.
         """
-        iframe = self._search_regex(r'<iframe(.+?)</iframe>', webpage, 'iframe', fatal=False)
-        video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None
+        response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key)
+        if 'response' not in response or len(response['response']) != 1:
+            raise ExtractorError('Unable to find video on Zype API')
+        return response['response'][0]
 
-        # fallback: reconstruct using video ID and access token from state object
-        if not video_url:
-            access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'],
-                                   compat_str)
-            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
-                video_id=video_id, access_token=access_token)
+    def _call_nebula_api(self, path, video_id, access_token):
+        """
+        A helper for making calls to the Nebula API.
+        """
+        return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
+            'Authorization': 'Token {access_token}'.format(access_token=access_token)
+        })
 
-        return video_url
+    def _fetch_zype_access_token(self, video_id, nebula_token):
+        """
+        Requests a Zype access token from the Nebula API.
+        """
+        user_object = self._call_nebula_api('/auth/user', video_id, nebula_token)
+        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
+        if not access_token:
+            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
+        return access_token
+
+    def _build_video_url(self, video_id, zype_access_token):
+        """
+        Construct a Zype video URL (as supported by the Zype extractor), given a Zype video ID and a Zype access token.
+        """
+        return 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
+            video_id=video_id,
+            access_token=zype_access_token)
 
     def _extract_channel(self, video_meta):
         """
@@ -144,23 +169,23 @@ class NebulaIE(InfoExtractor):
                 return category['value'][0]
 
     def _real_extract(self, url):
-        # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
-        if COOKIE_NEBULA_AUTH:
-            self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH)
-
         # extract the video's display ID from the URL (we'll retrieve the video ID later)
         display_id = self._match_id(url)
 
-        # download the page
-        webpage = self._download_webpage(url, video_id=display_id)    # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead...
+        # retrieve Nebula authentication information
+        nebula_token = self._retrieve_nebula_auth(display_id)
 
-        # extract the state object from the webpage, and then retrieve video meta data from it
-        state_object = self._extract_state_object(webpage, display_id)
-        video_id, video_meta = self._extract_video_metadata(state_object, display_id)
+        # fetch video meta data from the Nebula API
+        api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF'   # FIXME: extract from main chunk at runtime
+        video_meta = self._fetch_zype_video_data(display_id, api_key)
+        video_id = video_meta['_id']
+
+        # extract additional info
         channel_title = self._extract_channel(video_meta)
 
-        # extract the video URL from the webpage
-        video_url = self._extract_video_url(webpage, state_object, video_id)
+        # fetch the access token for Zype, then construct the video URL
+        zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token)
+        video_url = self._build_video_url(video_id, zype_access_token)
 
         return {
             'id': video_id,
@@ -179,7 +204,7 @@ class NebulaIE(InfoExtractor):
             'timestamp': parse_iso8601(video_meta.get('published_at')),
             'thumbnails': [
                 {
-                    'id': tn.get('name'),   # this appears to be null in all cases I've seen
+                    'id': tn.get('name'),   # this appears to be null in all cases I've encountered
                     'url': tn['url'],
                     'width': tn.get('width'),
                     'height': tn.get('height'),

From b27ab412eed214368242082116f2477562d32759 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Fri, 15 May 2020 06:14:08 +0200
Subject: [PATCH 06/10] [nebula] Implement Zype API key retrieval from JS chunk

---
 youtube_dl/extractor/nebula.py | 63 +++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 9a6ddf6f3..0ce229ad5 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -5,7 +5,7 @@ import os
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse_unquote, compat_str
-from ..utils import parse_iso8601, ExtractorError, try_get
+from ..utils import parse_iso8601, ExtractorError, try_get, urljoin
 
 
 class NebulaIE(InfoExtractor):
@@ -97,36 +97,81 @@ class NebulaIE(InfoExtractor):
                                  'c) setting the environment variable NEBULA_TOKEN.')
         return nebula_token
 
-    def _call_zype_api(self, path, params, video_id, api_key):
+    def _retrieve_zype_api_key(self, page_url, display_id):
+        """
+        Retrieve the Zype API key required to make calls to the Zype API.
+
+        The Nebula frontend stores this key as a JS object literal in one of its JS chunks,
+        looking somewhat like this (but minified):
+
+            return {
+                NODE_ENV: "production",
+                REACT_APP_NAME: "Nebula",
+                REACT_APP_NEBULA_API: "https://api.watchnebula.com/api/v1/",
+                REACT_APP_ZYPE_API: "https://api.zype.com/",
+                REACT_APP_ZYPE_API_KEY: "<redacted>",
+                REACT_APP_ZYPE_APP_KEY: "<redacted>",
+                // ...
+            }
+
+        The chunk's file name is hashed (and the hash changes when a new frontend release is made), so
+        the script reference is looked up in the video page at page_url first; the chunk is then
+        downloaded and the value of REACT_APP_ZYPE_API_KEY is extracted from it. Both lookups are fatal.
+
+        The key is currently shared among all users and hasn't been seen to change so far, so
+        hardcoding it in the extractor would be a possible fallback. Returns the key as a string.
+        """
+        # fetch the video page
+        webpage = self._download_webpage(page_url, video_id=display_id)
+
+        # find the script tag with a file named 'main.<hash>.chunk.js' (dots escaped to match literally)
+        main_script_relpath = self._search_regex(
+            r'<script[^>]*src="(?P<script_relpath>[^"]*main\.[0-9a-f]*\.chunk\.js)"[^>]*>', webpage,
+            group='script_relpath', name='script relative path', fatal=True)
+
+        # fetch the JS chunk (the script path may be relative, so resolve it against the page URL)
+        main_script_abspath = urljoin(page_url, main_script_relpath)
+        main_script = self._download_webpage(main_script_abspath, video_id=display_id,
+                                             note='Retrieving Zype API key')
+
+        # find the API key named 'REACT_APP_ZYPE_API_KEY'; require a non-empty value so that an empty key fails here
+        api_key = self._search_regex(
+            r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]+)"', main_script,
+            group='api_key', name='API key', fatal=True)
+
+        return api_key
+
+    def _call_zype_api(self, path, params, video_id, api_key, note):
         """
         A helper for making calls to the Zype API.
         """
         query = {'api_key': api_key, 'per_page': 1}
         query.update(params)
-        return self._download_json('https://api.zype.com' + path, video_id, query=query)
+        return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
 
     def _fetch_zype_video_data(self, display_id, api_key):
         """
         Fetch video meta data from the Zype API.
         """
-        response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key)
+        response = self._call_zype_api('/videos', {'friendly_title': display_id},
+                                       display_id, api_key, note='Retrieving metadata from Zype')
         if 'response' not in response or len(response['response']) != 1:
             raise ExtractorError('Unable to find video on Zype API')
         return response['response'][0]
 
-    def _call_nebula_api(self, path, video_id, access_token):
+    def _call_nebula_api(self, path, video_id, access_token, note):
         """
         A helper for making calls to the Nebula API.
         """
         return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
             'Authorization': 'Token {access_token}'.format(access_token=access_token)
-        })
+        }, note=note)
 
     def _fetch_zype_access_token(self, video_id, nebula_token):
         """
         Requests a Zype access token from the Nebula API.
         """
-        user_object = self._call_nebula_api('/auth/user', video_id, nebula_token)
+        user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token')
         access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
         if not access_token:
             raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
@@ -176,7 +221,7 @@ class NebulaIE(InfoExtractor):
         nebula_token = self._retrieve_nebula_auth(display_id)
 
         # fetch video meta data from the Nebula API
-        api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF'   # FIXME: extract from main chunk at runtime
+        api_key = self._retrieve_zype_api_key(url, display_id)
         video_meta = self._fetch_zype_video_data(display_id, api_key)
         video_id = video_meta['_id']
 
@@ -184,7 +229,7 @@ class NebulaIE(InfoExtractor):
         channel_title = self._extract_channel(video_meta)
 
         # fetch the access token for Zype, then construct the video URL
-        zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token)
+        zype_access_token = self._fetch_zype_access_token(display_id, nebula_token=nebula_token)
         video_url = self._build_video_url(video_id, zype_access_token)
 
         return {

From 97fe7cd386e2dff88a927a7fd28f3bd144248b79 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Tue, 2 Jun 2020 04:57:37 +0200
Subject: [PATCH 07/10] [nebula] Improve performance by avoiding redirect

---
 youtube_dl/extractor/nebula.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 0ce229ad5..9a2828e19 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -171,7 +171,7 @@ class NebulaIE(InfoExtractor):
         """
         Requests a Zype access token from the Nebula API.
         """
-        user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token')
+        user_object = self._call_nebula_api('/auth/user/', video_id, nebula_token, note='Retrieving Zype access token')
         access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
         if not access_token:
             raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')

From 4411fcec81352f1bfb5b74e014dde0eb8489e3d7 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Sat, 3 Oct 2020 05:41:27 +0200
Subject: [PATCH 08/10] [nebula] Update test video checksums

---
 youtube_dl/extractor/nebula.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 9a2828e19..4b8cca8fd 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -40,7 +40,7 @@ class NebulaIE(InfoExtractor):
         },
         {
             'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
-            'md5': 'b0b171504d67e2822179149ccd6787db',
+            'md5': '6d4edd14ce65720fa63aba5c583fb328',
             'info_dict': {
                 'id': '5e7e78171aaf320001fbd6be',
                 'ext': 'mp4',
@@ -54,7 +54,7 @@ class NebulaIE(InfoExtractor):
         },
         {
             'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
-            'md5': '98e96346caa3b303fec4493c5d49dcb5',
+            'md5': '8c7d272910eea320f6f8e6d3084eecf5',
             'info_dict': {
                 'id': '5e779ebdd157bc0001d1c75a',
                 'ext': 'mp4',

From 8e3842e156ab562d7e73650bca718d91ee4b5046 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Sat, 3 Oct 2020 06:08:30 +0200
Subject: [PATCH 09/10] [nebula] Implement PoC of netrc authentication

---
 youtube_dl/extractor/nebula.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 4b8cca8fd..6566dc2d2 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -1,11 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import os
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse_unquote, compat_str
-from ..utils import parse_iso8601, ExtractorError, try_get, urljoin
+from ..utils import parse_iso8601, ExtractorError, try_get, urljoin, sanitized_Request
 
 
 class NebulaIE(InfoExtractor):
@@ -68,6 +69,22 @@ class NebulaIE(InfoExtractor):
         },
     ]
     _WORKING = True   # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription?
+    _NETRC_MACHINE = 'watchnebula'
+
+    def _perform_login(self, username, password, video_id):
+        """
+        Perform login to Nebula.
+
+        Takes a username (email address) and password. Returns a Nebula token.
+        """
+        data = json.dumps({'email': username, 'password': password}).encode('utf8')
+        request = sanitized_Request(method='POST',
+                                    url='https://api.watchnebula.com/api/v1/auth/login/',
+                                    data=data,
+                                    headers={'content-type': 'application/json'})
+        response = self._download_json(request, fatal=True, video_id=video_id,
+                                       note='Logging in to Nebula')
+        return response['key']
 
     def _retrieve_nebula_auth(self, video_id):
         """
@@ -79,6 +96,11 @@ class NebulaIE(InfoExtractor):
 
         # TODO: are these authentication methods, in this order, the best practice for youtube-dl?
         """
+
+        username, password = self._get_login_info()
+        nebula_token = self._perform_login(username, password, video_id)
+        return nebula_token
+
         nebula_token = self._downloader.params.get('videopassword')
         if not nebula_token:
             # TODO: is there a helper to do all this cookie extraction?

From aeda0f33563d779bec9fa82bf0015d29923c6679 Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Fri, 23 Oct 2020 05:43:55 +0200
Subject: [PATCH 10/10] [nebula] Clean up credentials-based authentication

---
 youtube_dl/extractor/nebula.py | 51 +++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 6566dc2d2..e5e2b4048 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -15,12 +15,15 @@ class NebulaIE(InfoExtractor):
     off-YouTube from a small hand-picked group of creators.
 
     All videos require a subscription to watch. There are no known freely available videos. An authentication token to
-    an account with a valid subscription can be specified in multiple ways.
+    an account with a valid subscription can be specified in multiple ways, including credentials in .netrc or a cookie
+    jar.
+    As neither of these parameters appears to be supported by the unit test runner, it's recommended to set the envvar
+    NEBULA_TOKEN to execute the test runs.
 
     Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
     video extraction to the Zype extractor.
 
-    This description has been last updated on 2020-05-11.
+    This description has been last updated on 2020-10-22.
     """
 
     _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'   # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id()
@@ -73,35 +76,44 @@ class NebulaIE(InfoExtractor):
 
     def _perform_login(self, username, password, video_id):
         """
-        Perform login to Nebula.
+        Log in to Nebula, authenticating using a given username and password.
 
-        Takes a username (email address) and password. Returns a Nebula token.
+        Returns a Nebula token, as the frontend would store it in the
+        nebula-auth cookie. Or False, if authentication fails.
         """
         data = json.dumps({'email': username, 'password': password}).encode('utf8')
         request = sanitized_Request(method='POST',
                                     url='https://api.watchnebula.com/api/v1/auth/login/',
                                     data=data,
                                     headers={'content-type': 'application/json'})
-        response = self._download_json(request, fatal=True, video_id=video_id,
-                                       note='Logging in to Nebula')
+        response = self._download_json(request, fatal=False, video_id=video_id,
+                                       note='Authenticating to Nebula with supplied credentials',
+                                       errnote='Authentication failed or rejected')
+        if not response or 'key' not in response:
+            return False
         return response['key']
 
     def _retrieve_nebula_auth(self, video_id):
         """
-        Attempt to find a Nebula API token. Makes multiple attempts in the following order:
-        a) the --video-password command line argument
+        Attempt to find a Nebula API token. Makes multiple attempts in the
+        following order:
+        a) login credentials used to authenticate to the Nebula login endpoint,
+           either from .netrc or specified using --username/--password
         b) the --cookies supplied cookie jar
         c) the NEBULA_TOKEN environment variable
-        If none of these are successful, an end user-intended error message is returned, listing some solutions.
-
-        # TODO: are these authentication methods, in this order, the best practice for youtube-dl?
+        d) the --video-password command line argument (deliberately not mentioned
+           in the error message, because it is probably highly unpopular)
+        If none of these are successful, an end user-intended error message is
+        raised, listing some solutions.
         """
+        nebula_token = None
 
+        # option #1: login credentials via .netrc or --username and --password
         username, password = self._get_login_info()
-        nebula_token = self._perform_login(username, password, video_id)
-        return nebula_token
+        if username and password:
+            nebula_token = self._perform_login(username, password, video_id)
 
-        nebula_token = self._downloader.params.get('videopassword')
+        # option #2: nebula token via cookie jar
         if not nebula_token:
             # TODO: is there a helper to do all this cookie extraction?
             nebula_cookies = self._get_cookies('https://watchnebula.com')
@@ -109,12 +121,19 @@ class NebulaIE(InfoExtractor):
             if nebula_cookie:
                 nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
                 nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken')
+
+        # option #3: nebula token via environment variable
         if not nebula_token and 'NEBULA_TOKEN' in os.environ:
             nebula_token = os.environ.get('NEBULA_TOKEN')
+
+        # option #4: nebula token via --video-password
+        if not nebula_token:
+            nebula_token = self._downloader.params.get('videopassword')
+
         if not nebula_token:
             raise ExtractorError('Nebula requires an account with an active subscription. '
-                                 'You can supply a corresponding token by either '
-                                 'a) finding your nebula-auth cookie and then specifying it via --video-password, or '
+                                 'You can supply your authentication information by either '
+                                 'a) storing your credentials in .netrc or supplying them via --username and --password, or '
                                  'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or '
                                  'c) setting the environment variable NEBULA_TOKEN.')
         return nebula_token