From f08371c07ced991c580412e0b1672e52a4e1e5b5 Mon Sep 17 00:00:00 2001
From: Niklas Sombert <niklas@ytvwld.de>
Date: Wed, 2 Oct 2019 21:18:36 +0200
Subject: [PATCH] [hhu] Parse video player config

---
 youtube_dl/extractor/hhu.py | 98 ++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 29 deletions(-)
diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py
index 5ecf4a9bb..c994662e2 100644
--- a/youtube_dl/extractor/hhu.py
+++ b/youtube_dl/extractor/hhu.py
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import js_to_json, RegexNotFoundError, urljoin
+
+import json
+import re
 
 
 class HHUIE(InfoExtractor):
@@ -20,53 +24,89 @@ class HHUIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        # TODO: Login for some videos.
+        """Extract video info, preferring the embedded player config."""
         video_id = self._match_id(url)
         webpage, webpage_url = self._download_webpage_handle(url, video_id)
         if webpage_url.geturl().startswith("https://sts."):
             self.raise_login_required()
-        file_id = self._html_search_regex(
-            r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '",
-            webpage, 'file_id'
-        )
-        formats = [
-            ({'url': format_url.format(file_id)})
-            for format_url in (
-                'https://mediathek.hhu.de/movies/{}/v_10.webm',
-                'https://mediathek.hhu.de/movies/{}/v_10.mp4',
-                'https://mediathek.hhu.de/movies/{}/v_50.webm',
-                'https://mediathek.hhu.de/movies/{}/v_50.mp4',
-                'https://mediathek.hhu.de/movies/{}/v_100.webm',
-                'https://mediathek.hhu.de/movies/{}/v_100.mp4',
-            )
-        ]
+            # TODO: support logging in for the videos that require it.
         try:
-            title = self._og_search_title(webpage)
-        except:
+            config_js = self._search_regex(
+                r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js'
+            )
+            # Strip 'link: encodeURI("<our url>"),' from the config: a
+            # JavaScript function call is not valid JSON.  The trailing
+            # comma is optional in case the entry is the last one in the
+            # object.
+            config_js = re.sub(
+                r'link:\s*encodeURI\([^)]*\)\s*,?', '', config_js
+            )
+            config = json.loads(js_to_json(config_js))
+            if len(config['playlist']) > 1:
+                self.report_warning(
+                    'more than one video, just taking the first one'
+                )
+            video = config['playlist'][0]
+            formats = [
+                {
+                    'url': urljoin('https://mediathek.hhu.de/', source['file']),
+                    'format_note': source.get('label'),
+                    'format_id': source['file'].split("/")[-1],
+                }
+                for source in video['sources']
+            ]
+            formats.reverse()  # config sorts from highest to lowest quality
+            title = video.get('title')
+            thumbnail = video.get('image')
+            thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None
+
+        except (RegexNotFoundError, ValueError, LookupError):
+            self.report_warning('failed to get player config, guessing formats')
+            # This will likely work but better warn.
+            file_id = self._html_search_regex(
+                r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '",
+                webpage, 'file_id'
+            )
+            formats = [
+                ({'url': format_url.format(file_id)})
+                for format_url in (
+                    'https://mediathek.hhu.de/movies/{}/v_10.webm',
+                    'https://mediathek.hhu.de/movies/{}/v_10.mp4',
+                    'https://mediathek.hhu.de/movies/{}/v_50.webm',
+                    'https://mediathek.hhu.de/movies/{}/v_50.mp4',
+                    'https://mediathek.hhu.de/movies/{}/v_100.webm',
+                    'https://mediathek.hhu.de/movies/{}/v_100.mp4',
+                )
+            ]
+            title = thumbnail = None
+        # The <h1> lookup below is fatal, so try the Open Graph title first.
+        if not title:
+            title = self._og_search_title(webpage, default=None)
+        if not title:
             title = self._html_search_regex(
                 r'<h1 id="mt_watch-headline-title">\s+(.+?)\s+<\/h1>',
                 webpage, 'title'
             )
-        try:
-            description = self._og_search_description(webpage)
-        except:
-            description = self._html_search_regex(
-                r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>',
-                webpage, 'description', fatal=False
-            )
-        thumbnail = self._og_search_property(
-            'image:secure_url', webpage, 'thumbnail'
+        description = self._html_search_regex(
+            r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>',
+            webpage, 'description', fatal=False
         )
-        uploader_id = self._html_search_regex(
+        if not description:
+            description = self._og_search_description(webpage, default='')
+        if not thumbnail:
+            thumbnail = self._og_search_property(
+                'image:secure_url', webpage, 'thumbnail', fatal=False
+            )
+        uploader = self._html_search_regex(
             r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>',
             webpage, 'uploader', fatal=False
         )
 
         return {
             'id': video_id,
             'title': title,
             'description': description,
-            'uploader_id': uploader_id,
+            'uploader': uploader,
             'thumbnail': thumbnail,
             'formats': formats,
         }