From 4d52506dbefbd4dafd90689fcce820f522bad983 Mon Sep 17 00:00:00 2001
From: Niklas Sombert <niklas@ytvwld.de>
Date: Wed, 9 Oct 2019 21:39:40 +0200
Subject: [PATCH] [hhu] Use _parse_jwplayer_data

---
 youtube_dl/extractor/hhu.py | 92 ++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 57 deletions(-)
diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py
index a23ae7d96..13eaca6fc 100644
--- a/youtube_dl/extractor/hhu.py
+++ b/youtube_dl/extractor/hhu.py
@@ -3,10 +3,9 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate
+    js_to_json, RegexNotFoundError, get_element_by_id, unified_strdate
 )
 
-import json
 import re
 
 
@@ -49,88 +48,67 @@ class HHUIE(InfoExtractor):
                 config_js = (
                     config_js[:encode_begin] + config_js[encode_end + 2:])
                 del encode_begin, encode_end
-            config = json.loads(js_to_json(config_js))
-            if len(config['playlist']) > 1:
-                self.report_warning(
-                    'more than one video, just taking the first one')
-            video = config['playlist'][0]
-            formats = [
-                {
-                    'url': urljoin('https://mediathek.hhu.de/', source['file']),
-                    'format_note': source.get('label'),
-                    'format_id': source['file'].split("/")[-1], }
-                for source in video['sources']]
-            formats.reverse()  # config sorts from highest to lowest quality
-            title = video.get('title')
-            thumbnail = video.get('image')
-            thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None
-
+            config = self._parse_json(
+                config_js, video_id, transform_source=js_to_json)
+            info = self._parse_jwplayer_data(
+                config, video_id, require_title=False,
+                base_url='https://mediathek.hhu.de/')
         except (RegexNotFoundError, ValueError):
             self.report_warning('failed to get player config, guessing formats')
             # This will likely work but better warn.
             file_id = self._html_search_regex(
                 r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '",
                 webpage, 'file_id')
-            formats = [
-                ({'url': format_url.format(file_id)})
-                for format_url in (
-                    'https://mediathek.hhu.de/movies/{}/v_10.webm',
-                    'https://mediathek.hhu.de/movies/{}/v_10.mp4',
-                    'https://mediathek.hhu.de/movies/{}/v_50.webm',
-                    'https://mediathek.hhu.de/movies/{}/v_50.mp4',
-                    'https://mediathek.hhu.de/movies/{}/v_100.webm',
-                    'https://mediathek.hhu.de/movies/{}/v_100.mp4',)]
-            title = thumbnail = None
-        if not title:
-            title = self._html_search_regex(
+            info = {
+                'video_id': video_id,
+                'formats': [
+                    ({'url': format_url.format(file_id)})
+                    for format_url in (
+                        'https://mediathek.hhu.de/movies/{}/v_10.webm',
+                        'https://mediathek.hhu.de/movies/{}/v_10.mp4',
+                        'https://mediathek.hhu.de/movies/{}/v_50.webm',
+                        'https://mediathek.hhu.de/movies/{}/v_50.mp4',
+                        'https://mediathek.hhu.de/movies/{}/v_100.webm',
+                        'https://mediathek.hhu.de/movies/{}/v_100.mp4',)]}
+        if not info.get('title'):
+            info['title'] = self._html_search_regex(
                 r'<h1 id="mt_watch-headline-title">\s+(.+?)\s+<\/h1>',
                 webpage, 'title')
-        if not title:
-            title = self._og_search_title(webpage, fatal=False)
-        description = self._html_search_regex(
+        if not info.get('title'):
+            info['title'] = self._og_search_title(webpage, fatal=False)
+        info['description'] = self._html_search_regex(
             r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>',
             webpage, 'description', fatal=False)
-        if not description:
-            description = self._og_search_description(webpage, default='')
-        if not thumbnail:
-            thumbnail = self._og_search_property(
+        if not info.get('description'):
+            info['description'] = self._og_search_description(webpage, default='')
+        if not info.get('thumbnail'):
+            info['thumbnail'] = self._og_search_property(
                 'image:secure_url', webpage, 'thumbnail', fatal=False)
-        uploader = self._html_search_regex(
+        info['uploader'] = self._html_search_regex(
             r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>',
             webpage, 'uploader', fatal=False)
-        uploader_id = self._html_search_regex(
+        info['uploader_id'] = self._html_search_regex(
             r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href="/user/(.+)">.+?<\/a>',
             webpage, 'uploader_id', fatal=False)
         # CC licenses get a image with an appropriate alt text
         license_img = get_element_by_id('mt_watch_license', webpage)
         if license_img:
-            license = self._search_regex(
+            info['license'] = self._search_regex(
                 r'alt="(.+)"', license_img, 'license_img', fatal=False)
-        if not license_img or not license:
+        if not license_img or not info.get('license'):
             # other licenses are just text
-            license = self._html_search_regex(
+            info['license'] = self._html_search_regex(
                 r'<div id="mt_content_placeholder_videotabs_mt_videotabs_formview_video_license" class="video-license">(.+)<\/div>',
                 webpage, 'license_text', fatal=False)
-        upload_date = _date(self._html_search_regex(
+        info['upload_date'] = _date(self._html_search_regex(
             r'<span class="watch-information-date added">(.+?)<\/span>',
             webpage, 'upload_date', fatal=False))
         category = self._html_search_regex(
             r'<a href="/category/.+">(.+)</a>', webpage, 'category', fatal=False)
+        info['categories'] = [category]  # there's just one category per video
         tags_html = get_element_by_id('mt_watch_info_tag_list', webpage)
-        tags = _tags(tags_html)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'license': license,
-            'categories': [category],  # there's just one category per video
-            'tags': tags,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'upload_date': upload_date,
-            'thumbnail': thumbnail,
-            'formats': formats, }
+        info['tags'] = _tags(tags_html)
+        return info
 
 
 def _date(str_containing_date):