From 4d52506dbefbd4dafd90689fcce820f522bad983 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 9 Oct 2019 21:39:40 +0200 Subject: [PATCH] [hhu] Use _parse_jwplayer_data --- youtube_dl/extractor/hhu.py | 92 ++++++++++++++----------------------- 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index a23ae7d96..13eaca6fc 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -3,10 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate + js_to_json, RegexNotFoundError, get_element_by_id, unified_strdate ) -import json import re @@ -49,88 +48,67 @@ class HHUIE(InfoExtractor): config_js = ( config_js[:encode_begin] + config_js[encode_end + 2:]) del encode_begin, encode_end - config = json.loads(js_to_json(config_js)) - if len(config['playlist']) > 1: - self.report_warning( - 'more than one video, just taking the first one') - video = config['playlist'][0] - formats = [ - { - 'url': urljoin('https://mediathek.hhu.de/', source['file']), - 'format_note': source.get('label'), - 'format_id': source['file'].split("/")[-1], } - for source in video['sources']] - formats.reverse() # config sorts from highest to lowest quality - title = video.get('title') - thumbnail = video.get('image') - thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None - + config = self._parse_json( + config_js, video_id, transform_source=js_to_json) + info = self._parse_jwplayer_data( + config, video_id, require_title=False, + base_url='https://mediathek.hhu.de/') except (RegexNotFoundError, ValueError): self.report_warning('failed to get player config, guessing formats') # This will likely work but better warn. file_id = self._html_search_regex( r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", webpage, 'file_id') - formats = [ - ({'url': format_url.format(file_id)}) - for format_url in ( - 'https://mediathek.hhu.de/movies/{}/v_10.webm', - 'https://mediathek.hhu.de/movies/{}/v_10.mp4', - 'https://mediathek.hhu.de/movies/{}/v_50.webm', - 'https://mediathek.hhu.de/movies/{}/v_50.mp4', - 'https://mediathek.hhu.de/movies/{}/v_100.webm', - 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)] - title = thumbnail = None - if not title: - title = self._html_search_regex( + info = { + 'video_id': video_id, + 'formats': [ + ({'url': format_url.format(file_id)}) + for format_url in ( + 'https://mediathek.hhu.de/movies/{}/v_10.webm', + 'https://mediathek.hhu.de/movies/{}/v_10.mp4', + 'https://mediathek.hhu.de/movies/{}/v_50.webm', + 'https://mediathek.hhu.de/movies/{}/v_50.mp4', + 'https://mediathek.hhu.de/movies/{}/v_100.webm', + 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)]} + if not info.get('title'): + info['title'] = self._html_search_regex( r'

\s+(.+?)\s+<\/h1>', webpage, 'title') - if not title: - title = self._og_search_title(webpage, fatal=False) - description = self._html_search_regex( + if not info.get('title'): + info['title'] = self._og_search_title(webpage, fatal=False) + info['description'] = self._html_search_regex( r'

\s+(.+?)\s+<\/p>', webpage, 'description', fatal=False) - if not description: - description = self._og_search_description(webpage, default='') - if not thumbnail: - thumbnail = self._og_search_property( + if not info.get('description'): + info['description'] = self._og_search_description(webpage, default='') + if not info.get('thumbnail'): + info['thumbnail'] = self._og_search_property( 'image:secure_url', webpage, 'thumbnail', fatal=False) - uploader = self._html_search_regex( + info['uploader'] = self._html_search_regex( r'(.+?)<\/a>', webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( + info['uploader_id'] = self._html_search_regex( r'.+?<\/a>', webpage, 'uploader_id', fatal=False) # CC licenses get a image with an appropriate alt text license_img = get_element_by_id('mt_watch_license', webpage) if license_img: - license = self._search_regex( + info['license'] = self._search_regex( r'alt="(.+)"', license_img, 'license_img', fatal=False) - if not license_img or not license: + if not license_img or not info.get('license'): # other licenses are just text - license = self._html_search_regex( + info['license'] = self._html_search_regex( r'

(.+)<\/div>', webpage, 'license_text', fatal=False) - upload_date = _date(self._html_search_regex( + info['upload_date'] = _date(self._html_search_regex( r'(.+?)<\/span>', webpage, 'upload_date', fatal=False)) category = self._html_search_regex( r'(.+)', webpage, 'category', fatal=False) + info['categories'] = [category] # there's just one category per video tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) - tags = _tags(tags_html) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'license': license, - 'categories': [category], # there's just one category per video - 'tags': tags, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'formats': formats, } + info['tags'] = _tags(tags_html) + return info def _date(str_containing_date):