From f08371c07ced991c580412e0b1672e52a4e1e5b5 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 2 Oct 2019 21:18:36 +0200 Subject: [PATCH] [hhu] Parse video player config --- youtube_dl/extractor/hhu.py | 98 ++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index 5ecf4a9bb..c994662e2 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import js_to_json, RegexNotFoundError, urljoin + +import json +import re class HHUIE(InfoExtractor): @@ -20,53 +24,89 @@ class HHUIE(InfoExtractor): } def _real_extract(self, url): - # TODO: Login for some videos. video_id = self._match_id(url) webpage, webpage_url = self._download_webpage_handle(url, video_id) if webpage_url.geturl().startswith("https://sts."): self.raise_login_required() - file_id = self._html_search_regex( - r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", - webpage, 'file_id' - ) - formats = [ - ({'url': format_url.format(file_id)}) - for format_url in ( - 'https://mediathek.hhu.de/movies/{}/v_10.webm', - 'https://mediathek.hhu.de/movies/{}/v_10.mp4', - 'https://mediathek.hhu.de/movies/{}/v_50.webm', - 'https://mediathek.hhu.de/movies/{}/v_50.mp4', - 'https://mediathek.hhu.de/movies/{}/v_100.webm', - 'https://mediathek.hhu.de/movies/{}/v_100.mp4', - ) - ] + # Some videos need a login, maybe TODO. try: - title = self._og_search_title(webpage) - except: + config_js = self._search_regex( + r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js' + ) + # remove 'link: encodeURI(""),' + if 'link: encodeURI' in config_js: + encode_begin = config_js.find('link: encodeURI') + encode_end = config_js.find(')', encode_begin) + config_js = ( + config_js[:encode_begin] + config_js[encode_end + 2:] + ) + del encode_begin, encode_end + config = json.loads(js_to_json(config_js)) + if len(config['playlist']) > 1: + self.report_warning( + 'more than one video, just taking the first one' + ) + video = config['playlist'][0] + formats = [ + { + 'url': urljoin('https://mediathek.hhu.de/', source['file']), + 'format_note': source.get('label'), + 'format_id': source['file'].split("/")[-1], + } + for source in video['sources'] + ] + formats.reverse() # config sorts from highest to lowest quality + title = video.get('title') + thumbnail = video.get('image') + thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None + + except (RegexNotFoundError, ValueError): + self.report_warning('failed to get player config, guessing formats') + # This will likely work but better warn. + file_id = self._html_search_regex( + r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", + webpage, 'file_id' + ) + formats = [ + ({'url': format_url.format(file_id)}) + for format_url in ( + 'https://mediathek.hhu.de/movies/{}/v_10.webm', + 'https://mediathek.hhu.de/movies/{}/v_10.mp4', + 'https://mediathek.hhu.de/movies/{}/v_50.webm', + 'https://mediathek.hhu.de/movies/{}/v_50.mp4', + 'https://mediathek.hhu.de/movies/{}/v_100.webm', + 'https://mediathek.hhu.de/movies/{}/v_100.mp4', + ) + ] + title = thumbnail = None + if not title: title = self._html_search_regex( r'

\s+(.+?)\s+<\/h1>', webpage, 'title' ) - try: - description = self._og_search_description(webpage) - except: - description = self._html_search_regex( - r'

\s+(.+?)\s+<\/p>', - webpage, 'description', fatal=False - ) - thumbnail = self._og_search_property( - 'image:secure_url', webpage, 'thumbnail' + if not title: + title = self._og_search_title(webpage, fatal=False) + description = self._html_search_regex( + r'

\s+(.+?)\s+<\/p>', + webpage, 'description', fatal=False ) - uploader_id = self._html_search_regex( + if not description: + description = self._og_search_description(webpage, default='') + if not thumbnail: + thumbnail = self._og_search_property( + 'image:secure_url', webpage, 'thumbnail', fatal=False + ) + uploader = self._html_search_regex( r'(.+?)<\/a>', webpage, 'uploader', fatal=False ) + return { 'id': video_id, 'title': title, 'description': description, - 'uploader_id': uploader_id, + 'uploader': uploader, 'thumbnail': thumbnail, 'formats': formats, }