1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-29 19:47:54 +01:00

[hhu] Use _parse_jwplayer_data

This commit is contained in:
Niklas Sombert 2019-10-09 21:39:40 +02:00
parent dada9f6db9
commit 4d52506dbe

View File

@ -3,10 +3,9 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate js_to_json, RegexNotFoundError, get_element_by_id, unified_strdate
) )
import json
import re import re
@ -49,29 +48,20 @@ class HHUIE(InfoExtractor):
config_js = ( config_js = (
config_js[:encode_begin] + config_js[encode_end + 2:]) config_js[:encode_begin] + config_js[encode_end + 2:])
del encode_begin, encode_end del encode_begin, encode_end
config = json.loads(js_to_json(config_js)) config = self._parse_json(
if len(config['playlist']) > 1: config_js, video_id, transform_source=js_to_json)
self.report_warning( info = self._parse_jwplayer_data(
'more than one video, just taking the first one') config, video_id, require_title=False,
video = config['playlist'][0] base_url='https://mediathek.hhu.de/')
formats = [
{
'url': urljoin('https://mediathek.hhu.de/', source['file']),
'format_note': source.get('label'),
'format_id': source['file'].split("/")[-1], }
for source in video['sources']]
formats.reverse() # config sorts from highest to lowest quality
title = video.get('title')
thumbnail = video.get('image')
thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None
except (RegexNotFoundError, ValueError): except (RegexNotFoundError, ValueError):
self.report_warning('failed to get player config, guessing formats') self.report_warning('failed to get player config, guessing formats')
# This will likely work but better warn. # This will likely work but better warn.
file_id = self._html_search_regex( file_id = self._html_search_regex(
r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '",
webpage, 'file_id') webpage, 'file_id')
formats = [ info = {
'video_id': video_id,
'formats': [
({'url': format_url.format(file_id)}) ({'url': format_url.format(file_id)})
for format_url in ( for format_url in (
'https://mediathek.hhu.de/movies/{}/v_10.webm', 'https://mediathek.hhu.de/movies/{}/v_10.webm',
@ -79,58 +69,46 @@ class HHUIE(InfoExtractor):
'https://mediathek.hhu.de/movies/{}/v_50.webm', 'https://mediathek.hhu.de/movies/{}/v_50.webm',
'https://mediathek.hhu.de/movies/{}/v_50.mp4', 'https://mediathek.hhu.de/movies/{}/v_50.mp4',
'https://mediathek.hhu.de/movies/{}/v_100.webm', 'https://mediathek.hhu.de/movies/{}/v_100.webm',
'https://mediathek.hhu.de/movies/{}/v_100.mp4',)] 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)]}
title = thumbnail = None if not info.get('title'):
if not title: info['title'] = self._html_search_regex(
title = self._html_search_regex(
r'<h1 id="mt_watch-headline-title">\s+(.+?)\s+<\/h1>', r'<h1 id="mt_watch-headline-title">\s+(.+?)\s+<\/h1>',
webpage, 'title') webpage, 'title')
if not title: if not info.get('title'):
title = self._og_search_title(webpage, fatal=False) info['title'] = self._og_search_title(webpage, fatal=False)
description = self._html_search_regex( info['description'] = self._html_search_regex(
r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>', r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>',
webpage, 'description', fatal=False) webpage, 'description', fatal=False)
if not description: if not info.get('description'):
description = self._og_search_description(webpage, default='') info['description'] = self._og_search_description(webpage, default='')
if not thumbnail: if not info.get('thumbnail'):
thumbnail = self._og_search_property( info['thumbnail'] = self._og_search_property(
'image:secure_url', webpage, 'thumbnail', fatal=False) 'image:secure_url', webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex( info['uploader'] = self._html_search_regex(
r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>', r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>',
webpage, 'uploader', fatal=False) webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex( info['uploader_id'] = self._html_search_regex(
r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href="/user/(.+)">.+?<\/a>', r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href="/user/(.+)">.+?<\/a>',
webpage, 'uploader_id', fatal=False) webpage, 'uploader_id', fatal=False)
# CC licenses get an image with an appropriate alt text # CC licenses get an image with an appropriate alt text
license_img = get_element_by_id('mt_watch_license', webpage) license_img = get_element_by_id('mt_watch_license', webpage)
if license_img: if license_img:
license = self._search_regex( info['license'] = self._search_regex(
r'alt="(.+)"', license_img, 'license_img', fatal=False) r'alt="(.+)"', license_img, 'license_img', fatal=False)
if not license_img or not license: if not license_img or not info.get('license'):
# other licenses are just text # other licenses are just text
license = self._html_search_regex( info['license'] = self._html_search_regex(
r'<div id="mt_content_placeholder_videotabs_mt_videotabs_formview_video_license" class="video-license">(.+)<\/div>', r'<div id="mt_content_placeholder_videotabs_mt_videotabs_formview_video_license" class="video-license">(.+)<\/div>',
webpage, 'license_text', fatal=False) webpage, 'license_text', fatal=False)
upload_date = _date(self._html_search_regex( info['upload_date'] = _date(self._html_search_regex(
r'<span class="watch-information-date added">(.+?)<\/span>', r'<span class="watch-information-date added">(.+?)<\/span>',
webpage, 'upload_date', fatal=False)) webpage, 'upload_date', fatal=False))
category = self._html_search_regex( category = self._html_search_regex(
r'<a href="/category/.+">(.+)</a>', webpage, 'category', fatal=False) r'<a href="/category/.+">(.+)</a>', webpage, 'category', fatal=False)
info['categories'] = [category] # there's just one category per video
tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) tags_html = get_element_by_id('mt_watch_info_tag_list', webpage)
tags = _tags(tags_html) info['tags'] = _tags(tags_html)
return info
return {
'id': video_id,
'title': title,
'description': description,
'license': license,
'categories': [category], # there's just one category per video
'tags': tags,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'thumbnail': thumbnail,
'formats': formats, }
def _date(str_containing_date): def _date(str_containing_date):