1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-26 18:34:32 +01:00

[hhu] Add more details

This commit is contained in:
Niklas Sombert 2019-10-02 21:18:57 +02:00
parent f08371c07c
commit 9f07fb2382

View File

@ -2,7 +2,9 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import js_to_json, RegexNotFoundError, urljoin
from ..utils import (
js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate
)
import json
import re
@ -18,7 +20,18 @@ class HHUIE(InfoExtractor):
'ext': 'mp4',
'title': 'Das Multimediazentrum',
'description': '',
'categories': ['Imagefilme'],
'tags': [
'MMZ', 'Multimediazentrum', 'Heinrich-Heine-Universität',
'UKD', 'eLearning', 'Abstimmsysteme', 'Portale',
'Studierendenportal', 'Lehrfilme', 'Lehrfilm',
'Operationsfilme', 'Vorlesungsaufzeichnung', 'Multimedia',
'ZIM', 'HHU', 'Ute', 'Clames', # yes, that's incorrect
],
'uploader': 'clames',
'uploader_id': 'clames',
'license': 'CC BY 3.0 DE',
'upload_date': '20150126',
'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg',
}
}
@ -100,13 +113,53 @@ class HHUIE(InfoExtractor):
r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>',
webpage, 'uploader', fatal=False
)
uploader_id = self._html_search_regex(
r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href="/user/(.+)">.+?<\/a>',
webpage, 'uploader_id', fatal=False
)
# CC licenses get an image with an appropriate alt text
license_img = get_element_by_id('mt_watch_license', webpage)
if license_img:
license = self._search_regex(
r'alt="(.+)"', license_img, 'license_img', fatal=False
)
if not license_img or not license:
# other licenses are just text
license = self._html_search_regex(
r'<div id="mt_content_placeholder_videotabs_mt_videotabs_formview_video_license" class="video-license">(.+)<\/div>',
webpage, 'license_text', fatal=False
)
upload_date = _date(self._html_search_regex(
r'<span class="watch-information-date added">(.+?)<\/span>',
webpage, 'upload_date', fatal=False
))
category = self._html_search_regex(
r'<a href="/category/.+">(.+)</a>', webpage, 'category', fatal=False
)
tags_html = get_element_by_id('mt_watch_info_tag_list', webpage)
tags = _tags(tags_html)
return {
'id': video_id,
'title': title,
'description': description,
'license': license,
'categories': [category], # there's just one category per video
'tags': tags,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'thumbnail': thumbnail,
'formats': formats,
}
def _date(str_containing_date):
    """Parse the string 'at (M)M/(D)D/YYYY' to YYYYMMDD.

    The second space-separated token of the input is taken as the date and
    handed to ``unified_strdate`` with ``day_first=False`` (month before day).

    Returns None when the input is missing or has no second token. The
    caller obtains the string from a non-fatal regex search, so None is a
    legitimate input and must not abort the whole extraction.
    """
    if not str_containing_date:
        return None
    parts = str_containing_date.split(' ')
    if len(parts) < 2:
        # No date token after the leading word; nothing to parse.
        return None
    return unified_strdate(parts[1], day_first=False)
def _tags(tags_html):
    """Parse the HTML markup containing the tags.

    Every ``<a ...>text</a>`` element contributes one tag; trailing commas
    in the link text are stripped.

    Returns an empty list when ``tags_html`` is None or empty, which happens
    when the tag list element could not be located on the page.
    """
    if not tags_html:
        return []
    # Bound the opening tag with [^>]* and keep the text group non-greedy so
    # that several anchors on the same line yield one match each instead of
    # being swallowed into a single match.
    matches = re.findall(r'<a\b[^>]*>(.+?)<\/a>', tags_html)
    return [match.rstrip(',') for match in matches]