From 9f07fb23820cc962e365c4f712c9da7c81e9841d Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 2 Oct 2019 21:18:57 +0200 Subject: [PATCH] [hhu] Add more details --- youtube_dl/extractor/hhu.py | 57 +++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index c994662e2..ca4a36e18 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json, RegexNotFoundError, urljoin +from ..utils import ( + js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate +) import json import re @@ -18,7 +20,18 @@ class HHUIE(InfoExtractor): 'ext': 'mp4', 'title': 'Das Multimediazentrum', 'description': '', + 'categories': ['Imagefilme'], + 'tags': [ + 'MMZ', 'Multimediazentrum', 'Heinrich-Heine-Universität', + 'UKD', 'eLearning', 'Abstimmsysteme', 'Portale', + 'Studierendenportal', 'Lehrfilme', 'Lehrfilm', + 'Operationsfilme', 'Vorlesungsaufzeichnung', 'Multimedia', + 'ZIM', 'HHU', 'Ute', 'Clames', # yes, that's incorrect + ], + 'uploader': 'clames', 'uploader_id': 'clames', + 'license': 'CC BY 3.0 DE', + 'upload_date': '20150126', 'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg', } } @@ -100,13 +113,53 @@ class HHUIE(InfoExtractor): r'(.+?)<\/a>', webpage, 'uploader', fatal=False ) - + uploader_id = self._html_search_regex( + r'.+?<\/a>', + webpage, 'uploader_id', fatal=False + ) + # CC licenses get a image with an appropriate alt text + license_img = get_element_by_id('mt_watch_license', webpage) + if license_img: + license = self._search_regex( + r'alt="(.+)"', license_img, 'license_img', fatal=False + ) + if not license_img or not license: + # other licenses are just text + license = self._html_search_regex( + r'
(.+)<\/div>', + webpage, 'license_text', fatal=False + ) + upload_date = _date(self._html_search_regex( + r'(.+?)<\/span>', + webpage, 'upload_date', fatal=False + )) + category = self._html_search_regex( + r'(.+)', webpage, 'category', fatal=False + ) + tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) + tags = _tags(tags_html) return { 'id': video_id, 'title': title, 'description': description, + 'license': license, + 'categories': [category], # there's just one category per video + 'tags': tags, 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, 'thumbnail': thumbnail, 'formats': formats, } + + +def _date(str_containing_date): + """Parse the string 'at (M)M/(D)D/YYYY' to YYYYMMDD.""" + return unified_strdate(str_containing_date.split(' ')[1], day_first=False) + + +def _tags(tags_html): + """Parse the HTML markup containing the tags.""" + matches = re.findall(r'(.+)<\/a>', tags_html) + return [match.rstrip(',') for match in matches]