diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 8adae4644..2f47e21c3 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -22,7 +23,101 @@ from ..utils import ( from ..compat import compat_etree_fromstring -class ARDMediathekIE(InfoExtractor): +class ARDMediathekBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) + + def _parse_media_info(self, media_info, video_id, fsk): + formats = self._extract_formats(media_info, video_id) + + if not formats: + if fsk: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + self.raise_geo_restricted( + 'This video is not available due to geoblocking', + countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'ttml', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': int_or_none(media_info.get('_duration')), + 'thumbnail': media_info.get('_previewImage'), + 'is_live': media_info.get('_isLive') is True, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + if not url_or_none(stream_url): + continue + ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + else: + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + m = re.search( + r'_(?P\d+)x(?P\d+)\.mp4$', + stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + + +class ARDMediathekIE(ARDMediathekBaseIE): IE_NAME = 'ARD:mediathek' _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' @@ -63,94 +158,6 @@ class ARDMediathekIE(InfoExtractor): def suitable(cls, url): return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) - def _extract_media_info(self, media_info_url, webpage, video_id): - media_info = self._download_json( - media_info_url, video_id, 'Downloading media JSON') - - formats = self._extract_formats(media_info, video_id) - - if not formats: - if '"fsk"' in webpage: - raise ExtractorError( - 'This video is only available after 20:00', expected=True) - elif media_info.get('_geoblocked'): - raise ExtractorError('This video is not available due to geo restriction', expected=True) - - self._sort_formats(formats) - - duration = int_or_none(media_info.get('_duration')) - thumbnail = media_info.get('_previewImage') - is_live = media_info.get('_isLive') is True - - subtitles = {} - subtitle_url = media_info.get('_subtitleUrl') - if subtitle_url: - subtitles['de'] = [{ - 'ext': 'ttml', - 'url': subtitle_url, - }] - - return { - 'id': video_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'is_live': is_live, - 'formats': formats, - 'subtitles': subtitles, - } - - def _extract_formats(self, media_info, video_id): - type_ = media_info.get('_type') - media_array = media_info.get('_mediaArray', []) - formats = [] - for num, media in enumerate(media_array): - for stream in media.get('_mediaStreamArray', []): - stream_urls = stream.get('_stream') - if not stream_urls: - continue - if not isinstance(stream_urls, list): - stream_urls = [stream_urls] - quality = stream.get('_quality') - server = stream.get('_server') - for stream_url in stream_urls: - if not url_or_none(stream_url): - continue - ext = determine_ext(stream_url) - if quality != 'auto' and ext in ('f4m', 'm3u8'): - continue - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(stream_url, { - 'hdcore': '3.1.1', - 'plugin': 'aasp-3.1.1.69.124' - }), - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - if server and server.startswith('rtmp'): - f = { - 'url': server, - 'play_path': stream_url, - 'format_id': 'a%s-rtmp-%s' % (num, quality), - } - else: - f = { - 'url': stream_url, - 'format_id': 'a%s-%s-%s' % (num, ext, quality) - } - m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - if type_ == 'audio': - f['vcodec'] = 'none' - formats.append(f) - return formats - def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -302,19 +309,20 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' +class ARDBetaMediathekIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', - 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'title': 'Tatort: Die robuste Roswita', + 'id': '70153354', + 'title': 'Die robuste Roswita', 'description': r're:^Der Mord.*trüber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', - 'upload_date': '20180826', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', + 'timestamp': 1577047500, + 'upload_date': '20191222', 'ext': 'mp4', }, }, { @@ -330,71 +338,69 @@ class ARDBetaMediathekIE(InfoExtractor): video_id = mobj.group('video_id') display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') - data = self._parse_json(data_json, display_id) - - res = { - 'id': video_id, - 'display_id': display_id, + player_page = self._download_json( + 'https://api.ardmediathek.de/public-gateway', + display_id, data=json.dumps({ + 'query': '''{ + playerPage(client:"%s", clipId: "%s") { + blockedByFsk + broadcastedOn + maturityContentRating + mediaCollection { + _duration + _geoblocked + _isLive + _mediaArray { + _mediaStreamArray { + _quality + _server + _stream } - formats = [] - subtitles = {} - geoblocked = False - for widget in data.values(): - if widget.get('_geoblocked') is True: - geoblocked = True - if '_duration' in widget: - res['duration'] = int_or_none(widget['_duration']) - if 'clipTitle' in widget: - res['title'] = widget['clipTitle'] - if '_previewImage' in widget: - res['thumbnail'] = widget['_previewImage'] - if 'broadcastedOn' in widget: - res['timestamp'] = unified_timestamp(widget['broadcastedOn']) - if 'synopsis' in widget: - res['description'] = widget['synopsis'] - subtitle_url = url_or_none(widget.get('_subtitleUrl')) - if subtitle_url: - subtitles.setdefault('de', []).append({ - 'ext': 'ttml', - 'url': subtitle_url, - }) - if '_quality' in widget: - format_url = url_or_none(try_get( - widget, lambda x: x['_stream']['json'][0])) - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=3.11.0', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - fatal=False)) - else: - # HTTP formats are not available when geoblocked is True, - # other formats are fine though - if geoblocked: - continue - quality = str_or_none(widget.get('_quality')) - formats.append({ - 'format_id': ('http-' + quality) if quality else 'http', - 'url': format_url, - 'preference': 10, # Plain HTTP, that's nice - }) - - if not formats and geoblocked: - self.raise_geo_restricted( - msg='This video is not available due to geoblocking', - countries=['DE']) - - self._sort_formats(formats) - res.update({ - 'subtitles': subtitles, - 'formats': formats, + } + _previewImage + _subtitleUrl + _type + } + show { + title + } + synopsis + title + tracking { + atiCustomVars { + contentId + } + } + } +}''' % (mobj.group('client'), video_id), + }).encode(), headers={ + 'Content-Type': 'application/json' + })['data']['playerPage'] + title = player_page['title'] + content_id = str_or_none(try_get( + player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) + media_collection = player_page.get('mediaCollection') or {} + if not media_collection and content_id: + media_collection = self._download_json( + 'https://www.ardmediathek.de/play/media/' + content_id, + content_id, fatal=False) or {} + info = self._parse_media_info( + media_collection, content_id or video_id, + player_page.get('blockedByFsk')) + age_limit = None + description = player_page.get('synopsis') + maturity_content_rating = player_page.get('maturityContentRating') + if maturity_content_rating: + age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) + if not age_limit and description: + age_limit = int_or_none(self._search_regex( + r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + info.update({ + 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), + 'series': try_get(player_page, lambda x: x['show']['title']), }) - - return res + return info diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index fcbdc71b9..b1e20def5 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,39 +47,19 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' _PARTNER_ID = '1719221' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - entry_id = mobj.group('kaltura_id') + host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups() if not entry_id: - api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) - payload = { - 'query': '''query VideoContext($articleId: ID!) { - article: node(id: $articleId) { - ... on Article { - mainAssetRelation { - asset { - ... on VideoAsset { - kalturaId - } - } - } - } - } - }''', - 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, - } - json_data = self._download_json( - api_url, video_id, headers={ - 'Content-Type': 'application/json', - }, - data=json.dumps(payload).encode()) - entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] + entry_id = self._download_json( + self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ + 'variables': json.dumps({ + 'contextId': 'NewsArticle:' + article_id, + }), + })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] return self.url_result( 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py index dfcf9bc6b..73a57b1e4 100644 --- a/youtube_dl/extractor/businessinsider.py +++ b/youtube_dl/extractor/businessinsider.py @@ -9,21 +9,26 @@ class BusinessInsiderIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', - 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'md5': 'ffed3e1e12a6f950aa2f7d83851b497a', 'info_dict': { - 'id': 'hZRllCfw', + 'id': 'cjGDb0X9', 'ext': 'mp4', - 'title': "Here's how much radiation you're exposed to in everyday life", - 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', - 'upload_date': '20170709', - 'timestamp': 1499606400, - }, - 'params': { - 'skip_download': True, + 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", + 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', + 'upload_date': '20160611', + 'timestamp': 1465675620, }, }, { 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', - 'only_matching': True, + 'md5': '43f438dbc6da0b89f5ac42f68529d84a', + 'info_dict': { + 'id': '5zJwd4FK', + 'ext': 'mp4', + 'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort', + 'description': 'md5:2af8975825d38a4fed24717bbe51db49', + 'upload_date': '20170705', + 'timestamp': 1499270528, + }, }, { 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', 'only_matching': True, @@ -35,7 +40,8 @@ class BusinessInsiderIE(InfoExtractor): jwplatform_id = self._search_regex( (r'data-media-id=["\']([a-zA-Z0-9]{8})', r'id=["\']jwplayer_([a-zA-Z0-9]{8})', - r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})', + r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), webpage, 'jwplatform id') return self.url_result( 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index a502e8806..b5a740a01 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -239,7 +239,7 @@ class IviCompilationIE(InfoExtractor): self.url_result( 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) for serie in re.findall( - r']+data-id="\1"' % compilation_id, html)] + r']+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 28baf901c..359dadaa3 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .ard import ARDMediathekIE +from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, get_element_by_attribute, ) -class SRMediathekIE(ARDMediathekIE): +class SRMediathekIE(ARDMediathekBaseIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P[0-9]+)' diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index ae2ac1b42..4dbead2ba 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -5,44 +5,28 @@ from ..utils import int_or_none class StretchInternetIE(InfoExtractor): - _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P\d+)' + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P\d+)' _TEST = { - 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', 'info_dict': { - 'id': '313900', + 'id': '573272', 'ext': 'mp4', - 'title': 'Augustana (S.D.) Baseball vs University of Mary', - 'description': 'md5:7578478614aae3bdd4a90f578f787438', - 'timestamp': 1490468400, - 'upload_date': '20170325', + 'title': 'University of Mary Wrestling vs. Upper Iowa', + 'timestamp': 1575668361, + 'upload_date': '20191206', } } def _real_extract(self, url): video_id = self._match_id(url) - stream = self._download_json( - 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' - % video_id, video_id) - - video_url = 'https://%s' % stream['source'] - event = self._download_json( - 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', - video_id, query={ - 'clientID': 99997, - 'eventID': video_id, - 'token': 'asdf', - })['event'] - - title = event.get('title') or event['mobileTitle'] - description = event.get('customText') - timestamp = int_or_none(event.get('longtime')) + 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, + video_id)[0] return { 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'url': video_url, + 'title': event['title'], + 'timestamp': int_or_none(event.get('dateCreated'), 1000), + 'url': 'https://' + event['media'][0]['url'], } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 1d66eeaff..2830c212e 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -18,7 +18,7 @@ class TwentyFourVideoIE(InfoExtractor): https?:// (?P (?:(?:www|porno)\.)?24video\. - (?:net|me|xxx|sexy?|tube|adult|site) + (?:net|me|xxx|sexy?|tube|adult|site|vip) )/ (?: video/(?:(?:view|xml)/)?| @@ -59,6 +59,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', 'only_matching': True, + }, { + 'url': 'https://www.24video.vip/video/view/1044982', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 59e1359c4..a52e40afa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,17 +1,12 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, int_or_none, - sanitized_Request, + urljoin, ) @@ -26,8 +21,7 @@ class VoiceRepublicIE(InfoExtractor): 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', - 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', - 'duration': 1800, + 'duration': 1556, 'view_count': int, } }, { @@ -38,63 +32,31 @@ class VoiceRepublicIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - req = sanitized_Request( - compat_urlparse.urljoin(url, '/talks/%s' % display_id)) - # Older versions of Firefox get redirected to an "upgrade browser" page - req.add_header('User-Agent', 'youtube-dl') - webpage = self._download_webpage(req, display_id) + webpage = self._download_webpage(url, display_id) if '>Queued for processing, please stand by...<' in webpage: raise ExtractorError( 'Audio is still queued for processing', expected=True) - config = self._search_regex( - r'(?s)return ({.+?});\s*\n', webpage, - 'data', default=None) - data = self._parse_json(config, display_id, fatal=False) if config else None - if data: - title = data['title'] - description = data.get('teaser') - talk_id = compat_str(data.get('talk_id') or display_id) - talk = data['talk'] - duration = int_or_none(talk.get('duration')) - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in talk['links'].items()] - else: - title = self._og_search_title(webpage) - description = self._html_search_regex( - r"(?s)
]*>(.+?)
", - webpage, 'description', fatal=False) - talk_id = self._search_regex( - [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], - webpage, 'talk id', default=None) or display_id - duration = None - player = self._search_regex( - r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] + talk = self._parse_json(self._search_regex( + r'initialSnapshot\s*=\s*({.+?});', + webpage, 'talk'), display_id)['talk'] + title = talk['title'] + formats = [{ + 'url': urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['media_links'].items()] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - view_count = int_or_none(self._search_regex( - r"class='play-count[^']*'>\s*(\d+) plays", - webpage, 'play count', fatal=False)) - return { - 'id': talk_id, + 'id': compat_str(talk.get('id') or display_id), 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, + 'description': talk.get('teaser'), + 'thumbnail': talk.get('image_url'), + 'duration': int_or_none(talk.get('archived_duration')), + 'view_count': int_or_none(talk.get('play_count')), 'formats': formats, }