From d4bb825b83a87813f54d007febd79d2f3dcee7b9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 9 Oct 2019 11:07:46 +0100 Subject: [PATCH 001/154] [globo] fix format extraction(closes #20319) --- youtube_dl/extractor/globo.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index fb8f7679b..b9c400a57 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -96,6 +96,8 @@ class GloboIE(InfoExtractor): video = self._download_json( 'http://api.globovideos.com/videos/%s/playlist' % video_id, video_id)['videos'][0] + if video.get('encrypted') is True: + raise ExtractorError('This video is DRM protected.', expected=True) title = video['title'] @@ -109,8 +111,8 @@ class GloboIE(InfoExtractor): security = self._download_json( 'http://security.video.globo.com/videos/%s/hash' % video_id, video_id, 'Downloading security hash for %s' % resource_id, query={ - 'player': 'flash', - 'version': '17.0.0.132', + 'player': 'desktop', + 'version': '5.19.1', 'resource_id': resource_id, }) @@ -122,19 +124,18 @@ class GloboIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, message), expected=True) continue - hash_code = security_hash[:2] - received_time = security_hash[2:12] - received_random = security_hash[12:22] - received_md5 = security_hash[22:] + assert security_hash[:2] in ('04', '14') + received_time = security_hash[3:13] + received_md5 = security_hash[24:] sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() + md5_data = (received_md5 + sign_time + padding + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 + signed_hash = security_hash[:23] + sign_time + padding + 
signed_md5 - signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') + signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', From 1907f06e7b0689840b75810e5ad2683581f83924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 Oct 2019 00:11:41 +0700 Subject: [PATCH 002/154] [kaltura] Fix embed info strip (refs #22658) --- youtube_dl/extractor/kaltura.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 0a733424c..1c486c038 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -151,7 +151,8 @@ class KalturaIE(InfoExtractor): if mobj: embed_info = mobj.groupdict() for k, v in embed_info.items(): - embed_info[k] = v.strip() + if v: + embed_info[k] = v.strip() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_url = re.search( From 07b50f616e407c8b7b2c183298acbb58e2ddf09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 Oct 2019 00:24:03 +0700 Subject: [PATCH 003/154] [kaltura] Fix service URL extraction (closes #22658) --- youtube_dl/extractor/kaltura.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 1c486c038..2d38b758b 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -155,11 +155,11 @@ class KalturaIE(InfoExtractor): embed_info[k] = v.strip() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) - service_url = re.search( - r']+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % 
(escaped_pid, escaped_pid), + service_mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), webpage) - if service_url: - url = smuggle_url(url, {'service_url': service_url.group(1)}) + if service_mobj: + url = smuggle_url(url, {'service_url': service_mobj.group('id')}) return url def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): From 2765c47a8c4e7154fa0a9be0bb63f3bcba592b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 Oct 2019 03:40:01 +0700 Subject: [PATCH 004/154] [promptfile] Remove extractor (closes #6239) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/promptfile.py | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/promptfile.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d3e433c3..f393683da 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -893,7 +893,6 @@ from .puhutv import ( PuhuTVSerieIE, ) from .presstv import PressTVIE -from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .puls4 import Puls4IE from .pyvideo import PyvideoIE diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py deleted file mode 100644 index 23ac93d7e..000000000 --- a/youtube_dl/extractor/promptfile.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - urlencode_postdata, -) - - -class PromptFileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P[0-9A-Z\-]+)' - _TEST = { - 'url': 'http://www.promptfile.com/l/86D1CE8462-576CAAE416', - 'md5': '5a7e285a26e0d66d9a263fae91bc92ce', - 'info_dict': { - 'id': '86D1CE8462-576CAAE416', - 'ext': 'mp4', - 'title': 
'oceans.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if re.search(r'(?!We are).+[^-]', webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - chash = self._search_regex( - r'val\("([^"]*)"\s*\+\s*\$\("#chash"\)', webpage, 'chash') - fields = self._hidden_inputs(webpage) - keys = list(fields.keys()) - chash_key = keys[0] if len(keys) == 1 else next( - key for key in keys if key.startswith('cha')) - fields[chash_key] = chash + fields[chash_key] - - webpage = self._download_webpage( - url, video_id, 'Downloading video page', - data=urlencode_postdata(fields), - headers={'Content-type': 'application/x-www-form-urlencoded'}) - - video_url = self._search_regex( - (r']+href=(["\'])(?P(?:(?!\1).)+)\1[^>]*>\s*Download File', - r']+href=(["\'])(?Phttps?://(?:www\.)?promptfile\.com/file/(?:(?!\1).)+)\1'), - webpage, 'video url', group='url') - title = self._html_search_regex( - r'', webpage, 'title') - thumbnail = self._html_search_regex( - r'
.*button>.*? Date: Thu, 10 Oct 2019 00:01:37 +0100 Subject: [PATCH 005/154] [vessel] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 6 -- youtube_dl/extractor/vessel.py | 157 ----------------------------- 3 files changed, 164 deletions(-) delete mode 100644 youtube_dl/extractor/vessel.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f393683da..7a1e0dad6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1282,7 +1282,6 @@ from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE -from .vessel import VesselIE from .vesti import VestiIE from .vevo import ( VevoIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d1725d98b..ec43c5ae4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -77,7 +77,6 @@ from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE -from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE @@ -2491,11 +2490,6 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - # Look for Vessel embeds - vessel_urls = VesselIE._extract_urls(webpage) - if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) - # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py deleted file mode 100644 index 31eee0ba7..000000000 --- a/youtube_dl/extractor/vessel.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_iso8601, - sanitized_Request, -) - - -class VesselIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P[0-9a-zA-Z-_]+)' - _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' - _LOGIN_URL = 'https://www.vessel.com/api/account/login' - _NETRC_MACHINE = 'vessel' - _TESTS = [{ - 'url': 'https://www.vessel.com/videos/HDN7G5UMs', - 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', - 'info_dict': { - 'id': 'HDN7G5UMs', - 'ext': 'mp4', - 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20150317', - 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', - 'timestamp': int, - }, - }, { - 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', - 'only_matching': True, - }, { - 'url': 'https://www.vessel.com/videos/F01_dsLj1', - 'only_matching': True, - }, { - 'url': 'https://www.vessel.com/videos/RRX-sir-J', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r']+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z-_]+.*?)\1', - webpage)] - - @staticmethod - def make_json_request(url, data): - payload = json.dumps(data).encode('utf-8') - req = sanitized_Request(url, payload) - req.add_header('Content-Type', 'application/json; charset=utf-8') - return req - - @staticmethod - def find_assets(data, asset_type, asset_id=None): - for asset in data.get('assets', []): - if not asset.get('type') == asset_type: - continue - elif asset_id is not None and not asset.get('id') == asset_id: - continue - else: - yield asset - - def _check_access_rights(self, data): - access_info = data.get('__view', {}) - if not access_info.get('allow_access', True): - err_code = access_info.get('error_code') 
or '' - if err_code == 'ITEM_PAID_ONLY': - raise ExtractorError( - 'This video requires subscription.', expected=True) - else: - raise ExtractorError( - 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - self.report_login() - data = { - 'client_id': 'web', - 'type': 'password', - 'user_key': username, - 'password': password, - } - login_request = VesselIE.make_json_request(self._LOGIN_URL, data) - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) - asset_id = data['model']['data']['id'] - - req = VesselIE.make_json_request( - self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) - data = self._download_json(req, video_id) - video_asset_id = data.get('main_video_asset') - - self._check_access_rights(data) - - try: - video_asset = next( - VesselIE.find_assets(data, 'video', asset_id=video_asset_id)) - except StopIteration: - raise ExtractorError('No video assets found') - - formats = [] - for f in video_asset.get('sources', []): - location = f.get('location') - if not location: - continue - name = f.get('name') - if name == 'hls-index': - formats.extend(self._extract_m3u8_formats( - location, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False)) - elif name == 'dash-index': - formats.extend(self._extract_mpd_formats( - location, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': name, - 'tbr': f.get('bitrate'), - 'height': f.get('height'), - 'width': f.get('width'), - 'url': location, - }) - self._sort_formats(formats) - - thumbnails = [] - for im_asset in 
VesselIE.find_assets(data, 'image'): - thumbnails.append({ - 'url': im_asset['location'], - 'width': im_asset.get('width', 0), - 'height': im_asset.get('height', 0), - }) - - return { - 'id': video_id, - 'title': data['title'], - 'formats': formats, - 'thumbnails': thumbnails, - 'description': data.get('short_description'), - 'duration': data.get('duration'), - 'comment_count': data.get('comment_count'), - 'like_count': data.get('like_count'), - 'view_count': data.get('view_count'), - 'timestamp': parse_iso8601(data.get('released_at')), - } From 311ee457314359662c975cd29f2ee58ad068db49 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 14 Oct 2019 18:36:25 +0100 Subject: [PATCH 006/154] [nbc] switch to graphql api(closes #18581)(closes #22693)(closes #22701) --- youtube_dl/extractor/nbc.py | 39 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3282f84ee..10680b202 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( smuggle_url, - try_get, update_url_query, int_or_none, ) @@ -85,27 +84,41 @@ class NBCIE(AdobePassIE): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) response = self._download_json( - 'https://api.nbc.com/v3/videos', video_id, query={ - 'filter[permalink]': permalink, - 'fields[videos]': 'description,entitlement,episodeNumber,guid,keywords,seasonNumber,title,vChipRating', - 'fields[shows]': 'shortTitle', - 'include': 'show.shortTitle', + 'https://friendship.nbc.co/v2/graphql', video_id, query={ + 'query': '''{ + page(name: "%s", platform: web, type: VIDEO, userId: "0") { + data { + ... 
on VideoPageData { + description + episodeNumber + keywords + locked + mpxAccountId + mpxGuid + rating + seasonNumber + secondaryTitle + seriesShortTitle + } + } + } +}''' % permalink, }) - video_data = response['data'][0]['attributes'] + video_data = response['data']['page']['data'] query = { 'mbr': 'true', 'manifest': 'm3u', } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': + video_id = video_data['mpxGuid'] + title = video_data['secondaryTitle'] + if video_data.get('locked'): resource = self._get_mvpd_resource( 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) + video_data.get('rating')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'nbcentertainment', resource) theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + 'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id), query), {'force_smil_url': True}) return { '_type': 'url_transparent', @@ -117,7 +130,7 @@ class NBCIE(AdobePassIE): 'season_number': int_or_none(video_data.get('seasonNumber')), 'episode_number': int_or_none(video_data.get('episodeNumber')), 'episode': title, - 'series': try_get(response, lambda x: x['included'][0]['attributes']['shortTitle']), + 'series': video_data.get('seriesShortTitle'), 'ie_key': 'ThePlatform', } From a1ee23e98fe2ec80b8726829927fcae1267e76b1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 14 Oct 2019 18:37:35 +0100 Subject: [PATCH 007/154] [vimeo] fix VHX embed extraction --- youtube_dl/extractor/vimeo.py | 97 ++++------------------------------- 1 file changed, 9 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ddf375c6c..5dc38e243 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -23,7 +23,6 @@ from ..utils import ( NO_DEFAULT, OnDemandPagedList, 
parse_filesize, - qualities, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -211,6 +210,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_uploader_url = owner.get('url') return { + 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, @@ -730,7 +730,6 @@ class VimeoIE(VimeoBaseInfoExtractor): channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { - 'id': video_id, 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, @@ -1061,7 +1060,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): if source_format: info_dict['formats'].append(source_format) self._vimeo_sort_formats(info_dict['formats']) - info_dict['id'] = video_id return info_dict @@ -1115,94 +1113,17 @@ class VimeoLikesIE(VimeoChannelIE): return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id) -class VHXEmbedIE(InfoExtractor): +class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' - def _call_api(self, video_id, access_token, path='', query=None): - return self._download_json( - 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ - 'Authorization': 'Bearer ' + access_token, - }, query=query) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - credentials = self._parse_json(self._search_regex( - r'(?s)credentials\s*:\s*({.+?}),', webpage, - 'config'), video_id, js_to_json) - access_token = credentials['access_token'] - - query = {} - for k, v in credentials.items(): - if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': - if k == 'authUserToken': - query['auth_user_token'] = v - else: - query[k] = v - files = self._call_api(video_id, access_token, '/files', query) - - formats = [] - for 
f in files: - href = try_get(f, lambda x: x['_links']['source']['href']) - if not href: - continue - method = f.get('method') - if method == 'hls': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif method == 'dash': - formats.extend(self._extract_mpd_formats( - href, video_id, mpd_id='dash', fatal=False)) - else: - fmt = { - 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), - 'format_id': 'http', - 'preference': 1, - 'url': href, - 'vcodec': f.get('codec'), - } - quality = f.get('quality') - if quality: - fmt.update({ - 'format_id': 'http-' + quality, - 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), - }) - formats.append(fmt) - self._sort_formats(formats) - - video_data = self._call_api(video_id, access_token) - title = video_data.get('title') or video_data['name'] - - subtitles = {} - for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: - lang = subtitle.get('srclang') or subtitle.get('label') - for _link in subtitle.get('_links', {}).values(): - href = _link.get('href') - if not href: - continue - subtitles.setdefault(lang, []).append({ - 'url': href, - }) - - q = qualities(['small', 'medium', 'large', 'source']) - thumbnails = [] - for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'preference': q(thumbnail_id), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(video_data.get('created_at')), - 'view_count': int_or_none(video_data.get('plays_count')), - } + config_url = self._parse_json(self._search_regex( + r'window\.OTTData\s*=\s*({.+})', webpage, + 'ott data'), 
video_id, js_to_json)['config_url'] + config = self._download_json(config_url, video_id) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info From 7e05df71b7d8c0e1ea9beafff48275ef3c9e27d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 15 Oct 2019 00:10:22 +0100 Subject: [PATCH 008/154] [nexx] handle result list(closes #22666) --- youtube_dl/extractor/nexx.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 82d526c22..f9aad83c4 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -295,13 +295,23 @@ class NexxIE(InfoExtractor): video = None + def find_video(result): + if isinstance(result, dict): + return result + elif isinstance(result, list): + vid = int(video_id) + for v in result: + if try_get(v, lambda x: x['general']['ID'], int) == vid: + return v + return None + response = self._download_json( 'https://arc.nexx.cloud/api/video/%s.json' % video_id, video_id, fatal=False) if response and isinstance(response, dict): result = response.get('result') - if result and isinstance(result, dict): - video = result + if result: + video = find_video(result) # not all videos work via arc, e.g. 
nexx:741:1269984 if not video: @@ -348,7 +358,7 @@ class NexxIE(InfoExtractor): request_token = hashlib.md5( ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() - video = self._call_api( + result = self._call_api( domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', 'addInteractionOptions': '1', @@ -363,6 +373,7 @@ class NexxIE(InfoExtractor): 'X-Request-CID': cid, 'X-Request-Token': request_token, }) + video = find_video(result) general = video['general'] title = general['title'] From 2af01c0293db53dc80c552df3986d0e088b65b76 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 15 Oct 2019 15:18:51 +0100 Subject: [PATCH 009/154] [bokecc] improve player params extraction(closes #22638) --- youtube_dl/extractor/bokecc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 86a7f4d7d..6017e8344 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -11,8 +11,8 @@ from ..utils import ExtractorError class BokeCCBaseIE(InfoExtractor): def _extract_bokecc_formats(self, webpage, video_id, format_id=None): player_params_str = self._html_search_regex( - r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', - webpage, 'player params') + r'<(?:script|embed)[^>]+src=(?P["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P.+?)(?P=q)', + webpage, 'player params', group='query') player_params = compat_parse_qs(player_params_str) @@ -36,9 +36,9 @@ class BokeCCIE(BokeCCBaseIE): _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P.*)' _TESTS = [{ - 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', 'info_dict': { - 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'id': 
'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', 'ext': 'flv', 'title': 'BokeCC Video', }, From 30eb05cb41d95a73f7baff8da9ec1d6a50b08f50 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 15 Oct 2019 19:54:53 +0100 Subject: [PATCH 010/154] [globo] extract subtitles(closes #22713) --- youtube_dl/extractor/globo.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index b9c400a57..9ad1d95fb 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -102,10 +102,18 @@ class GloboIE(InfoExtractor): title = video['title'] formats = [] + subtitles = {} for resource in video['resources']: resource_id = resource.get('_id') resource_url = resource.get('url') - if not resource_id or not resource_url: + resource_type = resource.get('type') + if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'): + continue + + if resource_type == 'subtitle': + subtitles.setdefault(resource.get('language') or 'por', []).append({ + 'url': resource_url, + }) continue security = self._download_json( @@ -165,7 +173,8 @@ class GloboIE(InfoExtractor): 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } From 974311b5aa1a53564a00915b9228af30e2a5b40d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 15 Oct 2019 21:01:59 +0100 Subject: [PATCH 011/154] [vimeo] improve album videos id extraction(closes #22599) --- youtube_dl/extractor/vimeo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5dc38e243..9abd59d98 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -938,7 +938,7 @@ class VimeoAlbumIE(VimeoChannelIE): def _fetch_page(self, album_id, authorizaion, hashed_pass, page): api_page = page + 1 query = { 
- 'fields': 'link', + 'fields': 'link,uri', 'page': api_page, 'per_page': self._PAGE_SIZE, } @@ -953,7 +953,9 @@ class VimeoAlbumIE(VimeoChannelIE): link = video.get('link') if not link: continue - yield self.url_result(link, VimeoIE.ie_key(), VimeoIE._match_id(link)) + uri = video.get('uri') + video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None + yield self.url_result(link, VimeoIE.ie_key(), video_id) def _real_extract(self, url): album_id = self._match_id(url) From 173190f5e3946173daea0539cf0e749cb14acd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Oct 2019 03:25:13 +0700 Subject: [PATCH 012/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ChangeLog b/ChangeLog index 80681a9ae..8a59398d9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,39 @@ +version + +Core +* [extractor/common] Make _is_valid_url more relaxed + +Extractors +* [vimeo] Improve album videos id extraction (#22599) ++ [globo] Extract subtitles (#22713) +* [bokecc] Improve player params extraction (#22638) +* [nexx] Handle result list (#22666) +* [vimeo] Fix VHX embed extraction +* [nbc] Switch to graphql API (#18581, #22693, #22701) +- [vessel] Remove extractor +- [promptfile] Remove extractor (#6239) +* [kaltura] Fix service URL extraction (#22658) +* [kaltura] Fix embed info strip (#22658) +* [globo] Fix format extraction (#20319) +* [redtube] Improve metadata extraction (#22492, #22615) +* [pornhub:uservideos:upload] Fix extraction (#22619) ++ [telequebec:squat] Add support for squat.telequebec.tv (#18503) +- [wimp] Remove extractor (#22088, #22091) ++ [gfycat] Extend URL regular expression (#22225) ++ [chaturbate] Extend URL regular expression (#22309) +* [peertube] Update instances (#22414) ++ [telequebec] Add support for coucou.telequebec.tv (#22482) ++ [xvideos] Extend URL regular expression (#22471) +- [youtube] Remove support 
for invidious.enkirton.net (#22543) ++ [openload] Add support for oload.monster (#22592) +* [nrktv:seriebase] Fix extraction (#22596) ++ [youtube] Add support for yt.lelux.fi (#22597) +* [orf:tvthek] Make manifest requests non fatal (#22578) +* [teachable] Skip login when already logged in (#22572) +* [viewlift] Improve extraction (#22545) +* [nonktube] Fix extraction (#22544) + + version 2019.09.28 Core From 7815d6b74373feb90d969b5fcde7df11702fa5d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Oct 2019 03:26:47 +0700 Subject: [PATCH 013/154] release 2019.10.16 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 +--- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2fea0120e..5cd9f0dc0 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 
2019.09.28 + [debug] youtube-dl version 2019.10.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 6116acc79..6cc34796a 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 79d1a7f3c..0b7911e79 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 9bda3d440..a6f417d38 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided 
URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.28 + [debug] youtube-dl version 2019.10.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 581344917..3fe753b62 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 8a59398d9..dc5c32a1f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.10.16 Core * [extractor/common] Make _is_valid_url more relaxed diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 35275278b..0cbad28ea 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -694,7 +694,6 @@ - **PornoXO** - **PornTube** - **PressTV** - - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **puhutv** - **puhutv:serie** @@ -884,6 +883,7 @@ - **TeleQuebec** - **TeleQuebecEmission** - **TeleQuebecLive** + - **TeleQuebecSquat** - **TeleTask** - **Telewebion** - **TennisTV** @@ -991,7 +991,6 @@ - **VeeHD** - 
**Veoh** - **verystream** - - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** @@ -1090,7 +1089,6 @@ - **Weibo** - **WeiboMobile** - **WeiqiTV**: WQTV - - **Wimp** - **Wistia** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c3eafb068..53889b7cb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.28' +__version__ = '2019.10.16' From 6d394a66f54216cc2b0b68fadd958eaf455c2778 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 Oct 2019 12:03:46 +0100 Subject: [PATCH 014/154] [atresplayer] fix extraction(closes #16277)(closes #16716) --- youtube_dl/extractor/atresplayer.py | 213 +++++++++------------------- 1 file changed, 64 insertions(+), 149 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index ae1c09427..b96218f6c 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -1,202 +1,117 @@ from __future__ import unicode_literals -import time -import hmac -import hashlib import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - float_or_none, int_or_none, - sanitized_Request, urlencode_postdata, - xpath_text, ) class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P.+?)_\d+\.html' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' _TESTS = [ { - 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', - 'md5': 'efd56753cda1bb64df52a3074f62e38a', + 'url': 
'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', 'info_dict': { - 'id': 'capitulo-10-especial-solidario-nochebuena', + 'id': '5d4aa2c57ed1a88fc715a615', 'ext': 'mp4', - 'title': 'Especial Solidario de Nochebuena', - 'description': 'md5:e2d52ff12214fa937107d21064075bf1', - 'duration': 5527.6, - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Capítulo 7: Asuntos pendientes', + 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', + 'duration': 3413, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'This video is only available for registered users' }, { - 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', - 'md5': '6e52cbb513c405e403dbacb7aacf8747', - 'info_dict': { - 'id': 'capitulo-112-david-bustamante', - 'ext': 'flv', - 'title': 'David Bustamante', - 'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', - 'duration': 1439.0, - 'thumbnail': r're:^https?://.*\.jpg$', - }, + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, }, { - 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', 'only_matching': True, }, ] - - _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J' - _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)' - _TIMESTAMP_SHIFT = 30000 - - _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json' - _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json' - _PLAYER_URL_TEMPLATE = 
'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s' - _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s' - - _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' - - _ERRORS = { - 'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', - 'DELETED': 'This video has expired and is no longer available for online streaming.', - 'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', - # 'PREMIUM': 'PREMIUM', - } + _API_BASE = 'https://api.atresplayer.com/' def _real_initialize(self): self._login() + def _handle_error(self, e, code): + if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: + error = self._parse_json(e.cause.read(), None) + if error.get('error') == 'required_registered': + self.raise_login_required() + raise ExtractorError(error['error_description'], expected=True) + raise + def _login(self): username, password = self._get_login_info() if username is None: return - login_form = { - 'j_username': username, - 'j_password': password, - } + self._request_webpage( + self._API_BASE + 'login', None, 'Downloading login page') - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in') + try: + target_url = self._download_json( + 'https://account.atresmedia.com/api/login', None, + 'Logging in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=urlencode_postdata({ + 'username': username, + 'password': password, + }))['targetUrl'] + except ExtractorError as e: + self._handle_error(e, 400) - error = self._html_search_regex( - r'(?s)]+class="[^"]*\blist_error\b[^"]*">(.+?)', - response, 'error', default=None) - if error: - raise ExtractorError( - 'Unable to login: %s' % error, expected=True) + self._request_webpage(target_url, None, 'Following Target 
URL') def _real_extract(self, url): - video_id = self._match_id(url) + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) + try: + episode = self._download_json( + self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + except ExtractorError as e: + self._handle_error(e, 403) - episode_id = self._search_regex( - r'episode="([^"]+)"', webpage, 'episode id') - - request = sanitized_Request( - self._PLAYER_URL_TEMPLATE % episode_id, - headers={'User-Agent': self._USER_AGENT}) - player = self._download_json(request, episode_id, 'Downloading player JSON') - - episode_type = player.get('typeOfEpisode') - error_message = self._ERRORS.get(episode_type) - if error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + title = episode['titulo'] formats = [] - video_url = player.get('urlVideo') - if video_url: - format_info = { - 'url': video_url, - 'format_id': 'http', - } - mobj = re.search(r'(?P\d+)K_(?P\d+)x(?P\d+)', video_url) - if mobj: - format_info.update({ - 'width': int_or_none(mobj.group('width')), - 'height': int_or_none(mobj.group('height')), - 'tbr': int_or_none(mobj.group('bitrate')), - }) - formats.append(format_info) - - timestamp = int_or_none(self._download_webpage( - self._TIME_API_URL, - video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) - timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) - token = hmac.new( - self._MAGIC.encode('ascii'), - (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5 - ).hexdigest() - - request = sanitized_Request( - self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), - headers={'User-Agent': self._USER_AGENT}) - - fmt_json = self._download_json( - request, video_id, 'Downloading windows video JSON') - - result = fmt_json.get('resultDes') - if result.lower() != 'ok': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, 
result), expected=True) - - for format_id, video_url in fmt_json['resultObject'].items(): - if format_id == 'token' or not video_url.startswith('http'): + for source in episode.get('sources', []): + src = source.get('src') + if not src: continue - if 'geodeswowsmpra3player' in video_url: - # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) - # this videos are protected by DRM, the f4m downloader doesn't support them - continue - video_url_hd = video_url.replace('free_es', 'es') - formats.extend(self._extract_f4m_formats( - video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds', - fatal=False)) - formats.extend(self._extract_mpd_formats( - video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash', - fatal=False)) + src_type = source.get('type') + if src_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif src_type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) - path_data = player.get('pathData') - - episode = self._download_xml( - self._EPISODE_URL_TEMPLATE % path_data, video_id, - 'Downloading episode XML') - - duration = float_or_none(xpath_text( - episode, './media/asset/info/technical/contentDuration', 'duration')) - - art = episode.find('./media/asset/info/art') - title = xpath_text(art, './name', 'title') - description = xpath_text(art, './description', 'description') - thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') - - subtitles = {} - subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle_url: - subtitles['es'] = [{ - 'ext': 'srt', - 'url': subtitle_url, - }] + heartbeat = episode.get('heartbeat') or {} + omniture = episode.get('omniture') or {} + get_meta = lambda x: heartbeat.get(x) or 
omniture.get(x) return { + 'display_id': display_id, 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, + 'description': episode.get('descripcion'), + 'thumbnail': episode.get('imgPoster'), + 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'subtitles': subtitles, + 'channel': get_meta('channel'), + 'season': get_meta('season'), + 'episode_number': int_or_none(get_meta('episodeNumber')), } From e29e96a9f5bc390789d176d509f592e208aa30d8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 Oct 2019 15:06:48 +0100 Subject: [PATCH 015/154] [dumpert] fix extraction(closes #22428)(closes #22564) --- youtube_dl/extractor/dumpert.py | 83 +++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index be2e3d378..d9d9afdec 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -1,20 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_b64decode from ..utils import ( + int_or_none, qualities, - sanitized_Request, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?Phttps?)://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P[0-9]+/[0-9a-zA-Z]+)' + _VALID_URL = r'(?Phttps?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P[0-9]+[/_][0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', + 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', 'info_dict': { 'id': '6646981/951bc60f', @@ -24,46 +21,60 @@ class DumpertIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', } }, { - 'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', + 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', + 'only_matching': True, + }, { + 'url': 'http://legacy.dumpert.nl/mediabase/6646981/951bc60f', 
+ 'only_matching': True, + }, { + 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - protocol = mobj.group('protocol') - - url = '%s://www.dumpert.nl/mediabase/%s' % (protocol, video_id) - req = sanitized_Request(url) - req.add_header('Cookie', 'nsfw=1; cpc=10') - webpage = self._download_webpage(req, video_id) - - files_base64 = self._search_regex( - r'data-files="([^"]+)"', webpage, 'data files') - - files = self._parse_json( - compat_b64decode(files_base64).decode('utf-8'), - video_id) + video_id = self._match_id(url).replace('_', '/') + item = self._download_json( + 'http://api-live.dumpert.nl/mobile_api/json/info/' + video_id.replace('/', '_'), + video_id)['items'][0] + title = item['title'] + media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') quality = qualities(['flv', 'mobile', 'tablet', '720p']) - - formats = [{ - 'url': video_url, - 'format_id': format_id, - 'quality': quality(format_id), - } for format_id, video_url in files.items() if format_id != 'still'] + formats = [] + for variant in media.get('variants', []): + uri = variant.get('uri') + if not uri: + continue + version = variant.get('version') + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': quality(version), + }) self._sort_formats(formats) - title = self._html_search_meta( - 'title', webpage) or self._og_search_title(webpage) - description = self._html_search_meta( - 'description', webpage) or self._og_search_description(webpage) - thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + thumbnails = [] + stills = item.get('stills') or {} + for t in ('thumb', 'still'): + for s in ('', '-medium', '-large'): + still_id = t + s + still_url = stills.get(still_id) + if not still_url: + continue + thumbnails.append({ + 'id': still_id, + 'url': still_url, + }) + + stats = item.get('stats') or {} return { 'id': 
video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats + 'description': item.get('description'), + 'thumbnails': thumbnails, + 'formats': formats, + 'duration': int_or_none(media.get('duration')), + 'like_count': int_or_none(stats.get('kudos_total')), + 'view_count': int_or_none(stats.get('views_total')), } From 2b115b9460502944d6088cf42810c440495128a3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 Oct 2019 15:41:58 +0100 Subject: [PATCH 016/154] [servingsys] Remove extractor(closes #22639) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/servingsys.py | 72 ------------------------------ 2 files changed, 73 deletions(-) delete mode 100644 youtube_dl/extractor/servingsys.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7a1e0dad6..53d527440 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -995,7 +995,6 @@ from .scrippsnetworks import ScrippsNetworksWatchIE from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE -from .servingsys import ServingSysIE from .servus import ServusIE from .sevenplus import SevenPlusIE from .sexu import SexuIE diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py deleted file mode 100644 index c013d678f..000000000 --- a/youtube_dl/extractor/servingsys.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, -) - - -class ServingSysIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P[0-9]+)' - - _TEST = { - 'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?', - 'info_dict': { - 'id': '5349193', - 'title': 'AdAPPter_Hyundai_demo', - }, - 'playlist': [{ - 'md5': 
'baed851342df6846eb8677a60a011a0f', - 'info_dict': { - 'id': '29955898', - 'ext': 'flv', - 'title': 'AdAPPter_Hyundai_demo (1)', - 'duration': 74, - 'tbr': 1378, - 'width': 640, - 'height': 400, - }, - }, { - 'md5': '979b4da2655c4bc2d81aeb915a8c5014', - 'info_dict': { - 'id': '29907998', - 'ext': 'flv', - 'title': 'AdAPPter_Hyundai_demo (2)', - 'duration': 34, - 'width': 854, - 'height': 480, - 'tbr': 516, - }, - }], - 'params': { - 'playlistend': 2, - }, - '_skip': 'Blocked in the US [sic]', - } - - def _real_extract(self, url): - pl_id = self._match_id(url) - vast_doc = self._download_xml(url, pl_id) - - title = vast_doc.find('.//AdTitle').text - media = vast_doc.find('.//MediaFile').text - info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL') - - doc = self._download_xml(info_url, pl_id, 'Downloading video info') - entries = [{ - '_type': 'video', - 'id': a.attrib['id'], - 'title': '%s (%s)' % (title, a.attrib['assetID']), - 'url': a.attrib['URL'], - 'duration': int_or_none(a.attrib.get('length')), - 'tbr': int_or_none(a.attrib.get('bitrate')), - 'height': int_or_none(a.attrib.get('height')), - 'width': int_or_none(a.attrib.get('width')), - } for a in doc.findall('.//AdditionalAssets/asset')] - - return { - '_type': 'playlist', - 'id': pl_id, - 'title': title, - 'entries': entries, - } From d07866f13efac39bf3f0b331870a15e0f5e98057 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 Oct 2019 15:45:45 +0100 Subject: [PATCH 017/154] [mit] Remove support for video.mit.edu(closes #22403) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/mit.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 53d527440..ea47b99f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -644,7 +644,7 @@ from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto 
import MinotoIE from .miomio import MioMioIE -from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import ( MixcloudIE, diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 1aea78d11..e1506a745 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -65,30 +65,6 @@ class TechTVMITIE(InfoExtractor): } -class MITIE(TechTVMITIE): - IE_NAME = 'video.mit.edu' - _VALID_URL = r'https?://video\.mit\.edu/watch/(?P[^/]+)' - - _TEST = { - 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', - 'md5': '7db01d5ccc1895fc5010e9c9e13648da', - 'info_dict': { - 'id': '21783', - 'ext': 'mp4', - 'title': 'The Government is Profiling You', - 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_title = mobj.group('title') - webpage = self._download_webpage(url, page_title) - embed_url = self._search_regex( - r'<iframe .*?src="(.+?)"', webpage, 'embed url') - return self.url_result(embed_url) - - class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' From bc48773ed4c068adfe67078714814035660e5ca4 Mon Sep 17 00:00:00 2001 From: MobiDotS <msaad615@gmail.com> Date: Wed, 16 Oct 2019 10:13:35 -0500 Subject: [PATCH 018/154] [twitch] update VOD URL matching (closes #22395) (#22727) --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0500e33a6..ca7676fe2 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -248,7 +248,7 @@ class TwitchVodIE(TwitchItemBaseIE): https?:// (?: (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| - player\.twitch\.tv/\?.*?\bvideo=v + player\.twitch\.tv/\?.*?\bvideo=v? 
) (?P<id>\d+) ''' @@ -306,6 +306,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://www.twitch.tv/northernlion/video/291940395', 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?video=480452374', + 'only_matching': True, }] def _real_extract(self, url): From 000115759485797be719c71716c1ac35f003ba6c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 23:57:40 +0100 Subject: [PATCH 019/154] [atresplayer] Add coding cookie --- youtube_dl/extractor/atresplayer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index b96218f6c..c2cec9845 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re From 86f63633c8e7c62ce245d1352d4d381efb614466 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:20:16 +0100 Subject: [PATCH 020/154] [audioboom] improve metadata extraction --- youtube_dl/extractor/audioboom.py | 34 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 393f381c6..c51837b40 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -2,22 +2,25 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( + clean_html, + float_or_none, +) class AudioBoomIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', - 'md5': '63a8d73a055c6ed0f1e51921a10a5a76', + 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', + 'md5': '7b00192e593ff227e6a315486979a42d', 'info_dict': { - 'id': '4279833', + 'id': '7398103', 'ext': 'mp3', - 
'title': '3/09/2016 Czaban Hour 3', - 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', - 'duration': 2245.72, - 'uploader': 'SB Nation A.M.', - 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', + 'title': 'Asim Chaudhry', + 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'duration': 4000.99, + 'uploader': 'Sue Perkins: An hour or so with...', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', } }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', @@ -32,8 +35,8 @@ class AudioBoomIE(InfoExtractor): clip = None clip_store = self._parse_json( - self._search_regex( - r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, + self._html_search_regex( + r'data-new-clip-store=(["\'])(?P<json>{.+?})\1', webpage, 'clip store', default='{}', group='json'), video_id, fatal=False) if clip_store: @@ -47,14 +50,15 @@ class AudioBoomIE(InfoExtractor): audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( 'audio', webpage, 'audio url') - title = from_clip('title') or self._og_search_title(webpage) - description = from_clip('description') or self._og_search_description(webpage) + title = from_clip('title') or self._html_search_meta( + ['og:title', 'og:audio:title', 'audio_title'], webpage) + description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) duration = float_or_none(from_clip('duration') or self._html_search_meta( 'weibo:audio:duration', webpage)) - uploader = from_clip('author') or self._og_search_property( - 'audio:artist', webpage, 'uploader', fatal=False) + uploader = from_clip('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') uploader_url = from_clip('author_url') or self._html_search_meta( 'audioboo:channel', webpage, 'uploader url') From 
755541a4c8ac3dd4e8b9abd0c7df95182a1f3fd4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:21:44 +0100 Subject: [PATCH 021/154] [mangomolo] fix video format extraction and add support for player URLs --- youtube_dl/extractor/generic.py | 8 ++++++-- youtube_dl/extractor/mangomolo.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec43c5ae4..5ed952b29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2962,10 +2962,14 @@ class GenericIE(InfoExtractor): # Look for Mangomolo embeds mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ + r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// + (?: + admin\.mangomolo\.com/analytics/index\.php/customers/embed| + player\.mangomolo\.com/v1 + )/ (?: video\?.*?\bid=(?P<video_id>\d+)| - index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) + (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) ).+?)\1''', webpage) if mobj is not None: info = { diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py index 482175a34..acee370e9 100644 --- a/youtube_dl/extractor/mangomolo.py +++ b/youtube_dl/extractor/mangomolo.py @@ -10,18 +10,21 @@ from ..utils import int_or_none class MangomoloBaseIE(InfoExtractor): + _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + def _get_real_id(self, page_id): return page_id def _real_extract(self, url): page_id = self._get_real_id(self._match_id(url)) - webpage = self._download_webpage(url, page_id) + webpage = self._download_webpage( + 'https://player.mangomolo.com/v1/%s?%s' % (self._TYPE, url.split('?')[1]), page_id) hidden_inputs = self._hidden_inputs(webpage) m3u8_entry_protocol = 'm3u8' if 
self._IS_LIVE else 'm3u8_native' format_url = self._html_search_regex( [ - r'file\s*:\s*"(https?://[^"]+?/playlist\.m3u8)', + r'(?:file|src)\s*:\s*"(https?://[^"]+?/playlist\.m3u8)', r'<a[^>]+href="(rtsp://[^"]+)"' ], webpage, 'format url') formats = self._extract_wowza_formats( @@ -39,14 +42,16 @@ class MangomoloBaseIE(InfoExtractor): class MangomoloVideoIE(MangomoloBaseIE): - IE_NAME = 'mangomolo:video' - _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/video\?.*?\bid=(?P<id>\d+)' + _TYPE = 'video' + IE_NAME = 'mangomolo:' + _TYPE + _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)' _IS_LIVE = False class MangomoloLiveIE(MangomoloBaseIE): - IE_NAME = 'mangomolo:live' - _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/index\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' + _TYPE = 'live' + IE_NAME = 'mangomolo:' + _TYPE + _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' _IS_LIVE = True def _get_real_id(self, page_id): From 59296bae7ec6d15b0df37dce34bdd96381c0e743 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:26:45 +0100 Subject: [PATCH 022/154] [xfileshare] clean extractor - update the list of domains - add support for aa-encoded video data - improve jwplayer format extraction - add support for Clappr sources closes #17032 closes #17906 closes #18237 closes #18239 --- youtube_dl/extractor/xfileshare.py | 192 +++++++++++++---------------- 1 file changed, 86 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index b38c7a7b3..48ef07ed1 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,37 +4,64 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( 
decode_packed_codes, determine_ext, ExtractorError, int_or_none, - NO_DEFAULT, + js_to_json, urlencode_postdata, ) +# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 +def aa_decode(aa_code): + symbol_table = [ + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('2', '((o^_^o) - (゚Θ゚))'), + ('4', '(゚ー゚)'), + ('3', '(o^_^o)'), + ('1', '(゚Θ゚)'), + ('0', '(c^_^o)'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aa_char in aa_code.split(delim): + for val, pat in symbol_table: + aa_char = aa_char.replace(pat, val) + aa_char = aa_char.replace('+ ', '') + m = re.match(r'^\d+', aa_char) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aa_char) + if m: + ret += compat_chr(int(m.group(1), 16)) + return ret + + class XFileShareIE(InfoExtractor): _SITES = ( - (r'daclips\.(?:in|com)', 'DaClips'), - (r'filehoot\.com', 'FileHoot'), - (r'gorillavid\.(?:in|com)', 'GorillaVid'), - (r'movpod\.in', 'MovPod'), - (r'powerwatch\.pw', 'PowerWatch'), - (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'clipwatching\.com', 'ClipWatching'), + (r'gounlimited\.to', 'GoUnlimited'), + (r'govid\.me', 'GoVid'), + (r'holavid\.com', 'HolaVid'), + (r'streamty\.com', 'Streamty'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.(?:me|se)', 'Vidto'), - (r'streamin\.to', 'Streamin.To'), - (r'xvidstage\.com', 'XVIDSTAGE'), - (r'vidabc\.com', 'Vid ABC'), + (r'uqload\.com', 'Uqload'), (r'vidbom\.com', 'VidBom'), (r'vidlo\.us', 'vidlo'), - (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), - (r'fastvideo\.me', 'FastVideo.me'), + (r'vidlocker\.xyz', 'VidLocker'), + (r'vidshare\.tv', 'VidShare'), + (r'vup\.to', 'VUp'), + (r'xvideosharing\.com', 'XVideoSharing'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site 
for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( @@ -43,82 +70,14 @@ class XFileShareIE(InfoExtractor): ) _TESTS = [{ - 'url': 'http://gorillavid.in/06y9juieqpmi', - 'md5': '5ae4a3580620380619678ee4875893ba', + 'url': 'http://xvideosharing.com/fq65f94nd2ve', + 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', 'info_dict': { - 'id': '06y9juieqpmi', + 'id': 'fq65f94nd2ve', 'ext': 'mp4', - 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', + 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, - }, { - 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', - 'only_matching': True, - }, { - 'url': 'http://daclips.in/3rso4kdn6f9m', - 'md5': '1ad8fd39bb976eeb66004d3a4895f106', - 'info_dict': { - 'id': '3rso4kdn6f9m', - 'ext': 'mp4', - 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://movpod.in/0wguyyxi1yca', - 'only_matching': True, - }, { - 'url': 'http://filehoot.com/3ivfabn7573c.html', - 'info_dict': { - 'id': '3ivfabn7573c', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': r're:http://.*\.jpg', - }, - 'skip': 'Video removed', - }, { - 'url': 'http://vidto.me/ku5glz52nqe1.html', - 'info_dict': { - 'id': 'ku5glz52nqe1', - 'ext': 'mp4', - 'title': 'test' - } - }, { - 'url': 'http://powerwatch.pw/duecjibvicbu', - 'info_dict': { - 'id': 'duecjibvicbu', - 'ext': 'mp4', - 'title': 'Big Buck Bunny trailer', - }, - }, { - 'url': 'http://xvidstage.com/e0qcnl03co6z', - 'info_dict': { - 'id': 'e0qcnl03co6z', - 'ext': 'mp4', - 'title': 'Chucky Prank 2015.mp4', - }, - }, { - # removed by administrator - 'url': 'http://xvidstage.com/amfy7atlkx25', - 'only_matching': True, - }, { - 'url': 'http://vidabc.com/i8ybqscrphfv', - 'info_dict': { - 'id': 'i8ybqscrphfv', - 'ext': 'mp4', - 'title': 're:Beauty and the Beast 2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 
'http://www.rapidvideo.cool/b667kprndr8w', - 'only_matching': True, - }, { - 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True, - }, { - 'url': 'http://vidto.se/1tx1pf6t12cg.html', - 'only_matching': True, }] @staticmethod @@ -131,10 +90,9 @@ class XFileShareIE(InfoExtractor): webpage)] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - url = 'http://%s/%s' % (mobj.group('host'), video_id) + url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): @@ -142,7 +100,7 @@ class XFileShareIE(InfoExtractor): fields = self._hidden_inputs(webpage) - if fields['op'] == 'download1': + if fields.get('op') == 'download1': countdown = int_or_none(self._search_regex( r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', webpage, 'countdown', default=None)) @@ -160,13 +118,37 @@ class XFileShareIE(InfoExtractor): (r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+) ', + r'>Watch (.+)[ <]', r'<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<'), # streamin.to + r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to + r'title\s*:\s*"([^"]+)"'), # govid.me webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_formats(default=NO_DEFAULT): + for regex, func in ( + (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + + formats = [] + + 
jwplayer_data = self._search_regex( + [ + r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', + r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', + ], webpage, + 'jwplayer data', default=None) + if jwplayer_data: + jwplayer_data = self._parse_json( + jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + if jwplayer_data: + formats = self._parse_jwplayer_data( + jwplayer_data, video_id, False, + m3u8_id='hls', mpd_id='dash')['formats'] + + if not formats: urls = [] for regex in ( r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', @@ -177,6 +159,12 @@ class XFileShareIE(InfoExtractor): video_url = mobj.group('url') if video_url not in urls: urls.append(video_url) + + sources = self._search_regex( + r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) + if sources: + urls.extend(self._parse_json(sources, video_id)) + formats = [] for video_url in urls: if determine_ext(video_url) == 'm3u8': @@ -189,21 +177,13 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - if not formats and default is not NO_DEFAULT: - return default - self._sort_formats(formats) - return formats - - formats = extract_formats(default=None) - - if not formats: - webpage = decode_packed_codes(self._search_regex( - r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", - webpage, 'packed code')) - formats = extract_formats() + self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) + [ + r'<video[^>]+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ], webpage, 'thumbnail', default=None) return { 'id': video_id, From 34e3885bc9e3aecab104b96eabce03854ac8f7a2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 15:55:44 +0100 Subject: [PATCH 023/154] [viewster->contv] remove viewster extractor and add support for contv.com --- youtube_dl/extractor/contv.py | 118 
++++++++++++++++ youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/viewster.py | 217 ----------------------------- 3 files changed, 119 insertions(+), 218 deletions(-) create mode 100644 youtube_dl/extractor/contv.py delete mode 100644 youtube_dl/extractor/viewster.py diff --git a/youtube_dl/extractor/contv.py b/youtube_dl/extractor/contv.py new file mode 100644 index 000000000..84b462d40 --- /dev/null +++ b/youtube_dl/extractor/contv.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) + + +class CONtvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?contv\.com/details-movie/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.contv.com/details-movie/CEG10022949/days-of-thrills-&-laughter', + 'info_dict': { + 'id': 'CEG10022949', + 'ext': 'mp4', + 'title': 'Days Of Thrills & Laughter', + 'description': 'md5:5d6b3d0b1829bb93eb72898c734802eb', + 'upload_date': '20180703', + 'timestamp': 1530634789.61, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.contv.com/details-movie/CLIP-show_fotld_bts/fight-of-the-living-dead:-behind-the-scenes-bites', + 'info_dict': { + 'id': 'CLIP-show_fotld_bts', + 'title': 'Fight of the Living Dead: Behind the Scenes Bites', + }, + 'playlist_mincount': 7, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + details = self._download_json( + 'http://metax.contv.live.junctiontv.net/metax/2.5/details/' + video_id, + video_id, query={'device': 'web'}) + + if details.get('type') == 'episodic': + seasons = self._download_json( + 'http://metax.contv.live.junctiontv.net/metax/2.5/seriesfeed/json/' + video_id, + video_id) + entries = [] + for season in seasons: + for episode in season.get('episodes', []): + episode_id = episode.get('id') + if not episode_id: + continue + entries.append(self.url_result( + 'https://www.contv.com/details-movie/' + 
episode_id, + CONtvIE.ie_key(), episode_id)) + return self.playlist_result(entries, video_id, details.get('title')) + + m_details = details['details'] + title = details['title'] + + formats = [] + + media_hls_url = m_details.get('media_hls_url') + if media_hls_url: + formats.extend(self._extract_m3u8_formats( + media_hls_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + + media_mp4_url = m_details.get('media_mp4_url') + if media_mp4_url: + formats.append({ + 'format_id': 'http', + 'url': media_mp4_url, + }) + + self._sort_formats(formats) + + subtitles = {} + captions = m_details.get('captions') or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({ + 'url': caption_url + }) + + thumbnails = [] + for image in m_details.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + description = None + for p in ('large_', 'medium_', 'small_', ''): + d = m_details.get(p + 'description') + if d: + description = d + break + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': description, + 'timestamp': float_or_none(details.get('metax_added_on'), 1000), + 'subtitles': subtitles, + 'duration': float_or_none(m_details.get('duration'), 1000), + 'view_count': int_or_none(details.get('num_watched')), + 'like_count': int_or_none(details.get('num_fav')), + 'categories': details.get('category'), + 'tags': details.get('tags'), + 'season_number': int_or_none(details.get('season')), + 'episode_number': int_or_none(details.get('episode')), + 'release_year': int_or_none(details.get('pub_year')), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ea47b99f6..1db21529f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -231,6 +231,7 @@ from 
.commonprotocols import ( RtmpIE, ) from .condenast import CondeNastIE +from .contv import CONtvIE from .corus import CorusIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -1322,7 +1323,6 @@ from .viewlift import ( ViewLiftIE, ViewLiftEmbedIE, ) -from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( VimeoIE, diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py deleted file mode 100644 index 6e318479c..000000000 --- a/youtube_dl/extractor/viewster.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_iso8601, - sanitized_Request, - HEADRequest, - url_basename, -) - - -class ViewsterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)' - _TESTS = [{ - # movie, Type=Movie - 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', - 'md5': 'e642d1b27fcf3a4ffa79f194f5adde36', - 'info_dict': { - 'id': '1140-11855-000', - 'ext': 'mp4', - 'title': 'The listening Project', - 'description': 'md5:bac720244afd1a8ea279864e67baa071', - 'timestamp': 1214870400, - 'upload_date': '20080701', - 'duration': 4680, - }, - }, { - # series episode, Type=Episode - 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', - 'md5': '9243079a8531809efe1b089db102c069', - 'info_dict': { - 'id': '1284-19427-001', - 'ext': 'mp4', - 'title': 'The World and a Wall', - 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3', - 'timestamp': 1428192000, - 'upload_date': '20150405', - 'duration': 1500, - }, - }, { - # serie, Type=Serie - 'url': 'http://www.viewster.com/serie/1303-19426-000/', - 'info_dict': { - 'id': '1303-19426-000', - 'title': 'Is It Wrong to Try to Pick up Girls in a 
Dungeon?', - 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11', - }, - 'playlist_count': 13, - }, { - # unfinished serie, no Type - 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/', - 'info_dict': { - 'id': '1284-19427-000', - 'title': 'Baby Steps—Season 2', - 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1', - }, - 'playlist_mincount': 16, - }, { - # geo restricted series - 'url': 'https://www.viewster.com/serie/1280-18794-002/', - 'only_matching': True, - }, { - # geo restricted video - 'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/', - 'only_matching': True, - }] - - _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): - request = sanitized_Request(url) - request.add_header('Accept', self._ACCEPT_HEADER) - request.add_header('Auth-token', self._AUTH_TOKEN) - return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) - - def _real_extract(self, url): - video_id = self._match_id(url) - # Get 'api_token' cookie - self._request_webpage( - HEADRequest('http://www.viewster.com/'), - video_id, headers=self.geo_verification_headers()) - cookies = self._get_cookies('http://www.viewster.com/') - self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) - - info = self._download_json( - 'https://public-api.viewster.com/search/%s' % video_id, - video_id, 'Downloading entry JSON') - - entry_id = info.get('Id') or info['id'] - - # unfinished serie has no Type - if info.get('Type') in ('Serie', None): - try: - episodes = self._download_json( - 'https://public-api.viewster.com/series/%s/episodes' % entry_id, - video_id, 'Downloading series JSON') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_geo_restricted() - else: - raise - entries = [ - self.url_result( - 
'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') - for episode in episodes] - title = (info.get('Title') or info['Synopsis']['Title']).strip() - description = info.get('Synopsis', {}).get('Detailed') - return self.playlist_result(entries, video_id, title, description) - - formats = [] - for language_set in info.get('LanguageSets', []): - manifest_url = None - m3u8_formats = [] - audio = language_set.get('Audio') or '' - subtitle = language_set.get('Subtitle') or '' - base_format_id = audio - if subtitle: - base_format_id += '-%s' % subtitle - - def concat(suffix, sep='-'): - return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix - - medias = self._download_json( - 'https://public-api.viewster.com/movies/%s/videos' % entry_id, - video_id, fatal=False, query={ - 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], - 'language': audio, - 'subtitle': subtitle, - }) - if not medias: - continue - for media in medias: - video_url = media.get('Uri') - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - manifest_url = video_url - video_url += '&' if '?' in video_url else '?' 
- video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=concat('hds'))) - elif ext == 'm3u8': - manifest_url = video_url - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=concat('hls'), - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if not qualities_basename: - continue - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if not qualities: - continue - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) - http_url_basename = url_basename(video_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) - - if not formats and not info.get('VODSettings'): - self.raise_geo_restricted() - - self._sort_formats(formats) - - synopsis = info.get('Synopsis') or {} - # Prefer title outside synopsis since it's less messy - title = (info.get('Title') or synopsis['Title']).strip() - description = synopsis.get('Detailed') or (info.get('Synopsis') or {}).get('Short') - duration = int_or_none(info.get('Duration')) - timestamp = parse_iso8601(info.get('ReleaseDate')) - - 
return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } From 824fa51165d92ceee01589bf995ebbf009df328c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Oct 2019 04:03:53 +0700 Subject: [PATCH 024/154] [utils] Improve subtitles_filename (closes #22753) --- test/test_utils.py | 6 ++++++ youtube_dl/YoutubeDL.py | 2 +- youtube_dl/postprocessor/ffmpeg.py | 8 ++++---- youtube_dl/utils.py | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 659c6ece5..3920542bb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -74,6 +74,7 @@ from youtube_dl.utils import ( str_to_int, strip_jsonp, strip_or_none, + subtitles_filename, timeconvert, unescapeHTML, unified_strdate, @@ -261,6 +262,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + def test_subtitles_filename(self): + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt', 'ext'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.unexpected_ext', 'en', 'vtt', 'ext'), 'abc.unexpected_ext.en.vtt') + def test_remove_start(self): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c3d1407f9..f5cb46308 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1814,7 +1814,7 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - sub_filename = subtitles_filename(filename, sub_lang, sub_format) + sub_filename = subtitles_filename(filename, sub_lang, sub_format, 
info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 70416c25e..fd3f921a8 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -393,7 +393,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_ext = sub_info['ext'] if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) - sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': webm_vtt_warn = True @@ -606,9 +606,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue - old_file = subtitles_filename(filename, lang, ext) + old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) - new_file = subtitles_filename(filename, lang, new_ext) + new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( @@ -616,7 +616,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): 'which results in style information loss') dfxp_file = old_file - srt_file = subtitles_filename(filename, lang, 'srt') + srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext')) with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 798757241..53117ea90 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2906,8 +2906,8 @@ def determine_ext(url, default_ext='unknown_video'): return default_ext -def subtitles_filename(filename, sub_lang, sub_format): - return 
filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format +def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): + return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) def date_from_str(date_str): From 2297c0d7d977921dca865e6c9cbc7ee5282ba8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Oct 2019 23:56:36 +0700 Subject: [PATCH 025/154] [facebook] Bypass download rate limits (closes #21018) --- youtube_dl/extractor/facebook.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a3dcdca3e..a56f85c21 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -405,6 +405,11 @@ class FacebookIE(InfoExtractor): if not formats: raise ExtractorError('Cannot find video formats') + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. 
+ for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + self._sort_formats(formats) video_title = self._html_search_regex( From b4818e3c7a718428d3366c34da8e21e2f416f5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Oct 2019 00:02:22 +0700 Subject: [PATCH 026/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index dc5c32a1f..045349b05 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core +* [utils] Improve subtitles_filename (#22753) + +Extractors +* [facebook] Bypass download rate limits (#21018) ++ [contv] Add support for contv.com +- [viewster] Remove extractor +* [xfileshare] Improve extractor (#17032, #17906, #18237, #18239) + * Update the list of domains + + Add support for aa-encoded video data + * Improve jwplayer format extraction + + Add support for Clappr sources +* [mangomolo] Fix video format extraction and add support for player URLs +* [audioboom] Improve metadata extraction +* [twitch] Update VOD URL matching (#22395, #22727) +- [mit] Remove support for video.mit.edu (#22403) +- [servingsys] Remove extractor (#22639) +* [dumpert] Fix extraction (#22428, #22564) +* [atresplayer] Fix extraction (#16277, #16716) + + version 2019.10.16 Core From 820215f0e34813089d559fed24a398d9e91810e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Oct 2019 00:09:02 +0700 Subject: [PATCH 027/154] release 2019.10.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++---- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 18 
deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 5cd9f0dc0..f1afe704c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.16 + [debug] youtube-dl version 2019.10.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 6cc34796a..a4dc9b005 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 0b7911e79..5bf86adce 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a6f417d38..7aa5534e5 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.16 + [debug] youtube-dl version 2019.10.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 3fe753b62..5d3645e3d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 045349b05..64233b03b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.10.22 Core * [utils] Improve subtitles_filename (#22753) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0cbad28ea..a1b0edeeb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -183,6 +183,7 @@ - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED + - **CONtv** - **Corus** - **Coub** - **Cracked** @@ -784,7 +785,6 @@ - **Seeker** - **SenateISVP** - **SendtoNews** - - **ServingSys** - **Servus** - **Sexu** - **SeznamZpravy** @@ -1005,7 +1005,6 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search - - **video.mit.edu** - **VideoDetective** - **videofy.me** - **videomore** @@ -1023,7 +1022,6 @@ - **vier:videos** - **ViewLift** - **ViewLiftEmbed** - - **Viewster** - **Viidea** - **viki** - **viki:channel** @@ -1097,7 +1095,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me + - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, 
VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 53889b7cb..39b355b9e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.16' +__version__ = '2019.10.22' From 0c2d10d225f61ac1fb534d8ed1788250401465b2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Oct 2019 17:49:50 +0100 Subject: [PATCH 028/154] [globo] handle alternative hash signing method --- youtube_dl/extractor/globo.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 9ad1d95fb..60d842d3a 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -132,18 +132,24 @@ class GloboIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, message), expected=True) continue - assert security_hash[:2] in ('04', '14') - received_time = security_hash[3:13] - received_md5 = security_hash[24:] - - sign_time = compat_str(int(received_time) + 86400) + hash_code = security_hash[:2] padding = '%010d' % random.randint(1, 10000000000) + if hash_code in ('04', '14'): + received_time = security_hash[3:13] + received_md5 = security_hash[24:] + hash_prefix = security_hash[:23] + elif hash_code in ('02', '12', '03', '13'): + received_time = security_hash[2:12] + received_md5 = security_hash[22:] + padding += '1' + hash_prefix = '05' + security_hash[:22] - md5_data = (received_md5 + sign_time + padding + '0xAC10FD').encode() + padded_sign_time = compat_str(int(received_time) + 86400) + padding + md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = security_hash[:23] + sign_time + padding + signed_md5 - + signed_hash = hash_prefix + 
padded_sign_time + signed_md5 signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '') + if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', From 07154c793065bca816793186590d8d6461e07478 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Oct 2019 17:53:47 +0100 Subject: [PATCH 029/154] [facebook] extract subtitles(closes #22777) --- youtube_dl/extractor/ceskatelevize.py | 2 ++ youtube_dl/extractor/facebook.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 1ec58f7d8..7cb4efb74 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -147,6 +147,8 @@ class CeskaTelevizeIE(InfoExtractor): is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'drmOnly=true' in stream_url: + continue if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a56f85c21..c723726b7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -379,6 +379,7 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') + subtitles = {} formats = [] for f in video_data: format_id = f['stream_type'] @@ -402,6 +403,9 @@ class FacebookIE(InfoExtractor): if dash_manifest: formats.extend(self._parse_mpd_formats( compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + subtitles_src = f[0].get('subtitles_src') + if subtitles_src: + subtitles.setdefault('en', []).append({'url': subtitles_src}) if not formats: raise 
ExtractorError('Cannot find video formats') @@ -447,6 +451,7 @@ class FacebookIE(InfoExtractor): 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, + 'subtitles': subtitles, } return webpage, info_dict From 162bcc68dc73706699b559fffdd8bed3db6643b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 24 Oct 2019 12:53:33 +0100 Subject: [PATCH 030/154] [puhutv] improve extraction - fix subtitles extraction - transform HLS URLs to http URLs - improve metadata extraction --- youtube_dl/extractor/puhutv.py | 90 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index 5465e8ab7..fb704a3c4 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -25,21 +25,21 @@ class PuhuTVIE(InfoExtractor): _TESTS = [{ # film 'url': 'https://puhutv.com/sut-kardesler-izle', - 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', 'info_dict': { 'id': '5085', 'display_id': 'sut-kardesler', 'ext': 'mp4', 'title': 'Süt Kardeşler', - 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 4832.44, 'creator': 'Arzu Film', - 'timestamp': 1469778212, - 'upload_date': '20160729', + 'timestamp': 1561062602, + 'upload_date': '20190620', 'release_year': 1976, 'view_count': int, - 'tags': ['Aile', 'Komedi', 'Klasikler'], + 'tags': list, }, }, { # episode, geo restricted, bypassable with --geo-verification-proxy @@ -64,9 +64,10 @@ class PuhuTVIE(InfoExtractor): display_id)['data'] video_id = compat_str(info['id']) - title = info.get('name') or info['title']['name'] + show = info.get('title') or {} + title = info.get('name') or show['name'] if info.get('display_name'): - title = '%s %s' % (title, info.get('display_name')) + title = '%s %s' % (title, info['display_name']) try: 
videos = self._download_json( @@ -78,17 +79,36 @@ class PuhuTVIE(InfoExtractor): self.raise_geo_restricted() raise + urls = [] formats = [] + + def add_http_from_hls(m3u8_f): + http_url = m3u8_f['url'].replace('/hls/', '/mp4/').replace('/chunklist.m3u8', '.mp4') + if http_url != m3u8_f['url']: + f = m3u8_f.copy() + f.update({ + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + 'url': http_url, + }) + formats.append(f) + for video in videos['data']['videos']: media_url = url_or_none(video.get('url')) - if not media_url: + if not media_url or media_url in urls: continue + urls.append(media_url) + playlist = video.get('is_playlist') - if video.get('stream_type') == 'hls' and playlist is True: - formats.extend(self._extract_m3u8_formats( + if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for m3u8_f in m3u8_formats: + formats.append(m3u8_f) + add_http_from_hls(m3u8_f) continue + quality = int_or_none(video.get('quality')) f = { 'url': media_url, @@ -96,34 +116,29 @@ class PuhuTVIE(InfoExtractor): 'height': quality } video_format = video.get('video_format') - if video_format == 'hls' and playlist is False: + is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False + if is_hls: format_id = 'hls' f['protocol'] = 'm3u8_native' elif video_format == 'mp4': format_id = 'http' - else: continue if quality: format_id += '-%sp' % quality f['format_id'] = format_id formats.append(f) + if is_hls: + add_http_from_hls(f) self._sort_formats(formats) - description = try_get( - info, lambda x: x['title']['description'], - compat_str) or info.get('description') - timestamp = unified_timestamp(info.get('created_at')) creator = try_get( - info, lambda x: x['title']['producer']['name'], compat_str) + 
show, lambda x: x['producer']['name'], compat_str) - duration = float_or_none( - try_get(info, lambda x: x['content']['duration_in_ms'], int), - scale=1000) - view_count = try_get(info, lambda x: x['content']['watch_count'], int) + content = info.get('content') or {} images = try_get( - info, lambda x: x['content']['images']['wide'], dict) or {} + content, lambda x: x['images']['wide'], dict) or {} thumbnails = [] for image_id, image_url in images.items(): if not isinstance(image_url, compat_str): @@ -137,14 +152,8 @@ class PuhuTVIE(InfoExtractor): }) thumbnails.append(t) - release_year = try_get(info, lambda x: x['title']['released_at'], int) - - season_number = int_or_none(info.get('season_number')) - season_id = str_or_none(info.get('season_id')) - episode_number = int_or_none(info.get('episode_number')) - tags = [] - for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + for genre in show.get('genres') or []: if not isinstance(genre, dict): continue genre_name = genre.get('name') @@ -152,12 +161,11 @@ class PuhuTVIE(InfoExtractor): tags.append(genre_name) subtitles = {} - for subtitle in try_get( - info, lambda x: x['content']['subtitles'], list) or []: + for subtitle in content.get('subtitles') or []: if not isinstance(subtitle, dict): continue lang = subtitle.get('language') - sub_url = url_or_none(subtitle.get('url')) + sub_url = url_or_none(subtitle.get('url') or subtitle.get('file')) if not lang or not isinstance(lang, compat_str) or not sub_url: continue subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ @@ -168,15 +176,15 @@ class PuhuTVIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'season_id': season_id, - 'season_number': season_number, - 'episode_number': episode_number, - 'release_year': release_year, - 'timestamp': timestamp, + 'description': info.get('description') or show.get('description'), + 'season_id': str_or_none(info.get('season_id')), + 'season_number': 
int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'release_year': int_or_none(show.get('released_at')), + 'timestamp': unified_timestamp(info.get('created_at')), 'creator': creator, - 'view_count': view_count, - 'duration': duration, + 'view_count': int_or_none(content.get('watch_count')), + 'duration': float_or_none(content.get('duration_in_ms'), 1000), 'tags': tags, 'subtitles': subtitles, 'thumbnails': thumbnails, From 416c3ca7f53dab76b9e5ec46a0c0335698252c2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 25 Oct 2019 19:27:28 +0100 Subject: [PATCH 031/154] [odnoklassniki] add support for Schemeless embed extraction --- youtube_dl/extractor/generic.py | 7 ++++--- youtube_dl/extractor/odnoklassniki.py | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5ed952b29..f66cae0eb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -118,6 +118,7 @@ from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE +from .odnoklassniki import OdnoklassnikiIE class GenericIE(InfoExtractor): @@ -2627,9 +2628,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'VK') # Look for embedded Odnoklassniki player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Odnoklassniki') + odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) # Look for embedded ivi player mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 
114b93c07..7ed9fac55 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -121,6 +123,13 @@ class OdnoklassnikiIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): start_time = int_or_none(compat_parse_qs( compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) From 3c989818e7dc7706da069312bbdd040165a97517 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 25 Oct 2019 19:35:07 +0100 Subject: [PATCH 032/154] [vk] improve extraction - add support for Odnoklassniki embeds - update tests - extract more video from user lists(closes #4470) - fix wall post audio extraction(closes #18332) - improve error detection(closes #22568) --- youtube_dl/extractor/vk.py | 329 +++++++++++++++++++------------------ 1 file changed, 173 insertions(+), 156 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 8b6dc0e24..c289fcad3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -12,7 +12,6 @@ from ..utils import ( get_element_by_class, int_or_none, orderedSet, - remove_start, str_or_none, str_to_int, unescapeHTML, @@ -21,6 +20,7 @@ from ..utils import ( urlencode_postdata, ) from .dailymotion import DailymotionIE +from .odnoklassniki import OdnoklassnikiIE from .pladform import PladformIE from .vimeo import VimeoIE from .youtube import YoutubeIE @@ -60,6 +60,18 @@ class VKBaseIE(InfoExtractor): def _real_initialize(self): self._login() + def _download_payload(self, path, video_id, data, fatal=True): + data['al'] = 1 + code, payload = self._download_json( + 
'https://vk.com/%s.php' % path, video_id, + data=urlencode_postdata(data), fatal=fatal, + headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] + if code == '3': + self.raise_login_required() + elif code == '8': + raise ExtractorError(clean_html(payload[0][1:-1]), expected=True) + return payload + class VKIE(VKBaseIE): IE_NAME = 'vk' @@ -96,7 +108,6 @@ class VKIE(VKBaseIE): }, { 'url': 'http://vk.com/video205387401_165548505', - 'md5': '6c0aeb2e90396ba97035b9cbde548700', 'info_dict': { 'id': '205387401_165548505', 'ext': 'mp4', @@ -110,18 +121,18 @@ class VKIE(VKBaseIE): }, { 'note': 'Embedded video', - 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', - 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', + 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { - 'id': '32194266_162925554', + 'id': '-77521_162222515', 'ext': 'mp4', - 'uploader': 'Vladimir Gavrin', - 'title': 'Lin Dan', - 'duration': 101, - 'upload_date': '20120730', - 'view_count': int, + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'title': 'ProtivoGunz - Хуёвая песня', + 'duration': 195, + 'upload_date': '20120212', + 'timestamp': 1329049880, + 'uploader_id': '-77521', }, - 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -138,18 +149,19 @@ class VKIE(VKBaseIE): 'upload_date': '20121218', 'view_count': int, }, - 'skip': 'Requires vk account credentials', + 'skip': 'Removed', }, { 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', - 'md5': '4d7a5ef8cf114dfa09577e57b2993202', 'info_dict': { 'id': '-43215063_168067957', 'ext': 'mp4', - 'uploader': 'Киномания - лучшее из мира кино', + 'uploader': 'Bro Mazter', 'title': ' ', 'duration': 7291, 'upload_date': '20140328', + 'uploader_id': '223413403', + 'timestamp': 1396018030, }, 'skip': 'Requires vk account credentials', }, @@ -165,7 +177,7 @@ 
class VKIE(VKBaseIE): 'upload_date': '20140626', 'view_count': int, }, - 'skip': 'Only works from Russia', + 'skip': 'Removed', }, { # video (removed?) only available with list id @@ -247,6 +259,9 @@ class VKIE(VKBaseIE): 'uploader_id': '-387766', 'timestamp': 1475137527, }, + 'params': { + 'skip_download': True, + }, }, { # live stream, hls and rtmp links, most likely already finished live @@ -288,80 +303,94 @@ class VKIE(VKBaseIE): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') + mv_data = {} if video_id: - info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id + data = { + 'act': 'show_inline', + 'video': video_id, + } # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: - info_url += '&list=%s' % list_id + data['list'] = list_id + + payload = self._download_payload('al_video', video_id, data) + info_page = payload[1] + opts = payload[-1] + mv_data = opts.get('mvData') or {} + player = opts.get('player') or {} else: - info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_page = self._download_webpage(info_url, video_id) + info_page = self._download_webpage( + 'http://vk.com/video_ext.php?' 
+ mobj.group('embed_query'), video_id) - error_message = self._html_search_regex( - [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', - r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], - info_page, 'error message', default=None) - if error_message: - raise ExtractorError(error_message, expected=True) + error_message = self._html_search_regex( + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) - if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): - raise ExtractorError( - 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', - expected=True) + if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): + raise ExtractorError( + 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', + expected=True) - ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' - ERRORS = { - r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - ERROR_COPYRIGHT, + ERRORS = { + r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': + ERROR_COPYRIGHT, - r'>The video .*? was removed from public access by request of the copyright holder.<': - ERROR_COPYRIGHT, + r'>The video .*? 
was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, - r'<!>Please log in or <': - 'Video %s is only available for registered users, ' - 'use --username and --password options to provide account credentials.', + r'<!>Please log in or <': + 'Video %s is only available for registered users, ' + 'use --username and --password options to provide account credentials.', - r'<!>Unknown error': - 'Video %s does not exist.', + r'<!>Unknown error': + 'Video %s does not exist.', - r'<!>Видео временно недоступно': - 'Video %s is temporarily unavailable.', + r'<!>Видео временно недоступно': + 'Video %s is temporarily unavailable.', - r'<!>Access denied': - 'Access denied to video %s.', + r'<!>Access denied': + 'Access denied to video %s.', - r'<!>Видеозапись недоступна, так как её автор был заблокирован.': - 'Video %s is no longer available, because its author has been blocked.', + r'<!>Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', - r'<!>This video is no longer available, because its author has been blocked.': - 'Video %s is no longer available, because its author has been blocked.', + r'<!>This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', - r'<!>This video is no longer available, because it has been deleted.': - 'Video %s is no longer available, because it has been deleted.', + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', - r'<!>The video .+? is not available in your region.': - 'Video %s is not available in your region.', - } + r'<!>The video .+? 
is not available in your region.': + 'Video %s is not available in your region.', + } - for error_re, error_msg in ERRORS.items(): - if re.search(error_re, info_page): - raise ExtractorError(error_msg % video_id, expected=True) + for error_re, error_msg in ERRORS.items(): + if re.search(error_re, info_page): + raise ExtractorError(error_msg % video_id, expected=True) + + player = self._parse_json(self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', + info_page, 'player params'), video_id) youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + return self.url_result(youtube_url, YoutubeIE.ie_key()) vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: - return self.url_result(vimeo_url) + return self.url_result(vimeo_url, VimeoIE.ie_key()) pladform_url = PladformIE._extract_url(info_page) if pladform_url: - return self.url_result(pladform_url) + return self.url_result(pladform_url, PladformIE.ie_key()) m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) @@ -374,6 +403,10 @@ class VKIE(VKBaseIE): if dailymotion_urls: return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) @@ -383,38 +416,7 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - # vars does not look to be served anymore since 24.10.2016 - data = self._parse_json( - self._search_regex( - r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), - video_id, fatal=False) - - # <!json> is served instead - if not data: - data = self._parse_json( - self._search_regex( - 
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'], - info_page, 'json', default='{}'), - video_id) - if data: - data = data['player']['params'][0] - - if not data: - data = self._parse_json( - self._search_regex( - r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, - 'player params', default='{}'), - video_id) - if data: - data = data['params'][0] - - # <!--{...} - if not data: - data = self._parse_json( - self._search_regex( - r'<!--\s*({.+})', info_page, 'payload'), - video_id)['payload'][-1][-1]['player']['params'][0] - + data = player['params'][0] title = unescapeHTML(data['md_title']) # 2 = live @@ -463,12 +465,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id')), - 'duration': data.get('duration'), + 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), + 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), 'timestamp': timestamp, 'view_count': view_count, - 'like_count': int_or_none(data.get('liked')), - 'dislike_count': int_or_none(data.get('nolikes')), + 'like_count': int_or_none(mv_data.get('likes')), + 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, } @@ -482,7 +484,6 @@ class VKUserVideosIE(VKBaseIE): 'url': 'http://vk.com/videos205387401', 'info_dict': { 'id': '205387401', - 'title': "Tom Cruise's Videos", }, 'playlist_mincount': 4, }, { @@ -498,22 +499,25 @@ class VKUserVideosIE(VKBaseIE): 'url': 'http://new.vk.com/videos205387401', 'only_matching': True, }] + _VIDEO = collections.namedtuple( + 'Video', ['owner_id', 'id', 'thumb', 'title', 'flags', 'duration', 'hash', 'moder_acts', 'owner', 'date', 'views', 'platform', 'blocked', 'music_video_meta']) def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + l = self._download_payload('al_video', page_id, { + 'act': 'load_videos_silent', + 'oid': page_id, + 
})[0]['']['list'] - entries = [ - self.url_result( - 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] + entries = [] + for video in l: + v = self._VIDEO._make(video) + video_id = '%d_%d' % (v.owner_id, v.id) + entries.append(self.url_result( + 'http://vk.com/video' + video_id, 'VK', video_id=video_id)) - title = unescapeHTML(self._search_regex( - r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', - webpage, 'title', default=page_id)) - - return self.playlist_result(entries, page_id, title) + return self.playlist_result(entries, page_id) class VKWallPostIE(VKBaseIE): @@ -523,15 +527,15 @@ class VKWallPostIE(VKBaseIE): # public page URL, audio playlist 'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'info_dict': { - 'id': '23538238_35', - 'title': 'Black Shadow - Wall post 23538238_35', + 'id': '-23538238_35', + 'title': 'Black Shadow - Wall post -23538238_35', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', }, 'playlist': [{ 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'info_dict': { 'id': '135220665_111806521', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Слепое Верование', 'duration': 370, 'uploader': 'Black Shadow', @@ -542,18 +546,16 @@ class VKWallPostIE(VKBaseIE): 'md5': '4cc7e804579122b17ea95af7834c9233', 'info_dict': { 'id': '135220665_111802303', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'duration': 423, 'uploader': 'Black Shadow', 'artist': 'Black Shadow', 'track': 'Война - Негасимое Бездны Пламя!', }, - 'params': { - 'skip_download': True, - }, }], 'params': { + 'skip_download': True, 'usenetrc': True, }, 'skip': 'Requires vk account credentials', @@ -562,7 +564,7 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://vk.com/wall85155021_6319', 'info_dict': { 'id': '85155021_6319', - 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + 'title': 'Сергей Горбунов - Wall post 85155021_6319', }, 
'playlist_count': 1, 'params': { @@ -578,58 +580,73 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://m.vk.com/wall-23538238_35', 'only_matching': True, }] + _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + _AUDIO = collections.namedtuple( + 'Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads', 'subtitle', 'main_artists', 'feat_artists', 'album', 'track_code', 'restriction', 'album_part', 'new_stats', 'access_key']) + + def _decode(self, enc): + dec = '' + e = n = 0 + for c in enc: + r = self._BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + + def _unmask_url(self, mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + func, base = self._decode(extra[1]).split(chr(11)) + assert (func == 'i') + mask_url = list(self._decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url def _real_extract(self, url): post_id = self._match_id(url) - wall_url = 'https://vk.com/wall%s' % post_id - - post_id = remove_start(post_id, '-') - - webpage = self._download_webpage(wall_url, post_id) - - error = self._html_search_regex( - r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)', - webpage, 'error', default=None) - if error: - raise ExtractorError('VK said: %s' % error, expected=True) + webpage = self._download_payload('wkview', post_id, { + 'act': 'show', + 'w': 'wall' + post_id, + })[1] description = 
clean_html(get_element_by_class('wall_post_text', webpage)) uploader = clean_html(get_element_by_class('author', webpage)) - thumbnail = self._og_search_thumbnail(webpage) entries = [] - audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) - if audio_ids: - al_audio = self._download_webpage( - 'https://vk.com/al_audio.php', post_id, - note='Downloading audio info', fatal=False, - data=urlencode_postdata({ - 'act': 'reload_audio', - 'al': '1', - 'ids': ','.join(audio_ids) - })) - if al_audio: - Audio = collections.namedtuple( - 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) - audios = self._parse_json( - self._search_regex( - r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'), - post_id, fatal=False, transform_source=unescapeHTML) - if isinstance(audios, list): - for audio in audios: - a = Audio._make(audio[:6]) - entries.append({ - 'id': '%s_%s' % (a.user_id, a.id), - 'url': a.url, - 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, - 'thumbnail': thumbnail, - 'duration': a.duration, - 'uploader': uploader, - 'artist': a.artist, - 'track': a.track, - }) + for audio in re.findall(r'data-audio="([^"]+)', webpage): + audio = self._parse_json(unescapeHTML(audio), post_id) + a = self._AUDIO._make(audio) + if not a.url: + continue + title = unescapeHTML(a.title) + entries.append({ + 'id': '%s_%s' % (a.owner_id, a.id), + 'url': self._unmask_url(a.url, a.ads['vk_id']), + 'title': '%s - %s' % (a.performer, title) if a.performer else title, + 'thumbnail': a.cover_url.split(',') if a.cover_url else None, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.performer, + 'track': title, + 'ext': 'mp4', + 'protocol': 'm3u8', + }) for video in re.finditer( r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): From 42cd0824b3975e6ce500d8cecd60e1fc077a758b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 00:06:05 +0100 Subject: [PATCH 033/154] [vk] remove 
assert statement --- youtube_dl/extractor/vk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index c289fcad3..4c8ca4f41 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -600,7 +600,6 @@ class VKWallPostIE(VKBaseIE): if 'audio_api_unavailable' in mask_url: extra = mask_url.split('?extra=')[1].split('#') func, base = self._decode(extra[1]).split(chr(11)) - assert (func == 'i') mask_url = list(self._decode(extra[0])) url_len = len(mask_url) indexes = [None] * url_len From 235dbb434bfa724718c37d8af0a61baf93b775be Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 14:57:42 +0100 Subject: [PATCH 034/154] [discoverynetworks] add support for dplay.co.uk --- youtube_dl/extractor/discoverynetworks.py | 63 +++++++---------------- 1 file changed, 19 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index fba1ef221..607a54948 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,63 +3,38 @@ from __future__ import unicode_literals import re -from .brightcove import BrightcoveLegacyIE from .dplay import DPlayIE -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) -from ..utils import smuggle_url class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ - (?: - .*\#(?P<id>\d+)| - (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| - programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) - )''' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' _TESTS = [{ - 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', + 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', 'info_dict': { - 'id': 
'3235167922001', + 'id': '78867', 'ext': 'mp4', - 'title': 'Breaking Amish: Die Welt da draußen', - 'description': ( - 'Vier Amische und eine Mennonitin wagen in New York' - ' den Sprung in ein komplett anderes Leben. Begleitet sie auf' - ' ihrem spannenden Weg.'), - 'timestamp': 1396598084, - 'upload_date': '20140404', - 'uploader_id': '1659832546', + 'title': 'Die Welt da draußen', + 'description': 'md5:61033c12b73286e409d99a41742ef608', + 'timestamp': 1554069600, + 'upload_date': '20190331', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, }, }, { - 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/', + 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', 'only_matching': True, }, { - 'url': 'http://www.discovery.de/#5332316765001', + 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - alternate_id = mobj.group('alternate_id') - if alternate_id: - self._initialize_geo_bypass({ - 'countries': ['DE'], - }) - return self._get_disco_api_info( - url, '%s/%s' % (mobj.group('programme'), alternate_id), - 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') - brightcove_id = mobj.group('id') - if not brightcove_id: - title = mobj.group('title') - webpage = self._download_webpage(url, title) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse( - brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}), - 'BrightcoveNew', brightcove_id) + domain, programme, alternate_id = re.match(self._VALID_URL, 
url).groups() + country = 'GB' if domain == 'dplay.co.uk' else 'DE' + realm = 'questuk' if country == 'GB' else domain.replace('.', '') + return self._get_disco_api_info( + url, '%s/%s' % (programme, alternate_id), + 'sonic-eu1-prod.disco-api.com', realm, country) From 0b98f3a7517601b7d2aabc789997016b9c3c24f2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 14:58:29 +0100 Subject: [PATCH 035/154] [dplay] improve extraction - add support for dplay.fi, dplay.jp and es.dplay.com(closes #16969) - fix it.dplay.com extraction(closes #22826) - update tests - extract creator, tags and thumbnails - handle playback API call errors --- youtube_dl/extractor/dplay.py | 397 ++++++++++------------------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 133 insertions(+), 269 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index ebf59512c..d9c3d59cd 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,74 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re -import time from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urlparse, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, float_or_none, int_or_none, - remove_end, - try_get, - unified_strdate, unified_timestamp, - update_url_query, - urljoin, - USER_AGENTS, ) class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?P<domain>www\.(?P<host>dplay\.(?P<country>dk|se|no)))/(?:video(?:er|s)/)?(?P<id>[^/]+/[^/?#]+)' + _VALID_URL = r'''(?x)https?:// + (?P<domain> + (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))| + (?P<subdomain_country>es|it)\.dplay\.com + )/[^/]+/(?P<id>[^/]+/[^/?#]+)''' _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL - 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 
+ 'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', 'info_dict': { - 'id': '3172', - 'display_id': 'nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet', + 'id': '13628', + 'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', 'ext': 'mp4', 'title': 'Svensken lär sig njuta av livet', 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', - 'duration': 2650, - 'timestamp': 1365454320, + 'duration': 2649.856, + 'timestamp': 1365453720, 'upload_date': '20130408', - 'creator': 'Kanal 5 (Home)', + 'creator': 'Kanal 5', 'series': 'Nugammalt - 77 händelser som format Sverige', 'season_number': 1, 'episode_number': 1, - 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, }, }, { # geo restricted, via secure api, unsigned download hls URL - 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', 'info_dict': { - 'id': '70816', - 'display_id': 'mig-og-min-mor/season-6-episode-12', + 'id': '104465', + 'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', 'ext': 'mp4', - 'title': 'Episode 12', - 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', - 'duration': 2563, - 'timestamp': 1429696800, - 'upload_date': '20150422', - 'creator': 'Kanal 4 (Home)', - 'series': 'Mig og min mor', - 'season_number': 6, - 'episode_number': 12, - 'age_limit': 0, + 'title': 'Ted Bundy: Mind Of A Monster', + 'description': 'md5:8b780f6f18de4dae631668b8a9637995', + 'duration': 5290.027, + 'timestamp': 1570694400, + 'upload_date': '20191010', + 'creator': 'ID - Investigation Discovery', + 'series': 'Ted Bundy: Mind Of A Monster', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, }, - }, { - # geo restricted, via direct 
unsigned hls URL - 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', - 'only_matching': True, }, { # disco-api 'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7', @@ -89,19 +83,59 @@ class DPlayIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, + 'skip': 'Available for Premium users', }, { - - 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3', + 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', + 'md5': '2b808ffb00fc47b884a172ca5d13053c', + 'info_dict': { + 'id': '6918', + 'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'ext': 'mp4', + 'title': 'Luigi Di Maio: la psicosi di Stanislawskij', + 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'upload_date': '20160524', + 'timestamp': 1464076800, + 'series': 'Biografie imbarazzanti', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + }, { + 'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/', + 'info_dict': { + 'id': '21652', + 'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1', + 'ext': 'mp4', + 'title': 'Episodio 1', + 'description': 'md5:b9dcff2071086e003737485210675f69', + 'thumbnail': r're:^https?://.*\.png', + 'upload_date': '20180709', + 'timestamp': 1531173540, + 'series': 'La fiebre del oro', + 'season_number': 8, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', 'only_matching': True, }, { - 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001', + 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, }] - def _get_disco_api_info(self, url, display_id, disco_host, realm): - disco_base = 'https://' + disco_host + def _get_disco_api_info(self, url, display_id, 
disco_host, realm, country): + geo_countries = [country.upper()] + self._initialize_geo_bypass({ + 'countries': geo_countries, + }) + disco_base = 'https://%s/' % disco_host token = self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', + disco_base + 'token', display_id, 'Downloading token', query={ 'realm': realm, })['data']['attributes']['token'] @@ -110,17 +144,30 @@ class DPlayIE(InfoExtractor): 'Authorization': 'Bearer ' + token, } video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, + disco_base + 'content/videos/' + display_id, display_id, headers=headers, query={ - 'include': 'show' + 'include': 'images,primaryChannel,show,tags' }) video_id = video['data']['id'] info = video['data']['attributes'] - title = info['name'] + title = info['name'].strip() formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id, headers=headers)['data']['attributes']['streaming'].items(): + try: + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + display_id, headers=headers)['data']['attributes']['streaming'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code == 'access.denied.missingpackage': + self.raise_login_required() + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise + for format_id, format_dict in streaming.items(): if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') @@ -142,235 +189,55 @@ class DPlayIE(InfoExtractor): }) self._sort_formats(formats) - series = None - try: - included = video.get('included') - if isinstance(included, 
list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass + creator = series = None + tags = [] + thumbnails = [] + included = video.get('included') or [] + if isinstance(included, list): + for e in included: + attributes = e.get('attributes') + if not attributes: + continue + e_type = e.get('type') + if e_type == 'channel': + creator = attributes.get('name') + elif e_type == 'image': + src = attributes.get('src') + if src: + thumbnails.append({ + 'url': src, + 'width': int_or_none(attributes.get('width')), + 'height': int_or_none(attributes.get('height')), + }) + if e_type == 'show': + series = attributes.get('name') + elif e_type == 'tag': + name = attributes.get('name') + if name: + tags.append(name) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), + 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, 'season_number': int_or_none(info.get('seasonNumber')), 'episode_number': int_or_none(info.get('episodeNumber')), 'age_limit': int_or_none(info.get('minimum_age')), + 'creator': creator, + 'tags': tags, + 'thumbnails': thumbnails, 'formats': formats, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') - domain = mobj.group('domain') - - self._initialize_geo_bypass({ - 'countries': [mobj.group('country').upper()], - }) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id', default=None) - - if not video_id: - host = mobj.group('host') - return self._get_disco_api_info( - url, display_id, 'disco-api.' 
+ host, host.replace('.', '')) - - info = self._download_json( - 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), - video_id)['data'][0] - - title = info['title'] - - PROTOCOLS = ('hls', 'hds') - formats = [] - - def extract_formats(protocol, manifest_url): - if protocol == 'hls': - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) - # Sometimes final URLs inside m3u8 are unsigned, let's fix this - # ourselves. Also fragments' URLs are only served signed for - # Safari user agent. - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) - for m3u8_format in m3u8_formats: - m3u8_format.update({ - 'url': update_url_query(m3u8_format['url'], query), - 'http_headers': { - 'User-Agent': USER_AGENTS['Safari'], - }, - }) - formats.extend(m3u8_formats) - elif protocol == 'hds': - formats.extend(self._extract_f4m_formats( - manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', - video_id, f4m_id=protocol, fatal=False)) - - domain_tld = domain.split('.')[-1] - if domain_tld in ('se', 'dk', 'no'): - for protocol in PROTOCOLS: - # Providing dsc-geo allows to bypass geo restriction in some cases - self._set_cookie( - 'secure.dplay.%s' % domain_tld, 'dsc-geo', - json.dumps({ - 'countryCode': domain_tld.upper(), - 'expiry': (time.time() + 20 * 60) * 1000, - })) - stream = self._download_json( - 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s' - % (domain_tld, video_id, protocol), video_id, - 'Downloading %s stream JSON' % protocol, fatal=False) - if stream and stream.get(protocol): - extract_formats(protocol, stream[protocol]) - - # The last resort is to try direct unsigned hls/hds URLs from info dictionary. - # Sometimes this does work even when secure API with dsc-geo has failed (e.g. - # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/). 
- if not formats: - for protocol in PROTOCOLS: - if info.get(protocol): - extract_formats(protocol, info[protocol]) - - self._sort_formats(formats) - - subtitles = {} - for lang in ('se', 'sv', 'da', 'nl', 'no'): - for format_id in ('web_vtt', 'vtt', 'srt'): - subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id)) - if subtitle_url: - subtitles.setdefault(lang, []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('video_metadata_longDescription'), - 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), - 'timestamp': int_or_none(info.get('video_publish_date')), - 'creator': info.get('video_metadata_homeChannel'), - 'series': info.get('video_metadata_show'), - 'season_number': int_or_none(info.get('season')), - 'episode_number': int_or_none(info.get('episode')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - 'subtitles': subtitles, - } - - -class DPlayItIE(InfoExtractor): - _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)' - _GEO_COUNTRIES = ['IT'] - _TEST = { - 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', - 'md5': '2b808ffb00fc47b884a172ca5d13053c', - 'info_dict': { - 'id': '6918', - 'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij', - 'ext': 'mp4', - 'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij', - 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'upload_date': '20160524', - 'series': 'Biografie imbarazzanti', - 'season_number': 1, - 'episode': 'Luigi Di Maio: la psicosi di Stanislawskij', - 'episode_number': 1, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = remove_end(self._og_search_title(webpage), ' | Dplay') - - video_id = None - - info = self._search_regex( - 
r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', - webpage, 'playback JSON', default=None) - if info: - for _ in range(2): - info = self._parse_json(info, display_id, fatal=False) - if not info: - break - else: - video_id = try_get(info, lambda x: x['data']['id']) - - if not info: - info_url = self._search_regex( - (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', - r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), - webpage, 'info url', group='url') - - info_url = urljoin(url, info_url) - video_id = info_url.rpartition('/')[-1] - - try: - info = self._download_json( - info_url, display_id, headers={ - 'Authorization': 'Bearer %s' % self._get_cookies(url).get( - 'dplayit_token').value, - 'Referer': url, - }) - if isinstance(info, compat_str): - info = self._parse_json(info, display_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - if error.get('code') == 'access.denied.geoblocked': - self.raise_geo_restricted( - msg=error.get('detail'), countries=self._GEO_COUNTRIES) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - raise - - hls_url = info['data']['attributes']['streaming']['hls']['url'] - - formats = self._extract_m3u8_formats( - hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - - series = self._html_search_regex( - r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', - webpage, 'series', fatal=False) - episode = self._search_regex( - r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)', - webpage, 'episode', fatal=False) - - mobj = re.search( - r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})', - webpage) - if mobj: - season_number = 
int(mobj.group('season_number')) - episode_number = int(mobj.group('episode_number')) - upload_date = unified_strdate(mobj.group('upload_date')) - else: - season_number = episode_number = upload_date = None - - return { - 'id': compat_str(video_id or display_id), - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'upload_date': upload_date, - 'formats': formats, - } + domain = mobj.group('domain').lstrip('www.') + country = mobj.group('country') or mobj.group('subdomain_country') + host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + return self._get_disco_api_info( + url, display_id, host, 'dplay' + country, country) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1db21529f..a8fe0de1a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -277,10 +277,7 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import ( - DPlayIE, - DPlayItIE, -) +from .dplay import DPlayIE from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE From 548c395716b1d5aa215e526fcb052a03926c1573 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 27 Oct 2019 17:52:46 +0100 Subject: [PATCH 036/154] [soundcloud] improve extraction - improve format extraction(closes #22123) - extract uploader_id and uploader_url(closes #21916) - extract all known thumbnails(closes #19071)(closes #20659) - fix extration for private playlists(closes #20976) - add support for playlist embeds(#20976) - skip preview formats(closes #22806) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 6 +- youtube_dl/extractor/soundcloud.py | 497 ++++++++++++++--------------- 3 files changed, 248 
insertions(+), 256 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a8fe0de1a..388c1ebe6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1033,6 +1033,7 @@ from .snotr import SnotrIE from .sohu import SohuIE from .sonyliv import SonyLIVIE from .soundcloud import ( + SoundcloudEmbedIE, SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f66cae0eb..1c0780e98 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -80,7 +80,7 @@ from .theplatform import ThePlatformIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE -from .soundcloud import SoundcloudIE +from .soundcloud import SoundcloudEmbedIE from .tunein import TuneInBaseIE from .vbox7 import Vbox7IE from .dbtv import DBTVIE @@ -2749,9 +2749,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - soundcloud_urls = SoundcloudIE._extract_urls(webpage) + soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) # Look for tunein player tunein_urls = TuneInBaseIE._extract_urls(webpage) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 05538f3d6..875b9d887 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -11,14 +11,13 @@ from .common import ( from ..compat import ( compat_str, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, float_or_none, + HEADRequest, int_or_none, KNOWN_EXTENSIONS, - merge_dicts, mimetype2ext, str_or_none, try_get, @@ -28,6 
+27,20 @@ from ..utils import ( ) +class SoundcloudEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?url=(?P<id>.*)' + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + + def _real_extract(self, url): + return self.url_result(compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query)['url'][0]) + + class SoundcloudIE(InfoExtractor): """Information extractor for soundcloud.com To access the media, the uid of the song and a stream token @@ -44,9 +57,8 @@ class SoundcloudIE(InfoExtractor): (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) - |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) + |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) (?:/?\?secret_token=(?P<secret_token>[^&]+))?) - |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = 'soundcloud' @@ -60,6 +72,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. 
ExTerrestrial Music', + 'uploader_id': '1571244', 'timestamp': 1349920598, 'upload_date': '20121011', 'duration': 143.216, @@ -79,6 +92,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', + 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', 'duration': 30, @@ -92,6 +106,7 @@ class SoundcloudIE(InfoExtractor): # rtmp 'skip_download': True, }, + 'skip': 'Preview', }, # private link { @@ -103,6 +118,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Youtube - Dl Test Video \'\' Ä↭', 'description': 'test chars: \"\'/\\ä↭', 'uploader': 'jaimeMF', + 'uploader_id': '69767071', 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9.927, @@ -123,6 +139,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Youtube - Dl Test Video \'\' Ä↭', 'description': 'test chars: \"\'/\\ä↭', 'uploader': 'jaimeMF', + 'uploader_id': '69767071', 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9.927, @@ -143,6 +160,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Bus Brakes', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', + 'uploader_id': '73680509', 'timestamp': 1389232924, 'upload_date': '20140109', 'duration': 17.346, @@ -163,6 +181,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', 'uploader': 'Ori Uplift Music', + 'uploader_id': '12563093', 'timestamp': 1504206263, 'upload_date': '20170831', 'duration': 7449.096, @@ -183,6 +202,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Sideways (Prod. 
Mad Real)', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'uploader': 'garyvee', + 'uploader_id': '2366352', 'timestamp': 1488152409, 'upload_date': '20170226', 'duration': 207.012, @@ -207,6 +227,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', 'uploader': 'Giovanni Sarani', + 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', 'duration': 180.157, @@ -221,114 +242,81 @@ class SoundcloudIE(InfoExtractor): } ] + _API_BASE = 'https://api.soundcloud.com/' + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', - webpage)] + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } @classmethod def _resolv_url(cls, url): - return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): + def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): track_id = compat_str(info['id']) title = info['title'] - name = full_title or track_id - if quiet: - self.report_extraction(name) - thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') - if isinstance(thumbnail, compat_str): - thumbnail = thumbnail.replace('-large', '-t500x500') - username = try_get(info, lambda x: x['user']['username'], compat_str) - - def extract_count(key): - return int_or_none(info.get('%s_count' % key)) - - like_count = 
extract_count('favoritings') - if like_count is None: - like_count = extract_count('likes') - - result = { - 'id': track_id, - 'uploader': username, - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, - 'description': info.get('description'), - 'thumbnail': thumbnail, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': extract_count('playback'), - 'like_count': like_count, - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), - } + track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] query = {'client_id': self._CLIENT_ID} - if secret_token is not None: + if secret_token: query['secret_token'] = secret_token - if info.get('downloadable', False): - # We can build a direct link to the song + + if info.get('downloadable'): format_url = update_url_query( - 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) + info.get('download_url') or track_base_url + '/download', query) format_urls.add(format_url) + if version == 2: + v1_info = self._download_json( + track_base_url, track_id, query=query, fatal=False) or {} + else: + v1_info = info formats.append({ 'format_id': 'download', - 'ext': info.get('original_format', 'mp3'), + 'ext': v1_info.get('original_format') or 'mp3', + 'filesize': int_or_none(v1_info.get('original_content_size')), 'url': format_url, - 'vcodec': 'none', 'preference': 10, }) - # Old API, does not work for some tracks (e.g. 
- # https://soundcloud.com/giovannisarani/mezzo-valzer) - format_dict = self._download_json( - 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query=query, fatal=False) + def invalid_url(url): + return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url) - if format_dict: - for key, stream_url in format_dict.items(): - if stream_url in format_urls: - continue - format_urls.add(stream_url) - ext, abr = 'mp3', None - mobj = re.search(r'_([^_]+)_(\d+)_url', key) - if mobj: - ext, abr = mobj.groups() - abr = int(abr) - if key.startswith('http'): - stream_formats = [{ - 'format_id': key, - 'ext': ext, - 'url': stream_url, - }] - elif key.startswith('rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - stream_formats = [{ - 'format_id': key, - 'url': url, - 'play_path': 'mp3:' + path, - 'ext': 'flv', - }] - elif key.startswith('hls'): - stream_formats = self._extract_m3u8_formats( - stream_url, track_id, ext, entry_protocol='m3u8_native', - m3u8_id=key, fatal=False) - else: - continue - - if abr: - for f in stream_formats: - f['abr'] = abr - - formats.extend(stream_formats) + def add_format(f, protocol): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + }) + formats.append(f) # New API transcodings = try_get( @@ -337,129 +325,165 @@ class SoundcloudIE(InfoExtractor): if not isinstance(t, dict): continue format_url = url_or_none(t.get('url')) - if not format_url: + if not format_url or t.get('snipped') 
or '/preview/' in format_url: continue stream = self._download_json( - update_url_query(format_url, query), track_id, fatal=False) + format_url, track_id, query=query, fatal=False) if not isinstance(stream, dict): continue stream_url = url_or_none(stream.get('url')) - if not stream_url: - continue - if stream_url in format_urls: + if invalid_url(stream_url): continue format_urls.add(stream_url) - protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') if protocol != 'hls' and '/hls' in format_url: protocol = 'hls' ext = None preset = str_or_none(t.get('preset')) if preset: ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - mimetype = try_get( - t, lambda x: x['format']['mime_type'], compat_str) - ext = mimetype2ext(mimetype) or 'mp3' - format_id_list = [] - if protocol: - format_id_list.append(protocol) - format_id_list.append(ext) - format_id = '_'.join(format_id_list) - formats.append({ + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ 'url': stream_url, - 'format_id': format_id, 'ext': ext, - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - }) + }, 'http' if protocol == 'progressive' else protocol) + + if not formats: + # Old API, does not work for some tracks (e.g. + # https://soundcloud.com/giovannisarani/mezzo-valzer) + # and might serve preview URLs (e.g. 
+ # http://www.soundcloud.com/snbrn/ele) + format_dict = self._download_json( + track_base_url + '/streams', track_id, + 'Downloading track url', query=query, fatal=False) or {} + + for key, stream_url in format_dict.items(): + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) + if mobj: + protocol, ext, abr = mobj.groups() + add_format({ + 'abr': abr, + 'ext': ext, + 'url': stream_url, + }, protocol) if not formats: # We fallback to the stream_url in the original info, this # cannot be always used, sometimes it can give an HTTP 404 error - formats.append({ - 'format_id': 'fallback', - 'url': update_url_query(info['stream_url'], query), - 'ext': 'mp3', - }) - self._check_formats(formats, track_id) + urlh = self._request_webpage( + HEADRequest(info.get('stream_url') or track_base_url + '/stream'), + track_id, query=query, fatal=False) + if urlh: + stream_url = urlh.geturl() + if not invalid_url(stream_url): + add_format({'url': stream_url}, 'http') for f in formats: f['vcodec'] = 'none' self._sort_formats(formats) - result['formats'] = formats - return result + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 
'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) + mobj = re.match(self._VALID_URL, url) track_id = mobj.group('track_id') - new_info = {} - if track_id is not None: - info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + query = { + 'client_id': self._CLIENT_ID, + } + if track_id: + info_json_url = self._API_V2_BASE + 'tracks/' + track_id full_title = track_id token = mobj.group('secret_token') if token: - info_json_url += '&secret_token=' + token - elif mobj.group('player'): - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - real_url = query['url'][0] - # If the token is in the query of the original url we have to - # manually add it - if 'secret_token' in query: - real_url += '?secret_token=' + query['secret_token'][0] - return self.url_result(real_url) + query['secret_token'] = token else: - # extract uploader (which is in the url) - uploader = mobj.group('uploader') - # extract simple title (uploader + slug of song title) - slug_title = mobj.group('title') + full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') token = mobj.group('token') - full_title = resolve_title = '%s/%s' % (uploader, slug_title) if token: resolve_title += '/%s' % token + info_json_url = self._resolv_url(self._BASE_URL + 
resolve_title) - webpage = self._download_webpage(url, full_title, fatal=False) - if webpage: - entries = self._parse_json( - self._search_regex( - r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage, - 'data', default='[]'), full_title, fatal=False) - if entries: - for e in entries: - if not isinstance(e, dict): - continue - if e.get('id') != 67: - continue - data = try_get(e, lambda x: x['data'][0], dict) - if data: - new_info = data - break - info_json_url = self._resolv_url( - 'https://soundcloud.com/%s' % resolve_title) - - # Contains some additional info missing from new_info + version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON') + info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) + if not info: + info = self._download_json( + info_json_url.replace(self._API_V2_BASE, self._API_BASE), + full_title, 'Downloading info JSON', query=query) + version = 1 - return self._extract_info_dict( - merge_dicts(info, new_info), full_title, secret_token=token) + return self._extract_info_dict(info, full_title, token, version) class SoundcloudPlaylistBaseIE(SoundcloudIE): - @staticmethod - def _extract_id(e): - return compat_str(e['id']) if e.get('id') else None - - def _extract_track_entries(self, tracks): - return [ - self.url_result( - track['permalink_url'], SoundcloudIE.ie_key(), - video_id=self._extract_id(track)) - for track in tracks if track.get('permalink_url')] + def _extract_track_entries(self, tracks, token=None): + entries = [] + for track in tracks: + track_id = str_or_none(track.get('id')) + url = track.get('permalink_url') + if not url: + if not track_id: + continue + url = self._API_V2_BASE + 'tracks/' + track_id + if token: + url += '?secret_token=' + token + entries.append(self.url_result( + url, SoundcloudIE.ie_key(), track_id)) + return entries class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -480,41 +504,28 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): def _real_extract(self, 
url): mobj = re.match(self._VALID_URL, url) - # extract uploader (which is in the url) - uploader = mobj.group('uploader') - # extract simple title (uploader + slug of song title) - slug_title = mobj.group('slug_title') - full_title = '%s/sets/%s' % (uploader, slug_title) - url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title) - + full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') token = mobj.group('token') if token: full_title += '/' + token - url += '/' + token - resolv_url = self._resolv_url(url) - info = self._download_json(resolv_url, full_title) + info = self._download_json(self._resolv_url( + self._BASE_URL + full_title), full_title) if 'errors' in info: msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = self._extract_track_entries(info['tracks']) + entries = self._extract_track_entries(info['tracks'], token) - return { - '_type': 'playlist', - 'entries': entries, - 'id': '%s' % info['id'], - 'title': info['title'], - } + return self.playlist_result( + entries, str_or_none(info.get('id')), info.get('title')) class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): - _API_V2_BASE = 'https://api-v2.soundcloud.com' - def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { - 'limit': 50, + 'limit': 2000000000, 'client_id': self._CLIENT_ID, 'linked_partitioning': '1', } @@ -522,12 +533,13 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): query = COMMON_QUERY.copy() query['offset'] = 0 - next_href = base_url + '?' 
+ compat_urllib_parse_urlencode(query) + next_href = base_url entries = [] for i in itertools.count(): response = self._download_json( - next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + next_href, playlist_id, + 'Downloading track page %s' % (i + 1), query=query) collection = response['collection'] @@ -546,9 +558,8 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): continue return self.url_result( permalink_url, - ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, - video_id=self._extract_id(cand), - video_title=cand.get('title')) + SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + str_or_none(cand.get('id')), cand.get('title')) for e in collection: entry = resolve_entry((e, e.get('track'), e.get('playlist'))) @@ -559,11 +570,10 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): if not next_href: break - parsed_next_href = compat_urlparse.urlparse(response['next_href']) - qs = compat_urlparse.parse_qs(parsed_next_href.query) - qs.update(COMMON_QUERY) - next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + next_href = response['next_href'] + parsed_next_href = compat_urlparse.urlparse(next_href) + query = compat_urlparse.parse_qs(parsed_next_href.query) + query.update(COMMON_QUERY) return { '_type': 'playlist', @@ -609,7 +619,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): 'url': 'https://soundcloud.com/jcv246/sets', 'info_dict': { 'id': '12982173', - 'title': 'Jordi / cv (Playlists)', + 'title': 'Jordi / cv (Sets)', }, 'playlist_mincount': 2, }, { @@ -636,39 +646,29 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): }] _BASE_URL_MAP = { - 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'sets': 
'%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - } - - _TITLE_MAP = { - 'all': 'All', - 'tracks': 'Tracks', - 'albums': 'Albums', - 'sets': 'Playlists', - 'reposts': 'Reposts', - 'likes': 'Likes', - 'spotlight': 'Spotlight', + 'all': 'stream/users/%s', + 'tracks': 'users/%s/tracks', + 'albums': 'users/%s/albums', + 'sets': 'users/%s/playlists', + 'reposts': 'stream/users/%s/reposts', + 'likes': 'users/%s/likes', + 'spotlight': 'users/%s/spotlight', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - url = 'https://soundcloud.com/%s/' % uploader - resolv_url = self._resolv_url(url) user = self._download_json( - resolv_url, uploader, 'Downloading user info') + self._resolv_url(self._BASE_URL + uploader), + uploader, 'Downloading user info') resource = mobj.group('rsrc') or 'all' return self._extract_playlist( - self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), - '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) + self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], + str_or_none(user.get('id')), + '%s (%s)' % (user['username'], resource.capitalize())) class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): @@ -678,7 +678,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', 'info_dict': { 'id': '286017854', - 'title': 'Track station: your-text', + 'title': 'Track station: your text', }, 'playlist_mincount': 47, }] @@ -686,19 +686,17 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): track_name = self._match_id(url) - webpage = self._download_webpage(url, track_name) - + 
track = self._download_json(self._resolv_url(url), track_name) track_id = self._search_regex( - r'soundcloud:track-stations:(\d+)', webpage, 'track id') + r'soundcloud:track-stations:(\d+)', track['id'], 'track id') return self._extract_playlist( - '%s/stations/soundcloud:track-stations:%s/tracks' - % (self._API_V2_BASE, track_id), - track_id, 'Track station: %s' % track_name) + self._API_V2_BASE + 'stations/%s/tracks' % track['id'], + track_id, 'Track station: %s' % track['title']) class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): - _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' + _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' _TESTS = [{ 'url': 'https://api.soundcloud.com/playlists/4110309', @@ -713,29 +711,22 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') - base_url = '%s//api.soundcloud.com/playlists/%s.json?' 
% (self.http_scheme(), playlist_id) - data_dict = { + query = { 'client_id': self._CLIENT_ID, } token = mobj.group('token') - if token: - data_dict['secret_token'] = token + query['secret_token'] = token - data = compat_urllib_parse_urlencode(data_dict) data = self._download_json( - base_url + data, playlist_id, 'Downloading playlist') + self._API_V2_BASE + 'playlists/' + playlist_id, + playlist_id, 'Downloading playlist', query=query) - entries = self._extract_track_entries(data['tracks']) + entries = self._extract_track_entries(data['tracks'], token) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': data.get('title'), - 'description': data.get('description'), - 'entries': entries, - } + return self.playlist_result( + entries, playlist_id, data.get('title'), data.get('description')) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): @@ -753,18 +744,18 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): _SEARCH_KEY = 'scsearch' _MAX_RESULTS_PER_PAGE = 200 _DEFAULT_RESULTS_PER_PAGE = 50 - _API_V2_BASE = 'https://api-v2.soundcloud.com' def _get_collection(self, endpoint, collection_id, **query): limit = min( query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), self._MAX_RESULTS_PER_PAGE) - query['limit'] = limit - query['client_id'] = self._CLIENT_ID - query['linked_partitioning'] = '1' - query['offset'] = 0 - data = compat_urllib_parse_urlencode(query) - next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) + query.update({ + 'limit': limit, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': 1, + 'offset': 0, + }) + next_url = update_url_query(self._API_V2_BASE + endpoint, query) collected_results = 0 @@ -791,5 +782,5 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): break def _get_n_results(self, query, n): - tracks = self._get_collection('/search/tracks', query, limit=n, q=query) + tracks = self._get_collection('search/tracks', query, limit=n, q=query) return self.playlist_result(tracks, 
playlist_title=query) From dd90451f0f4867480c5ed8cb3588b30312204e3f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 27 Oct 2019 22:02:46 +0100 Subject: [PATCH 037/154] [tenplay] Add new extractor(closes #21446) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tenplay.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/tenplay.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 388c1ebe6..339a141a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1133,6 +1133,7 @@ from .telequebec import ( from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .tennistv import TennisTVIE +from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py new file mode 100644 index 000000000..dff44a4e2 --- /dev/null +++ b/youtube_dl/extractor/tenplay.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_iso8601, + smuggle_url, +) + + +class TenPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/[^/]+/episodes/[^/]+/[^/]+/(?P<id>tpv\d{6}[a-z]{5})' + _TEST = { + 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga', + 'info_dict': { + 'id': '6060533435001', + 'ext': 'mp4', + 'title': 'MasterChef - S1 Ep. 
1', + 'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c', + 'age_limit': 10, + 'timestamp': 1240828200, + 'upload_date': '20090427', + 'uploader_id': '2199827728001', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + } + BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + + def _real_extract(self, url): + content_id = self._match_id(url) + data = self._download_json( + 'https://10play.com.au/api/video/' + content_id, content_id) + video = data.get('video') or {} + metadata = data.get('metaData') or {} + brightcove_id = video.get('videoId') or metadata['showContentVideoId'] + brightcove_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['AU']}) + + return { + '_type': 'url_transparent', + 'url': brightcove_url, + 'id': content_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + 'description': video.get('description'), + 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), + 'series': metadata.get('showName'), + 'season': metadata.get('showContentSeason'), + 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), + 'ie_key': 'BrightcoveNew', + } From 71fa0b04f9099090f43f6747632a9bdc3a4b1015 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 13:30:30 +0100 Subject: [PATCH 038/154] [makertv] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/makertv.py | 32 ------------------------------ 2 files changed, 33 deletions(-) delete mode 100644 youtube_dl/extractor/makertv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 339a141a5..4229518fd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -604,7 +604,6 @@ from .mailru import ( 
MailRuMusicIE, MailRuMusicSearchIE, ) -from .makertv import MakerTVIE from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py deleted file mode 100644 index 8eda69cfc..000000000 --- a/youtube_dl/extractor/makertv.py +++ /dev/null @@ -1,32 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class MakerTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer\.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' - _TEST = { - 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', - 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', - 'info_dict': { - 'id': 'Fh3QgymL9gsc', - 'ext': 'mp4', - 'title': 'Maze Runner: The Scorch Trials Official Movie Review', - 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', - 'upload_date': '20150918', - 'timestamp': 1442549540, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'jwplatform:%s' % jwplatform_id, - 'ie_key': 'JWPlatform', - } From 80c2126e80bc41f7b66d325c4c67c61887c58fb0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 13:32:35 +0100 Subject: [PATCH 039/154] [thesun] fix extraction(closes #16966) --- youtube_dl/extractor/thesun.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py index 22d003776..15d4a6932 100644 --- a/youtube_dl/extractor/thesun.py +++ b/youtube_dl/extractor/thesun.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .ooyala import OoyalaIE +from ..utils import extract_attributes class TheSunIE(InfoExtractor): @@ -16,6 
+16,7 @@ class TheSunIE(InfoExtractor): }, 'playlist_count': 2, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): article_id = self._match_id(url) @@ -23,10 +24,15 @@ class TheSunIE(InfoExtractor): webpage = self._download_webpage(url, article_id) entries = [] - for ooyala_id in re.findall( - r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)', + for video in re.findall( + r'<video[^>]+data-video-id-pending=[^>]+>', webpage): - entries.append(OoyalaIE._build_url_result(ooyala_id)) + attrs = extract_attributes(video) + video_id = attrs['data-video-id-pending'] + account_id = attrs.get('data-account', '5067014667001') + entries.append(self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), + 'BrightcoveNew', video_id)) return self.playlist_result( entries, article_id, self._og_search_title(webpage, fatal=False)) From 0f9d53566a5956854af77173c0e910ed7454aadf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 15:17:06 +0100 Subject: [PATCH 040/154] [la7] update Kaltura service URL(closes #22358) --- youtube_dl/extractor/la7.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index 6373268c4..c3b4ffa7e 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -20,7 +20,7 @@ class LA7IE(InfoExtractor): 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': 'inccool8-02-10-2015-163722', + 'id': '0_42j6wd36', 'ext': 'mp4', 'title': 'Inc.Cool8', 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. 
dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', @@ -57,7 +57,7 @@ class LA7IE(InfoExtractor): return { '_type': 'url_transparent', 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { - 'service_url': 'http://kdam.iltrovatore.it', + 'service_url': 'http://nkdam.iltrovatore.it', }), 'id': video_id, 'title': player_data['title'], From 3e252cca0e81aef55b0288f86991bb566878a9fc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 17:39:01 +0100 Subject: [PATCH 041/154] [macgamestore] remove extractor Covered by generic extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/macgamestore.py | 42 ---------------------------- 2 files changed, 43 deletions(-) delete mode 100644 youtube_dl/extractor/macgamestore.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4229518fd..1807744be 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -598,7 +598,6 @@ from .lynda import ( LyndaCourseIE ) from .m6 import M6IE -from .macgamestore import MacGameStoreIE from .mailru import ( MailRuIE, MailRuMusicIE, diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py deleted file mode 100644 index 43db9929c..000000000 --- a/youtube_dl/extractor/macgamestore.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class MacGameStoreIE(InfoExtractor): - IE_NAME = 'macgamestore' - IE_DESC = 'MacGameStore trailers' - _VALID_URL = r'https?://(?:www\.)?macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', - 'md5': '8649b8ea684b6666b4c5be736ecddc61', - 'info_dict': { - 'id': '2450', - 'ext': 'm4v', - 'title': 'Crow', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - 
webpage = self._download_webpage( - url, video_id, 'Downloading trailer page') - - if '>Missing Media<' in webpage: - raise ExtractorError( - 'Trailer %s does not exist' % video_id, expected=True) - - video_title = self._html_search_regex( - r'<title>MacGameStore: (.*?) Trailer', webpage, 'title') - - video_url = self._html_search_regex( - r'(?s)', - webpage, 'video URL') - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title - } From 831b732da1d0796a1927af8767d76af780cc90f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:41:17 +0100 Subject: [PATCH 042/154] [learnr] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/learnr.py | 33 ------------------------------ 2 files changed, 34 deletions(-) delete mode 100644 youtube_dl/extractor/learnr.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1807744be..9f3a5f8a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -546,7 +546,6 @@ from .lcp import ( LcpPlayIE, LcpIE, ) -from .learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioIE, diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py deleted file mode 100644 index 1435e090e..000000000 --- a/youtube_dl/extractor/learnr.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class LearnrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript', - 'md5': '3719fdf0a68397f49899e82c308a89de', - 'info_dict': { - 'id': '51624', - 'ext': 'mp4', - 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript', - 'description': 
'md5:b36dbfa92350176cdf12b4d388485503', - 'uploader': 'LearnCode.academy', - 'uploader_id': 'learncodeacademy', - 'upload_date': '20131021', - }, - 'add_ie': ['Youtube'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - return { - '_type': 'url_transparent', - 'url': self._search_regex( - r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'), - 'id': video_id, - } From b3c2fa6dad607da6455a13d232461d4380e4b53c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:42:33 +0100 Subject: [PATCH 043/154] [tutv] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/tutv.py | 36 ------------------------------ 2 files changed, 37 deletions(-) delete mode 100644 youtube_dl/extractor/tutv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f3a5f8a5..39282b785 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1183,7 +1183,6 @@ from .tunein import ( ) from .tunepk import TunePkIE from .turbo import TurboIE -from .tutv import TutvIE from .tv2 import ( TV2IE, TV2ArticleIE, diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py deleted file mode 100644 index 362318b24..000000000 --- a/youtube_dl/extractor/tutv.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_parse_qs, -) - - -class TutvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P[^/?]+)' - _TEST = { - 'url': 'http://tu.tv/videos/robots-futbolistas', - 'md5': '0cd9e28ad270488911b0d2a72323395d', - 'info_dict': { - 'id': '2973058', - 'ext': 'mp4', - 'title': 'Robots futbolistas', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 
'internal video ID') - - data_content = self._download_webpage( - 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') - - return { - 'id': internal_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - } From 702984eca955f61811078c33337faf9eebeb48c8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:49:05 +0100 Subject: [PATCH 044/154] [hark] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/hark.py | 33 ------------------------------ 2 files changed, 34 deletions(-) delete mode 100644 youtube_dl/extractor/hark.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 39282b785..114ede8b9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -428,7 +428,6 @@ from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE -from .hark import HarkIE from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py deleted file mode 100644 index 342a6130e..000000000 --- a/youtube_dl/extractor/hark.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class HarkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P.+?)-.+' - _TEST = { - 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - 'md5': '6783a58491b47b92c7c1af5a77d4cbee', - 'info_dict': { - 'id': 'mmbzyhkgny', - 'ext': 'mp3', - 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', - 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at 
addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', - 'duration': 11, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.hark.com/clips/%s.json' % video_id, video_id) - - return { - 'id': video_id, - 'url': data['url'], - 'title': data['name'], - 'description': data.get('description'), - 'thumbnail': data.get('image_original'), - 'duration': data.get('duration'), - } From 895e5c03db310ee97d585360ef8e6ae117e4cbd6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:31:20 +0100 Subject: [PATCH 045/154] [nbcnews] fix extraction closes #12569 closes #12576 closes #21703 closes #21923 --- youtube_dl/extractor/nbc.py | 86 +++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 10680b202..5bc39d002 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,9 +9,13 @@ from .theplatform import ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( - smuggle_url, - update_url_query, int_or_none, + js_to_json, + parse_duration, + smuggle_url, + try_get, + unified_timestamp, + update_url_query, ) @@ -285,13 +289,12 @@ class NBCNewsIE(ThePlatformIE): _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'af1adfa51312291a017720403826bb64', + 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', 'info_dict': { 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', - 'uploader': 'NBCU-NEWS', 'timestamp': 1401363060, 'upload_date': '20140529', }, @@ -309,28 +312,26 @@ class NBCNewsIE(ThePlatformIE): }, { 'url': 
'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '73135a2e0ef819107bbb55a5a9b2a802', + 'md5': '8eb831eca25bfa7d25ddd83e85946548', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, - 'uploader': 'NBCU-NEWS', 'upload_date': '20150205', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', 'info_dict': { - 'id': '529953347624', + 'id': 'n431456', 'ext': 'mp4', - 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', - 'description': 'md5:c8be487b2d80ff0594c005add88d8351', + 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'", + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, - 'uploader': 'NBCU-NEWS', }, }, { @@ -343,7 +344,6 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, - 'uploader': 'NBCU-NEWS', }, }, { @@ -357,7 +357,6 @@ class NBCNewsIE(ThePlatformIE): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', - 'uploader': 'NBCU-NEWS', }, }, { @@ -373,20 +372,61 @@ class NBCNewsIE(ThePlatformIE): def _real_extract(self, url): video_id = self._match_id(url) - if not video_id.isdigit(): - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id) - video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] + data = self._parse_json(self._search_regex( + 
r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id, js_to_json) + video_data = try_get(data, lambda x: x['video']['current'], dict) + if not video_data: + video_data = data['article']['content'][0]['primaryMedia']['video'] + title = video_data['headline']['primary'] + + formats = [] + for va in video_data.get('videoAssets', []): + public_url = va.get('publicUrl') + if not public_url: + continue + if '://link.theplatform.com/' in public_url: + public_url = update_url_query(public_url, {'format': 'redirect'}) + format_id = va.get('format') + if format_id == 'M3U': + formats.extend(self._extract_m3u8_formats( + public_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + continue + tbr = int_or_none(va.get('bitrate'), 1000) + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': public_url, + 'width': int_or_none(va.get('width')), + 'height': int_or_none(va.get('height')), + 'tbr': tbr, + 'ext': 'mp4', + }) + self._sort_formats(formats) + + subtitles = {} + closed_captioning = video_data.get('closedCaptioning') + if closed_captioning: + for cc_url in closed_captioning.values(): + if not cc_url: + continue + subtitles.setdefault('en', []).append({ + 'url': cc_url, + }) return { - '_type': 'url_transparent', 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), - 'ie_key': 'ThePlatformFeed', + 'title': title, + 'description': try_get(video_data, lambda x: x['description']['primary']), + 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datePublished')), + 'formats': formats, + 'subtitles': subtitles, } From 83e49259bfd4e0b54a4b53c30742109555087e3a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:45:42 +0100 Subject: 
[PATCH 046/154] [internetvideoarchive] fix extraction --- youtube_dl/extractor/internetvideoarchive.py | 92 ++++++-------------- 1 file changed, 28 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 76cc5ec3e..59b0a90c3 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,15 +1,13 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urlparse, ) -from ..utils import ( - determine_ext, - int_or_none, - xpath_text, -) class InternetVideoArchiveIE(InfoExtractor): @@ -20,7 +18,7 @@ class InternetVideoArchiveIE(InfoExtractor): 'info_dict': { 'id': '194487', 'ext': 'mp4', - 'title': 'KICK-ASS 2', + 'title': 'Kick-Ass 2', 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', }, 'params': { @@ -33,68 +31,34 @@ class InternetVideoArchiveIE(InfoExtractor): def _build_json_url(query): return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query - @staticmethod - def _build_xml_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query - def _real_extract(self, url): - query = compat_urlparse.urlparse(url).query - query_dic = compat_parse_qs(query) - video_id = query_dic['publishedid'][0] - - if '/player/' in url: - configuration = self._download_json(url, video_id) - - # There are multiple videos in the playlist whlie only the first one - # matches the video played in browsers - video_info = configuration['playlist'][0] - title = video_info['title'] - - formats = [] - for source in video_info['sources']: - file_url = source['file'] - if determine_ext(file_url) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) - file_url = m3u8_formats[0]['url'] - formats.extend(self._extract_f4m_formats( - file_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - file_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - else: - a_format = { - 'url': file_url, - } - - if source.get('label') and source['label'][-4:] == ' kbs': - tbr = int_or_none(source['label'][:-4]) - a_format.update({ - 'tbr': tbr, - 'format_id': 'http-%d' % tbr, - }) - formats.append(a_format) - - self._sort_formats(formats) - - description = video_info.get('description') - thumbnail = video_info.get('image') - else: - configuration = self._download_xml(url, video_id) - formats = [{ - 'url': xpath_text(configuration, './file', 'file URL', fatal=True), - }] - thumbnail = xpath_text(configuration, './image', 'thumbnail') - title = 'InternetVideoArchive video %s' % video_id - description = None + query = compat_parse_qs(compat_urlparse.urlparse(url).query) + video_id = query['publishedid'][0] + data = self._download_json( + 'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx', + video_id, data=json.dumps({ + 'customerid': query['customerid'][0], + 'publishedid': video_id, + }).encode()) + title = 
data['Title'] + formats = self._extract_m3u8_formats( + data['VideoUrl'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + file_url = formats[0]['url'] + if '.ism/' in file_url: + replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url) + formats.extend(self._extract_f4m_formats( + replace_url('.f4m'), video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) + self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, + 'thumbnail': data.get('PosterUrl'), + 'description': data.get('Description'), } From 0086726e8674e9edec0682e7a84275c3c25ce646 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:48:34 +0100 Subject: [PATCH 047/154] [videodetective] fix extraction --- youtube_dl/extractor/videodetective.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index a19411a05..fe70db713 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse from .internetvideoarchive import InternetVideoArchiveIE @@ -13,7 +12,7 @@ class VideoDetectiveIE(InfoExtractor): 'info_dict': { 'id': '194487', 'ext': 'mp4', - 'title': 'KICK-ASS 2', + 'title': 'Kick-Ass 2', 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', }, 'params': { @@ -24,7 +23,7 @@ class VideoDetectiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - og_video = self._og_search_video_url(webpage) - query = compat_urlparse.urlparse(og_video).query - 
return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) + query = 'customerid=69249&publishedid=' + video_id + return self.url_result( + InternetVideoArchiveIE._build_json_url(query), + ie=InternetVideoArchiveIE.ie_key()) From cfabc505984acb3830aeac7759d913bb885d64b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 22:55:01 +0100 Subject: [PATCH 048/154] [mtv] fix extraction for mtv.de (closes #22113) --- youtube_dl/extractor/mtv.py | 51 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 7a3b57abd..7e95ca18e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -425,14 +425,14 @@ class MTVVideoIE(MTVServicesInfoExtractor): class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' - _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P\d+)-[^/#?]+/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P[0-9a-z]+)' _TESTS = [{ - 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum', 'info_dict': { - 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'flv', - 'title': 'MusicVideo_cro-traum', - 'description': 'Cro - Traum', + 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5', + 'ext': 'mp4', + 'title': 'Traum', + 'description': 'Traum', }, 'params': { # rtmp download @@ -441,11 +441,12 @@ class MTVDEIE(MTVServicesInfoExtractor): 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. 
http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) - 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', + 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1', 'info_dict': { - 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'flv', - 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', + 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Teen Mom 2', + 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7', }, 'params': { # rtmp download @@ -453,7 +454,7 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Blocked at Travis CI', }, { - 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', + 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', 'ext': 'mp4', @@ -466,25 +467,11 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] + _GEO_COUNTRIES = ['DE'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist = self._parse_json( - self._search_regex( - r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), - video_id) - - def _mrss_url(item): - return item['mrss'] + item.get('mrssvars', '') - - # news pages contain single video in playlist with different id - if len(playlist) == 1: - return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) - - for item in playlist: - item_id = item.get('id') - if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(_mrss_url(item), video_id) + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.de', + 'mgid': uri, + } From 3cdcebf5470a56df7d52e6f8acbcde5b4b9f0241 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 
23:31:14 +0100 Subject: [PATCH 049/154] [mtv] add support for mtvjapan.com --- youtube_dl/extractor/mtv.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 7e95ca18e..fedd5f46b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -349,33 +350,29 @@ class MTVIE(MTVServicesInfoExtractor): }] -class MTV81IE(InfoExtractor): - IE_NAME = 'mtv81' - _VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P[^/?#.]+)' +class MTVJapanIE(MTVServicesInfoExtractor): + IE_NAME = 'mtvjapan' + _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P[0-9a-z]+)' _TEST = { - 'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/', - 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade', 'info_dict': { - 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5', 'ext': 'mp4', - 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', - 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', - 'timestamp': 1468846800, - 'upload_date': '20160718', + 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition', + }, + 'params': { + 'skip_download': True, }, } + _GEO_COUNTRIES = ['JP'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - def _extract_mgid(self, webpage): - return self._search_regex( - r'getTheVideo\((["\'])(?Pmgid:.+?)\1', webpage, - 'mgid', group='id') - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtvjapan.com', + 'mgid': uri, + } 
class MTVVideoIE(MTVServicesInfoExtractor): From 01358b9fc198cafb619a03ed5ad7865a74805611 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 23:34:31 +0100 Subject: [PATCH 050/154] [extractors] add import for MTVJapanIE --- youtube_dl/extractor/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 114ede8b9..c10bcbcc1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -666,7 +666,7 @@ from .mtv import ( MTVVideoIE, MTVServicesEmbeddedIE, MTVDEIE, - MTV81IE, + MTVJapanIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE From dd90a21c28cb1ec592e5961a5f67556edfb3ce87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:49:36 +0700 Subject: [PATCH 051/154] [go] Add support for abc.com and freeform.com (closes #22823, closes #22864) --- youtube_dl/extractor/go.py | 44 ++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 03e48f4ea..107059023 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -40,8 +40,8 @@ class GoIE(AdobePassIE): 'resource_id': 'Disney', } } - _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pdisneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ - % '|'.join(list(_SITE_INFO.keys()) + ['disneynow']) + _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pabc|freeform|disneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ + % '|'.join(list(_SITE_INFO.keys())) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { @@ -54,6 +54,7 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + 'skip': 'This content is no longer available.', }, { 'url': 'http://watchdisneyxd.go.com/doraemon', 'info_dict': { @@ -61,6 +62,34 @@ 
class GoIE(AdobePassIE): 'id': 'SH55574025', }, 'playlist_mincount': 51, + }, { + 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', + 'info_dict': { + 'id': 'VDKA3609139', + 'ext': 'mp4', + 'title': 'This Guilty Blood', + 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'info_dict': { + 'id': 'VDKA13435179', + 'ext': 'mp4', + 'title': 'The Bet', + 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -95,10 +124,13 @@ class GoIE(AdobePassIE): if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) video_id = self._search_regex( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', - default=video_id) + ( + # There may be inner quotes, e.g. 
data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From aef9f87ea4dcfe483c5b776f1c37310766ad818d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:52:15 +0700 Subject: [PATCH 052/154] [go] Improve and beautify _VALID_URL --- youtube_dl/extractor/go.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 107059023..03cfba91f 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -40,8 +40,17 @@ class GoIE(AdobePassIE): 'resource_id': 'Disney', } } - _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pabc|freeform|disneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ - % '|'.join(list(_SITE_INFO.keys())) + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?P%s)\.)?go| + (?Pabc|freeform|disneynow) + )\.com/ + (?: + (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| + (?:[^/]+/)*(?P[^/?\#]+) + ) + ''' % '|'.join(list(_SITE_INFO.keys())) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { From 0d7392e68b7ebb7215651da0784e859d7bdff826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:54:32 +0700 Subject: [PATCH 053/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/ChangeLog b/ChangeLog index 64233b03b..b664368a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,48 @@ +version + +Extractors ++ [go] Add support for abc.com and freeform.com (#22823, #22864) ++ [mtv] Add support for mtvjapan.com +* [mtv] 
Fix extraction for mtv.de (#22113) +* [videodetective] Fix extraction +* [internetvideoarchive] Fix extraction +* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) +- [hark] Remove extractor +- [tutv] Remove extractor +- [learnr] Remove extractor +- [macgamestore] Remove extractor +* [la7] Update Kaltura service URL (#22358) +* [thesun] Fix extraction (#16966) +- [makertv] Remove extractor ++ [tenplay] Add support for 10play.com.au (#21446) +* [soundcloud] Improve extraction + * Improve format extraction (#22123) + + Extract uploader_id and uploader_url (#21916) + + Extract all known thumbnails (#19071, #20659) + * Fix extraction for private playlists (#20976) + + Add support for playlist embeds (#20976) + * Skip preview formats (#22806) +* [dplay] Improve extraction + + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) + * Fix it.dplay.com extraction (#22826) + + Extract creator, tags and thumbnails + * Handle playback API call errors ++ [discoverynetworks] Add support for dplay.co.uk +* [vk] Improve extraction + + Add support for Odnoklassniki embeds + + Extract more videos from user lists (#4470) + + Fix wall post audio extraction (#18332) + * Improve error detection (#22568) ++ [odnoklassniki] Add support for embeds +* [puhutv] Improve extraction + * Fix subtitles extraction + * Transform HLS URLs to HTTP URLs + * Improve metadata extraction +* [ceskatelevize] Skip DRM media ++ [facebook] Extract subtitles (#22777) +* [globo] Handle alternative hash signing method + + version 2019.10.22 Core From 53896ca5be9a629c2cbaceb3fe43c707bb217437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:10:20 +0700 Subject: [PATCH 054/154] [utils] Actualize major IPv4 address blocks per country --- youtube_dl/utils.py | 71 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 53117ea90..aed988b88 100644 ---
a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4979,7 +4979,7 @@ class ISO3166Utils(object): class GeoUtils(object): # Major IPv4 address blocks per country _country_ip_map = { - 'AD': '85.94.160.0/19', + 'AD': '46.172.224.0/19', 'AE': '94.200.0.0/13', 'AF': '149.54.0.0/17', 'AG': '209.59.64.0/18', @@ -4987,28 +4987,30 @@ class GeoUtils(object): 'AL': '46.99.0.0/16', 'AM': '46.70.0.0/15', 'AO': '105.168.0.0/13', - 'AP': '159.117.192.0/21', + 'AP': '182.50.184.0/21', + 'AQ': '23.154.160.0/24', 'AR': '181.0.0.0/12', 'AS': '202.70.112.0/20', - 'AT': '84.112.0.0/13', + 'AT': '77.116.0.0/14', 'AU': '1.128.0.0/11', 'AW': '181.41.0.0/18', - 'AZ': '5.191.0.0/16', + 'AX': '185.217.4.0/22', + 'AZ': '5.197.0.0/16', 'BA': '31.176.128.0/17', 'BB': '65.48.128.0/17', 'BD': '114.130.0.0/16', 'BE': '57.0.0.0/8', - 'BF': '129.45.128.0/17', + 'BF': '102.178.0.0/15', 'BG': '95.42.0.0/15', 'BH': '37.131.0.0/17', 'BI': '154.117.192.0/18', 'BJ': '137.255.0.0/16', - 'BL': '192.131.134.0/24', + 'BL': '185.212.72.0/23', 'BM': '196.12.64.0/18', 'BN': '156.31.0.0/16', 'BO': '161.56.0.0/16', 'BQ': '161.0.80.0/20', - 'BR': '152.240.0.0/12', + 'BR': '191.128.0.0/12', 'BS': '24.51.64.0/18', 'BT': '119.2.96.0/19', 'BW': '168.167.0.0/16', @@ -5016,20 +5018,20 @@ class GeoUtils(object): 'BZ': '179.42.192.0/18', 'CA': '99.224.0.0/11', 'CD': '41.243.0.0/16', - 'CF': '196.32.200.0/21', - 'CG': '197.214.128.0/17', + 'CF': '197.242.176.0/21', + 'CG': '160.113.0.0/16', 'CH': '85.0.0.0/13', - 'CI': '154.232.0.0/14', + 'CI': '102.136.0.0/14', 'CK': '202.65.32.0/19', 'CL': '152.172.0.0/14', - 'CM': '165.210.0.0/15', + 'CM': '102.244.0.0/14', 'CN': '36.128.0.0/10', 'CO': '181.240.0.0/12', 'CR': '201.192.0.0/12', 'CU': '152.206.0.0/15', 'CV': '165.90.96.0/19', 'CW': '190.88.128.0/17', - 'CY': '46.198.0.0/15', + 'CY': '31.153.0.0/16', 'CZ': '88.100.0.0/14', 'DE': '53.0.0.0/8', 'DJ': '197.241.0.0/17', @@ -5046,6 +5048,7 @@ class GeoUtils(object): 'EU': '2.16.0.0/13', 'FI': '91.152.0.0/13', 'FJ': 
'144.120.0.0/16', + 'FK': '80.73.208.0/21', 'FM': '119.252.112.0/20', 'FO': '88.85.32.0/19', 'FR': '90.0.0.0/9', @@ -5055,8 +5058,8 @@ class GeoUtils(object): 'GE': '31.146.0.0/16', 'GF': '161.22.64.0/18', 'GG': '62.68.160.0/19', - 'GH': '45.208.0.0/14', - 'GI': '85.115.128.0/19', + 'GH': '154.160.0.0/12', + 'GI': '95.164.0.0/16', 'GL': '88.83.0.0/19', 'GM': '160.182.0.0/15', 'GN': '197.149.192.0/18', @@ -5085,13 +5088,13 @@ class GeoUtils(object): 'JE': '87.244.64.0/18', 'JM': '72.27.0.0/17', 'JO': '176.29.0.0/16', - 'JP': '126.0.0.0/8', + 'JP': '133.0.0.0/8', 'KE': '105.48.0.0/12', 'KG': '158.181.128.0/17', 'KH': '36.37.128.0/17', 'KI': '103.25.140.0/22', 'KM': '197.255.224.0/20', - 'KN': '198.32.32.0/19', + 'KN': '198.167.192.0/19', 'KP': '175.45.176.0/22', 'KR': '175.192.0.0/10', 'KW': '37.36.0.0/14', @@ -5099,10 +5102,10 @@ class GeoUtils(object): 'KZ': '2.72.0.0/13', 'LA': '115.84.64.0/18', 'LB': '178.135.0.0/16', - 'LC': '192.147.231.0/24', + 'LC': '24.92.144.0/20', 'LI': '82.117.0.0/19', 'LK': '112.134.0.0/15', - 'LR': '41.86.0.0/19', + 'LR': '102.183.0.0/16', 'LS': '129.232.0.0/17', 'LT': '78.56.0.0/13', 'LU': '188.42.0.0/16', @@ -5127,7 +5130,7 @@ class GeoUtils(object): 'MT': '46.11.0.0/16', 'MU': '105.16.0.0/12', 'MV': '27.114.128.0/18', - 'MW': '105.234.0.0/16', + 'MW': '102.70.0.0/15', 'MX': '187.192.0.0/11', 'MY': '175.136.0.0/13', 'MZ': '197.218.0.0/15', @@ -5158,23 +5161,23 @@ class GeoUtils(object): 'PW': '202.124.224.0/20', 'PY': '181.120.0.0/14', 'QA': '37.210.0.0/15', - 'RE': '139.26.0.0/16', + 'RE': '102.35.0.0/16', 'RO': '79.112.0.0/13', - 'RS': '178.220.0.0/14', + 'RS': '93.86.0.0/15', 'RU': '5.136.0.0/13', - 'RW': '105.178.0.0/15', + 'RW': '41.186.0.0/16', 'SA': '188.48.0.0/13', 'SB': '202.1.160.0/19', 'SC': '154.192.0.0/11', - 'SD': '154.96.0.0/13', + 'SD': '102.120.0.0/13', 'SE': '78.64.0.0/12', - 'SG': '152.56.0.0/14', + 'SG': '8.128.0.0/10', 'SI': '188.196.0.0/14', 'SK': '78.98.0.0/15', - 'SL': '197.215.0.0/17', + 'SL': 
'102.143.0.0/17', 'SM': '89.186.32.0/19', 'SN': '41.82.0.0/15', - 'SO': '197.220.64.0/19', + 'SO': '154.115.192.0/18', 'SR': '186.179.128.0/17', 'SS': '105.235.208.0/21', 'ST': '197.159.160.0/19', @@ -5197,15 +5200,15 @@ class GeoUtils(object): 'TV': '202.2.96.0/19', 'TW': '120.96.0.0/11', 'TZ': '156.156.0.0/14', - 'UA': '93.72.0.0/13', - 'UG': '154.224.0.0/13', - 'US': '3.0.0.0/8', + 'UA': '37.52.0.0/14', + 'UG': '102.80.0.0/13', + 'US': '6.0.0.0/8', 'UY': '167.56.0.0/13', - 'UZ': '82.215.64.0/18', + 'UZ': '84.54.64.0/18', 'VA': '212.77.0.0/19', - 'VC': '24.92.144.0/20', + 'VC': '207.191.240.0/21', 'VE': '186.88.0.0/13', - 'VG': '172.103.64.0/18', + 'VG': '66.81.192.0/20', 'VI': '146.226.0.0/16', 'VN': '14.160.0.0/11', 'VU': '202.80.32.0/20', @@ -5214,8 +5217,8 @@ class GeoUtils(object): 'YE': '134.35.0.0/16', 'YT': '41.242.116.0/22', 'ZA': '41.0.0.0/11', - 'ZM': '165.56.0.0/13', - 'ZW': '41.85.192.0/19', + 'ZM': '102.144.0.0/13', + 'ZW': '102.177.192.0/18', } @classmethod From cae0bbc53831eed38c4af3755de43e223c503270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:11:09 +0700 Subject: [PATCH 055/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index b664368a1..2957b7ced 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ version +Core +* [utils] Actualize major IPv4 address blocks per country + Extractors + [go] Add support for abc.com and freeform.com (#22823, #22864) + [mtv] Add support for mtvjapan.com From c4bd9cb7bb57c6e4bbc04fb054dfea14d4ecb171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:12:33 +0700 Subject: [PATCH 056/154] release 2019.10.29 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- 
.github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 10 +++------- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f1afe704c..f82502bd1 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.22 + [debug] youtube-dl version 2019.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a4dc9b005..5ef983d43 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked 
that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 5bf86adce..8f05aa79f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7aa5534e5..e90900d8d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.22 + [debug] youtube-dl version 2019.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md 
b/.github/ISSUE_TEMPLATE/5_feature_request.md index 5d3645e3d..7021d7397 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 2957b7ced..fcab1102c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.10.29 Core * [utils] Actualize major IPv4 address blocks per country diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a1b0edeeb..af905db5a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -232,7 +232,6 @@ - **DouyuShow** - **DouyuTV**: 斗鱼 - **DPlay** - - **DPlayIt** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -339,7 +338,6 @@ - **Goshgay** - **GPUTechConf** - **Groupon** - - **Hark** - **hbo** - **HearThisAt** - **Heise** @@ -432,7 +430,6 @@ - **Lcp** - **LcpPlay** - **Le**: 乐视网 - - **Learnr** - **Lecture2Go** - **Lecturio** - **LecturioCourse** @@ -466,11 +463,9 @@ - **lynda**: lynda.com videos - **lynda:course**: lynda.com online courses - **m6** - - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakerTV** - **MallTV** - **mangomolo:live** - **mangomolo:video** @@ -526,8 +521,8 @@ - **mtg**: MTG services - **mtv** - **mtv.de** - - **mtv81** - **mtv:video** + - **mtvjapan** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -815,6 +810,7 @@ - **soundcloud:set** - **soundcloud:trackstation** - **soundcloud:user** + - **SoundcloudEmbed** - **soundgasm** - **soundgasm:profile** - **southpark.cc.com** @@ -887,6 +883,7 @@ - **TeleTask** - **Telewebion** - **TennisTV** + - **TenPlay** - 
**TF1** - **TFO** - **TheIntercept** @@ -925,7 +922,6 @@ - **tunein:topic** - **TunePk** - **Turbo** - - **Tutv** - **tv.dfb.de** - **TV2** - **tv2.hu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 39b355b9e..924f26ca8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.22' +__version__ = '2019.10.29' From 7455832f311843663b416968b9e5a0a0c6134d8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Oct 2019 09:43:17 +0100 Subject: [PATCH 057/154] [fox9] fix extraction --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/fox9.py | 43 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c10bcbcc1..15f96fb8f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -367,7 +367,10 @@ from .fourtube import ( FuxIE, ) from .fox import FOXIE -from .fox9 import FOX9IE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py index 17dfffa7b..91f8f7b8a 100644 --- a/youtube_dl/extractor/fox9.py +++ b/youtube_dl/extractor/fox9.py @@ -1,13 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -from .anvato import AnvatoIE +from .common import InfoExtractor -class FOX9IE(AnvatoIE): - _VALID_URL = r'https?://(?:www\.)?fox9\.com/(?:[^/]+/)+(?P<id>\d+)-story' - _TESTS = [{ - 'url': 'http://www.fox9.com/news/215123287-story', +class FOX9IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fox9\.com/video/(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'anvato:anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b:' + video_id, + 'Anvato', video_id) + + +class FOX9NewsIE(InfoExtractor): + _VALID_URL =
r'https?://(?:www\.)?fox9\.com/news/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://www.fox9.com/news/black-bear-in-tree-draws-crowd-in-downtown-duluth-minnesota', 'md5': 'd6e1b2572c3bab8a849c9103615dd243', 'info_dict': { 'id': '314473', @@ -21,22 +31,11 @@ class FOX9IE(AnvatoIE): 'categories': ['News', 'Sports'], 'tags': ['news', 'video'], }, - }, { - 'url': 'http://www.fox9.com/news/investigators/214070684-story', - 'only_matching': True, - }] + } def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_id = self._parse_json( - self._search_regex( - r"this\.videosJson\s*=\s*'(\[.+?\])';", - webpage, 'anvato playlist'), - video_id)[0]['video'] - - return self._get_anvato_videos( - 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b', - video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + anvato_id = self._search_regex( + r'anvatoId\s*:\s*[\'"](\d+)', webpage, 'anvato id') + return self.url_result('https://www.fox9.com/video/' + anvato_id, 'FOX9') From 8989349e6dcaa98204f77fb9f1e15a86eecb823d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Oct 2019 09:44:07 +0100 Subject: [PATCH 058/154] [onet] improve extraction - add support for onet100.vod.pl domain - extract m3u8 formats - correct audio only format info --- youtube_dl/extractor/onet.py | 54 ++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 58da1bc27..e55b2ac89 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -20,6 +20,8 @@ from ..utils import ( class OnetBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/' + def _search_mvp_id(self, webpage): return self._search_regex( r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') @@ -45,7 +47,7 @@ class OnetBaseIE(InfoExtractor): video =
response['result'].get('0') formats = [] - for _, formats_dict in video['formats'].items(): + for format_type, formats_dict in video['formats'].items(): if not isinstance(formats_dict, dict): continue for format_id, format_list in formats_dict.items(): @@ -56,21 +58,31 @@ class OnetBaseIE(InfoExtractor): if not video_url: continue ext = determine_ext(video_url) - if format_id == 'ism': + if format_id.startswith('ism'): formats.extend(self._extract_ism_formats( video_url, video_id, 'mss', fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) + elif format_id.startswith('hls'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - formats.append({ + http_f = { 'url': video_url, 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) + } + if format_type == 'audio': + http_f['vcodec'] = 'none' + else: + http_f.update({ + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + formats.append(http_f) self._sort_formats(formats) meta = video.get('meta', {}) @@ -105,12 +117,12 @@ class OnetMVPIE(OnetBaseIE): class OnetIE(OnetBaseIE): - _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' - _TEST = { + _TESTS = [{ 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', - 'md5': 'e3ffbf47590032ac3f27249204173d50', + 'md5': '436102770fb095c75b8bb0392d3da9ff', 'info_dict': { 'id': 'qbpyqc', 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', @@ -120,7 
+132,10 @@ class OnetIE(OnetBaseIE): 'upload_date': '20160705', 'timestamp': 1467721580, }, - } + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -140,18 +155,21 @@ class OnetIE(OnetBaseIE): class OnetChannelIE(OnetBaseIE): - _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P[a-z]+)(?:[?#]|$)' + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P[a-z]+)(?:[?#]|$)' IE_NAME = 'onet.tv:channel' - _TEST = { + _TESTS = [{ 'url': 'http://onet.tv/k/openerfestival', 'info_dict': { 'id': 'openerfestival', - 'title': 'Open\'er Festival Live', - 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.', + 'title': "Open'er Festival", + 'description': "Tak było na Open'er Festival 2016! Oglądaj nasze reportaże i wywiady z artystami.", }, - 'playlist_mincount': 46, - } + 'playlist_mincount': 35, + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival', + 'only_matching': True, + }] def _real_extract(self, url): channel_id = self._match_id(url) @@ -173,7 +191,7 @@ class OnetChannelIE(OnetBaseIE): 'Downloading channel %s - add --no-playlist to just download video %s' % ( channel_id, video_name)) matches = re.findall( - r']+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', + r']+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, webpage) entries = [ self.url_result(video_link, OnetIE.ie_key()) From c56b2ac43ca27b32fb4f7b230d851a61b5fc7cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Oct 2019 02:21:03 +0700 Subject: [PATCH 059/154] [tv2dk] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tv2dk.py | 82 ++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 youtube_dl/extractor/tv2dk.py diff --git a/youtube_dl/extractor/extractors.py 
b/youtube_dl/extractor/extractors.py index 15f96fb8f..5d20ba863 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1189,6 +1189,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv2dk import TV2DKIE from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py new file mode 100644 index 000000000..eb39424df --- /dev/null +++ b/youtube_dl/extractor/tv2dk.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class TV2DKIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + tvsyd| + tv2ostjylland| + tvmidtvest| + tv2fyn| + tv2east| + tv2lorry| + tv2nord + )\.dk/ + (:[^/]+/)* + (?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player', + 'info_dict': { + 'id': '0_52jmwa0p', + 'ext': 'mp4', + 'title': '19:30 - 28. okt.
2019', + 'timestamp': 1572290248, + 'upload_date': '20191028', + 'uploader_id': 'tvsyd', + 'duration': 1347, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi', + 'only_matching': True, + }, { + 'url': 'https://www.tv2ostjylland.dk/nyheder/28-10-2019/22/2200-nyhederne-mandag-d-28-oktober-2019?autoplay=1#player', + 'only_matching': True, + }, { + 'url': 'https://www.tvmidtvest.dk/nyheder/27-10-2019/1930/1930-27-okt-2019', + 'only_matching': True, + }, { + 'url': 'https://www.tv2fyn.dk/artikel/fyn-kan-faa-landets-foerste-fabrik-til-groent-jetbraendstof', + 'only_matching': True, + }, { + 'url': 'https://www.tv2east.dk/artikel/gods-faar-indleveret-tonsvis-af-aebler-100-kilo-aebler-gaar-til-en-aeblebrandy', + 'only_matching': True, + }, { + 'url': 'https://www.tv2lorry.dk/koebenhavn/rasmus-paludan-evakueret-til-egen-demonstration#player', + 'only_matching': True, + }, { + 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + entries = [] + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): + video = extract_attributes(video_el) + kaltura_id = video.get('data-entryid') + if not kaltura_id: + continue + partner_id = video.get('data-partnerid') + if not partner_id: + continue + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + return self.playlist_result(entries) From 9a621ddc3a42769f107f8bd0d67b2c7073ea8256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Oct 2019 02:21:52 +0700 Subject: [PATCH 060/154] [tv2] Fix and improve extraction (closes #22787) --- youtube_dl/extractor/tv2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index d5071e8a5..1b6590767 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -11,6 +11,7 @@ from ..utils import ( js_to_json, parse_iso8601, remove_end, + try_get, ) @@ -44,7 +45,14 @@ class TV2IE(InfoExtractor): data = self._download_json( 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), video_id, 'Downloading play JSON')['playback'] - for item in data['items']['item']: + items = try_get(data, lambda x: x['items']['item']) + if not items: + continue + if not isinstance(items, list): + items = [items] + for item in items: + if not isinstance(item, dict): + continue video_url = item.get('url') if not video_url or video_url in format_urls: continue From 45f4a433894556301204b704caca7d6a14286287 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Oct 2019 23:07:35 +0100 Subject: [PATCH 061/154] [yahoo] improve extraction - add support for live streams(closes #3597)(closes #3779)(closes #22178) - bypass cookie consent page for european domains(closes #16948)(closes #22576) - add generic support for embeds(closes #20332) --- youtube_dl/extractor/yahoo.py | 672 +++++++++++++--------------------- 1 file changed, 264 insertions(+), 408 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e5ebdd180..ee68096d0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -3,453 +3,309 @@ from __future__ import unicode_literals import hashlib import itertools -import json import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_str, compat_urllib_parse, - compat_urlparse, ) from ..utils import ( clean_html, - determine_ext, - ExtractorError, - extract_attributes, int_or_none, mimetype2ext, + parse_iso8601, smuggle_url, try_get, - unescapeHTML, url_or_none, ) -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) 
-from .nbc import NBCSportsVPlayerIE +from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P.+)?-)?(?P[0-9]+)(?:-[a-z]+)?(?:\.html)?' - _TESTS = [ - { - 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'info_dict': { - 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', - 'ext': 'mp4', - 'title': 'Julian Smith & Travis Legg Watch Julian Smith', - 'description': 'Julian and Travis watch Julian Smith', - 'duration': 6863, - }, + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)\.html)' + _TESTS = [{ + 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', + 'title': 'Julian Smith & Travis Legg Watch Julian Smith', + 'description': 'Julian and Travis watch Julian Smith', + 'duration': 6863, + 'timestamp': 1369812016, + 'upload_date': '20130529', }, - { - 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': '251af144a19ebc4a033e8ba91ac726bb', - 'info_dict': { - 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', - 'ext': 'mp4', - 'title': 'Codefellas - The Cougar Lies with Spanish Moss', - 'description': 'md5:66b627ab0a282b26352136ca96ce73c1', - 'duration': 151, - }, - 'skip': 'HTTP Error 404', + }, { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '7993e572fac98e044588d0b5260f4352', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + 'duration': 170, + 'timestamp': 1406838636, + 'upload_date': '20140731', }, - { - 'url': 
'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '7993e572fac98e044588d0b5260f4352', - 'info_dict': { - 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', - 'ext': 'mp4', - 'title': "Yahoo Saves 'Community'", - 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', - 'duration': 170, - } - }, - { - 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', - 'info_dict': { - 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', - 'ext': 'mp4', - 'title': '敢問市長/黃秀霜批賴清德「非常高傲」', - 'description': '直言台南沒捷運 交通居五都之末', - 'duration': 396, - }, - }, - { - 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '71298482f7c64cbb7fa064e4553ff1c1', - 'info_dict': { - 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'webm', - 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', - 'description': 'md5:f66c890e1490f4910a9953c941dee944', - 'duration': 97, - } - }, - { - 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html', - 'md5': '57e06440778b1828a6079d2f744212c4', - 'info_dict': { - 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73', - 'ext': 'mp4', - 'title': 'Program that makes hockey more affordable not offered in Manitoba', - 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', - 'duration': 121, - }, - 'skip': 'Video gone', - }, { - 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', - 'info_dict': { - 'id': '154609075', - }, - 'playlist': [{ - 'md5': '000887d0dc609bc3a47c974151a40fb8', - 'info_dict': { - 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: War', - 'description': 'The Interview', - 'duration': 30, - }, - }, { - 'md5': '81bc74faf10750fe36e4542f9a184c66', - 
'info_dict': { - 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: Guys', - 'description': 'The Interview', - 'duration': 30, - }, - }], - }, { - 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '88e209b417f173d86186bef6e4d1f160', - 'info_dict': { - 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', - 'ext': 'mp4', - 'title': 'China Moses Is Crazy About the Blues', - 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', - 'duration': 128, - } - }, { - 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html', - 'md5': 'd9a083ccf1379127bf25699d67e4791b', - 'info_dict': { - 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c', - 'ext': 'mp4', - 'title': 'Connect the Dots: Dark Side of Virgo', - 'description': 'md5:1428185051cfd1949807ad4ff6d3686a', - 'duration': 201, - }, - 'skip': 'Domain name in.lifestyle.yahoo.com gone', - }, { - 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '989396ae73d20c6f057746fb226aa215', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': '\'True Story\' Trailer', - 'description': 'True Story', - 'duration': 150, - }, - }, { - 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', - 'only_matching': True, - }, { - 'note': 'NBC Sports embeds', - 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', - 'info_dict': { - 'id': '9CsDKds0kvHI', - 'ext': 'flv', - 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', - 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', - 'upload_date': '20150313', - 'uploader': 'NBCU-SPORTS', - 'timestamp': 1426270238, - } - }, { - 'url': 'https://tw.news.yahoo.com/-100120367.html', - 'only_matching': True, - }, { - # Query result is embedded in webpage, but 
explicit request to video API fails with geo restriction - 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', - 'info_dict': { - 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', - 'ext': 'mp4', - 'title': 'Communitary - Community Episode 1: Ladders', - 'description': 'md5:8fc39608213295748e1e289807838c97', - 'duration': 1646, - }, - }, { - # it uses an alias to get the video_id - 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html', - 'info_dict': { - 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737', - 'ext': 'mp4', - 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking', - 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', - }, - }, - { - # config['models']['applet_model']['data']['sapi'] has no query - 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016', - 'md5': 'dac0c72d502bc5facda80c9e6d5c98db', - 'info_dict': { - 'id': 'a6015640-e9e5-3efb-bb60-05589a183919', - 'ext': 'mp4', - 'description': 'Galactic', - 'title': 'Dolla Diva (feat. 
Maggie Koerner)', - }, - 'skip': 'redirect to https://www.yahoo.com/music', - }, - { - # yahoo://article/ - 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': "'True Story' Trailer", - 'description': 'True Story', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # ytwnews://cavideo/ - 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', - 'info_dict': { - 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', - 'ext': 'mp4', - 'title': '單車天使 - 中文版預', - 'description': '中文版預', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/', - 'info_dict': { - 'id': '5575377707001', - 'ext': 'mp4', - 'title': "Clown entertainers say 'It' is hurting their business", - 'description': 'Stephen King s horror film has much to answer for. 
Jelby and Mr Loopy the Clowns join us.', - 'timestamp': 1505341164, - 'upload_date': '20170913', - 'uploader_id': '2376984109001', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove, geo-restricted to Australia, bypassable - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/', - 'only_matching': True, + }, { + 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', + 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'info_dict': { + 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', + 'ext': 'mp4', + 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', + 'description': 'md5:f66c890e1490f4910a9953c941dee944', + 'duration': 97, + 'timestamp': 1414489862, + 'upload_date': '20141028', } - ] + }, { + 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + 'md5': '88e209b417f173d86186bef6e4d1f160', + 'info_dict': { + 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', + 'ext': 'mp4', + 'title': 'China Moses Is Crazy About the Blues', + 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', + 'duration': 128, + 'timestamp': 1385722202, + 'upload_date': '20131129', + } + }, { + 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': '\'True Story\' Trailer', + 'description': 'True Story', + 'duration': 150, + 'timestamp': 1418919206, + 'upload_date': '20141218', + }, + }, { + 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', + 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 
'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, + }, + }, { + 'url': 'https://tw.news.yahoo.com/-100120367.html', + 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + 'timestamp': 1440436550, + 'upload_date': '20150824', + 'series': 'Communitary', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + 'timestamp': 1476696196, + 'upload_date': '20161017', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Contains both a Yahoo hosted video and multiple Youtube embeds + 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', + 'info_dict': { + 'id': '46c5d95a-528f-3d03-b732-732fcadd51de', + 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', + 'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', + }, + 'playlist': [{ + 'info_dict': { + 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', + 'ext': 'mp4', + 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', + 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor 
Swift which Sia hit was almost hers.', + 'timestamp': 1572406500, + 'upload_date': '20191030', + }, + }, { + 'info_dict': { + 'id': '352CFDOQrKg', + 'ext': 'mp4', + 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', + 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11', + 'uploader': 'The Voice', + 'uploader_id': 'NBCTheVoice', + 'upload_date': '20191029', + }, + }], + 'params': { + 'playlistend': 2, + }, + }, { + 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', + 'only_matching': True, + }, { + 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - display_id = mobj.group('display_id') or page_id - host = mobj.group('host') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'err=404' in urlh.geturl(): - raise ExtractorError('Video gone', expected=True) - - # Look for iframed media first - entries = [] - iframe_urls = re.findall(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) - for idx, iframe_url in enumerate(iframe_urls): - entries.append(self.url_result(host + iframe_url, 'Yahoo')) - if entries: - return self.playlist_result(entries, page_id) - - # Look for NBCSports iframes - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) - - # Look for Brightcove Legacy Studio embeds - bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if bc_url: - return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - - def brightcove_url_result(bc_url): - return self.url_result( - smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}), - BrightcoveNewIE.ie_key()) - - # Look for Brightcove New Studio embeds - bc_url = BrightcoveNewIE._extract_url(self, webpage) - if 
bc_url: - return brightcove_url_result(bc_url) - - brightcove_iframe = self._search_regex( - r'(]+data-video-id=["\']\d+[^>]+>)', webpage, - 'brightcove iframe', default=None) - if brightcove_iframe: - attr = extract_attributes(brightcove_iframe) - src = attr.get('src') - if src: - parsed_src = compat_urlparse.urlparse(src) - qs = compat_urlparse.parse_qs(parsed_src.query) - account_id = qs.get('accountId', ['2376984109001'])[0] - brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0] - if account_id and brightcove_id: - return brightcove_url_result( - 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - % (account_id, brightcove_id)) - - # Query result is often embedded in webpage as JSON. Sometimes explicit requests - # to video API results in a failure with geo restriction reason therefore using - # embedded query result when present sounds reasonable. - config_json = self._search_regex( - r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:|$)', - webpage, 'videoplayer applet', default=None) - if config_json: - config = self._parse_json(config_json, display_id, fatal=False) - if config: - sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') - if sapi and 'query' in sapi: - info = self._extract_info(display_id, sapi, webpage) - self._sort_formats(info['formats']) - return info - - items_json = self._search_regex( - r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, - default=None) - if items_json is None: - alias = self._search_regex( - r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None) - if alias is not None: - alias_info = self._download_json( - 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias, - display_id, 'Downloading alias info') - video_id = alias_info[0]['id'] - else: - CONTENT_ID_REGEXES = [ - r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', - 
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', - r'"first_videoid"\s*:\s*"([^"]+)"', - r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), - r']data-uuid=["\']([^"\']+)', - r']+yahoo://article/view\?.*\buuid=([^&"\']+)', - r']+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']', - ] - video_id = self._search_regex( - CONTENT_ID_REGEXES, webpage, 'content ID') + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' else: - items = json.loads(items_json) - info = items['mediaItems']['query']['results']['mediaObj'][0] - # The 'meta' field is not always in the video webpage, we request it - # from another page - video_id = info['id'] - return self._get_info(video_id, display_id, webpage) + country = country.split('-')[0] + api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - def _extract_info(self, display_id, query, webpage): - info = query['query']['results']['mediaObj'][0] - meta = info.get('meta') - video_id = info.get('id') + for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): + content = self._download_json( + api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, + display_id, 'Downloading content JSON metadata', fatal=i == 1) + if content: + item = content['items'][0] + break - if not meta: - msg = info['status'].get('msg') - if msg: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unable to extract media object meta') + if item.get('type') != 'video': + entries = [] + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in item.get('body', []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, 
item.get('uuid'), + item.get('title'), item.get('summary')) + + video_id = item['uuid'] + video = self._download_json( + api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + video_id, 'Downloading video JSON metadata')[0] + title = video['title'] + + if country == 'malaysia': + country = 'my' + + is_live = video.get('live_state') == 'live' + fmts = ('m3u8',) if is_live else ('web', 'mp4') + + urls = [] formats = [] - for s in info['streams']: - tbr = int_or_none(s.get('bitrate')) - format_info = { - 'width': int_or_none(s.get('width')), - 'height': int_or_none(s.get('height')), - 'tbr': tbr, - } - - host = s['host'] - path = s['path'] - if host.startswith('rtmp'): - fmt = 'rtmp' - format_info.update({ - 'url': host, - 'play_path': path, - 'ext': 'flv', - }) - else: - if s.get('format') == 'm3u8_playlist': - fmt = 'hls' - format_info.update({ - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - else: - fmt = format_info['ext'] = determine_ext(path) - format_url = compat_urlparse.urljoin(host, path) - format_info['url'] = format_url - format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') - formats.append(format_info) - - closed_captions = self._html_search_regex( - r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', - default='[]') - - cc_json = self._parse_json(closed_captions, video_id, fatal=False) subtitles = {} - if cc_json: - for closed_caption in cc_json: - lang = closed_caption['lang'] - if lang not in subtitles: - subtitles[lang] = [] - subtitles[lang].append({ - 'url': closed_caption['url'], - 'ext': mimetype2ext(closed_caption['content_type']), + for fmt in fmts: + media_obj = self._download_json( + 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + video_id, 'Downloading %s JSON metadata' % fmt, + headers=self.geo_verification_headers(), query={ + 'format': fmt, + 'region': country.upper(), + })['query']['results']['mediaObj'][0] + msg = media_obj.get('status', {}).get('msg') + + for s in 
media_obj.get('streams', []): + host = s.get('host') + path = s.get('path') + if not host or not path: + continue + s_url = host + path + if s.get('format') == 'm3u8': + formats.extend(self._extract_m3u8_formats( + s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + continue + tbr = int_or_none(s.get('bitrate')) + formats.append({ + 'url': s_url, + 'format_id': fmt + ('-%d' % tbr if tbr else ''), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'tbr': tbr, + 'fps': int_or_none(s.get('framerate')), }) + for cc in media_obj.get('closedcaptions', []): + cc_url = cc.get('url') + if not cc_url or cc_url in urls: + continue + urls.append(cc_url) + subtitles.setdefault(cc.get('lang') or 'en-US', []).append({ + 'url': cc_url, + 'ext': mimetype2ext(cc.get('content_type')), + }) + + streaming_url = video.get('streaming_url') + if streaming_url and not is_live: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats and msg == 'geo restricted': + self.raise_geo_restricted() + + self._sort_formats(formats) + + thumbnails = [] + for thumb in video.get('thumbnails', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'id': thumb.get('tag'), + 'url': thumb.get('url'), + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + series_info = video.get('series_info') or {} + return { 'id': video_id, - 'display_id': display_id, - 'title': unescapeHTML(meta['title']), + 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'description': clean_html(meta['description']), - 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), - 'duration': int_or_none(meta.get('duration')), + 'display_id': display_id, + 'thumbnails': thumbnails, + 'description': clean_html(video.get('description')), + 'timestamp': 
parse_iso8601(video.get('publish_time')), 'subtitles': subtitles, + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('view_count')), + 'is_live': is_live, + 'series': video.get('show_name'), + 'season_number': int_or_none(series_info.get('season_number')), + 'episode_number': int_or_none(series_info.get('episode_number')), } - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US').upper() - formats = [] - info = {} - for fmt in ('webm', 'mp4'): - query_result = self._download_json( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, - display_id, 'Downloading %s video info' % fmt, query={ - 'protocol': 'http', - 'region': region, - 'format': fmt, - }) - info = self._extract_info(display_id, query_result, webpage) - formats.extend(info['formats']) - formats.extend(self._extract_m3u8_formats( - 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - info['formats'] = formats - return info - class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From 8040a0d35e11f7b2bf6d698175ab0b12424d696f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Oct 2019 23:52:09 +0100 Subject: [PATCH 062/154] [yahoo] fix typo --- youtube_dl/extractor/yahoo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index ee68096d0..6c6bd76e8 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -51,10 +51,10 @@ class YahooIE(InfoExtractor): }, }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 
'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -164,6 +164,7 @@ class YahooIE(InfoExtractor): 'params': { 'playlistend': 2, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', 'only_matching': True, @@ -219,7 +220,7 @@ class YahooIE(InfoExtractor): country = 'my' is_live = video.get('live_state') == 'live' - fmts = ('m3u8',) if is_live else ('web', 'mp4') + fmts = ('m3u8',) if is_live else ('webm', 'mp4') urls = [] formats = [] From 237513e801671a51cc45d6a2fe5e7df69517958e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 07:38:53 +0100 Subject: [PATCH 063/154] [yahoo] restore support for cbs suffixed URLs --- test/test_all_urls.py | 6 ------ youtube_dl/extractor/yahoo.py | 5 ++++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 465ce0050..81056a999 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -123,12 +123,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs']) - def test_yahoo_https(self): - # https://github.com/ytdl-org/youtube-dl/issues/2701 - self.assertMatch( - 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', - ['Yahoo']) - def test_no_duplicated_ie_names(self): name_accu = collections.defaultdict(list) for ie in self.ies: diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 6c6bd76e8..f041cf5de 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -25,7 +25,7 @@ from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen 
and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)\.html)' + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)(?:-[a-z]+)?\.html)' _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { @@ -171,6 +171,9 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', 'only_matching': True, + }, { + 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', + 'only_matching': True, }] def _real_extract(self, url): From 3cf70bf1590ce364dc223197ba804cb70e704760 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 07:44:21 +0100 Subject: [PATCH 064/154] [yahoo] make cbs URL suffix part of the media alias --- youtube_dl/extractor/yahoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index f041cf5de..b9a9e88a0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -25,7 +25,7 @@ from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)(?:-[a-z]+)?\.html)' + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { From e993f1a0959fc04507b1cb2efeb610ae628d6d98 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 08:13:10 +0100 Subject: [PATCH 065/154] [mixcloud] fix cloudcast data extraction(closes #22821) --- 
youtube_dl/extractor/mixcloud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index bf5353ef9..e5f631506 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -86,9 +86,10 @@ class MixcloudIE(InfoExtractor): r'', webpage, 'play info'), 'play info') for item in full_info_json: - item_data = try_get( - item, lambda x: x['cloudcast']['data']['cloudcastLookup'], - dict) + item_data = try_get(item, [ + lambda x: x['cloudcast']['data']['cloudcastLookup'], + lambda x: x['cloudcastLookup']['data']['cloudcastLookup'], + ], dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break From 274bf5e4c58bceed4ff8c283d77457bf1cb76d3e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 11:37:41 +0100 Subject: [PATCH 066/154] [kakao] improve extraction - support embed URLs - support Kakao Legacy vid based embed URLs - only extract fields used for extraction - strip description and extract tags --- youtube_dl/extractor/kakao.py | 45 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 7fa140b0c..96f918b75 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -6,14 +6,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + strip_or_none, unified_timestamp, update_url_query, ) class KakaoIE(InfoExtractor): - _VALID_URL = r'https?://tv\.kakao\.com/channel/(?P\d+)/cliplink/(?P\d+)' - _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' + _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P\d+|[^?#&]+@my)' + _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/' _TESTS = [{ 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', @@ -36,7 +37,7 @@ class 
KakaoIE(InfoExtractor): 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', 'uploader_id': 2653210, - 'uploader': '쇼 음악중심', + 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } @@ -44,6 +45,8 @@ class KakaoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + display_id = video_id.rstrip('@my') + api_base = self._API_BASE_TMPL % video_id player_header = { 'Referer': update_url_query( @@ -55,20 +58,22 @@ class KakaoIE(InfoExtractor): }) } - QUERY_COMMON = { + query = { 'player': 'monet_html5', 'referer': url, 'uuid': '', 'service': 'kakao_tv', 'section': '', 'dteType': 'PC', + 'fields': ','.join([ + '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', + 'description', 'channelId', 'createTime', 'duration', 'playCount', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', + 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault']) } - query = QUERY_COMMON.copy() - query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' impress = self._download_json( - '%s/%s/impress' % (self._API_BASE, video_id), - video_id, 'Downloading video info', + api_base + 'impress', display_id, 'Downloading video info', query=query, headers=player_header) clip_link = impress['clipLink'] @@ -78,30 +83,27 @@ class KakaoIE(InfoExtractor): tid = impress.get('tid', '') - query = QUERY_COMMON.copy() query.update({ + 'fields': '-*,outputList,profile,width,height,label,filesize', 'tid': tid, 'profile': 'HIGH', }) raw = self._download_json( - '%s/%s/raw' % (self._API_BASE, video_id), - video_id, 'Downloading video formats info', + api_base + 'raw', display_id, 'Downloading video formats info', query=query, headers=player_header) formats = [] for fmt in raw.get('outputList', []): try: profile_name = fmt['profile'] + query.update({ + 'profile': profile_name, + 'fields': '-*,url', + }) fmt_url_json = 
self._download_json( - '%s/%s/raw/videolocation' % (self._API_BASE, video_id), - video_id, + api_base + 'raw/videolocation', display_id, 'Downloading video URL for profile %s' % profile_name, - query={ - 'service': 'kakao_tv', - 'section': '', - 'tid': tid, - 'profile': profile_name - }, headers=player_header, fatal=False) + query=query, headers=player_header, fatal=False) if fmt_url_json is None: continue @@ -134,9 +136,9 @@ class KakaoIE(InfoExtractor): }) return { - 'id': video_id, + 'id': display_id, 'title': title, - 'description': clip.get('description'), + 'description': strip_or_none(clip.get('description')), 'uploader': clip_link.get('channel', {}).get('name'), 'uploader_id': clip_link.get('channelId'), 'thumbnails': thumbs, @@ -146,4 +148,5 @@ class KakaoIE(InfoExtractor): 'like_count': int_or_none(clip.get('likeCount')), 'comment_count': int_or_none(clip.get('commentCount')), 'formats': formats, + 'tags': clip.get('tagList'), } From d439989215fcb1672bc2ac18d4fb6206e12c387a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 11:43:18 +0100 Subject: [PATCH 067/154] [daum] fix VOD and Clip extracton(closes #15015) --- youtube_dl/extractor/daum.py | 106 +++++++++++------------------------ 1 file changed, 32 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 76f021892..137095577 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -2,25 +2,21 @@ from __future__ import unicode_literals -import re import itertools from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, compat_urlparse, ) -from ..utils import ( - int_or_none, - str_to_int, - xpath_text, - unescapeHTML, -) -class DaumIE(InfoExtractor): +class DaumBaseIE(InfoExtractor): + _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/' + + +class DaumIE(DaumBaseIE): _VALID_URL = 
r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P[^?#&]+)' IE_NAME = 'daum.net' @@ -36,6 +32,9 @@ class DaumIE(InfoExtractor): 'duration': 2117, 'view_count': int, 'comment_count': int, + 'uploader_id': 186139, + 'uploader': '콘간지', + 'timestamp': 1387310323, }, }, { 'url': 'http://m.tvpot.daum.net/v/65139429', @@ -44,11 +43,14 @@ class DaumIE(InfoExtractor): 'ext': 'mp4', 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', 'description': 'md5:79794514261164ff27e36a21ad229fc5', - 'upload_date': '20150604', + 'upload_date': '20150118', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 154, 'view_count': int, 'comment_count': int, + 'uploader': 'MBC 예능', + 'uploader_id': 132251, + 'timestamp': 1421604228, }, }, { 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', @@ -59,12 +61,15 @@ class DaumIE(InfoExtractor): 'id': 'vwIpVpCQsT8$', 'ext': 'flv', 'title': '01-Korean War ( Trouble on the horizon )', - 'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름', + 'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름', 'upload_date': '20080223', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 249, 'view_count': int, 'comment_count': int, + 'uploader': '까칠한 墮落始祖 황비홍님의', + 'uploader_id': 560824, + 'timestamp': 1203770745, }, }, { # Requires dte_type=WEB (#9972) @@ -73,60 +78,24 @@ class DaumIE(InfoExtractor): 'info_dict': { 'id': 's3794Uf1NZeZ1qMpGpeqeRU', 'ext': 'mp4', - 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', - 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', - 'upload_date': '20160611', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20170129', + 'uploader': '쇼! 
음악중심', + 'uploader_id': 2653210, + 'timestamp': 1485684628, }, }] def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - movie_data = self._download_json( - 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', - video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'}) - - # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid - if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): - return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) - - info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, - 'Downloading video info', query={'vid': video_id}) - - formats = [] - for format_el in movie_data['output_list']['output_list']: - profile = format_el['profile'] - format_query = compat_urllib_parse_urlencode({ - 'vid': video_id, - 'profile': profile, - }) - url_doc = self._download_xml( - 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, - video_id, note='Downloading video data for %s format' % profile) - format_url = url_doc.find('result/url').text - formats.append({ - 'url': format_url, - 'format_id': profile, - 'width': int_or_none(format_el.get('width')), - 'height': int_or_none(format_el.get('height')), - 'filesize': int_or_none(format_el.get('filesize')), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info.find('TITLE').text, - 'formats': formats, - 'thumbnail': xpath_text(info, 'THUMB_URL'), - 'description': xpath_text(info, 'CONTENTS'), - 'duration': int_or_none(xpath_text(info, 'DURATION')), - 'upload_date': info.find('REGDTTM').text[:8], - 'view_count': str_to_int(xpath_text(info, 'PLAY_CNT')), - 'comment_count': str_to_int(xpath_text(info, 'COMMENT_CNT')), - } + if not video_id.isdigit(): + video_id += '@my' + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) -class DaumClipIE(InfoExtractor): +class DaumClipIE(DaumBaseIE): _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P\d+)' IE_NAME = 'daum.net:clip' _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s' @@ -142,6 +111,9 @@ class DaumClipIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 3868, 'view_count': int, + 'uploader': 'GOMeXP', + 'uploader_id': 6667, + 'timestamp': 1377911092, }, }, { 'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', @@ -154,22 +126,8 @@ class DaumClipIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - clip_info = self._download_json( - 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?clipid=%s' % video_id, - video_id, 'Downloading clip info')['clip_bean'] - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'], - 'title': unescapeHTML(clip_info['title']), - 'thumbnail': clip_info.get('thumb_url'), - 'description': 
clip_info.get('contents'), - 'duration': int_or_none(clip_info.get('duration')), - 'upload_date': clip_info.get('up_date')[:8], - 'view_count': int_or_none(clip_info.get('play_count')), - 'ie_key': 'Daum', - } + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) class DaumListIE(InfoExtractor): From e987ce4bda476a387937e4af5b46f4a412a67830 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 12:40:41 +0100 Subject: [PATCH 068/154] [kakao] remove raw request and extract format total bitrate --- youtube_dl/extractor/kakao.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 96f918b75..32935bb28 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -69,7 +69,8 @@ class KakaoIE(InfoExtractor): '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault']) + 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } impress = self._download_json( @@ -81,21 +82,14 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - tid = impress.get('tid', '') - - query.update({ - 'fields': '-*,outputList,profile,width,height,label,filesize', - 'tid': tid, - 'profile': 'HIGH', - }) - raw = self._download_json( - api_base + 'raw', display_id, 'Downloading video formats info', - query=query, headers=player_header) + query['tid'] = impress.get('tid', '') formats = [] - for fmt in raw.get('outputList', []): + for fmt in clip.get('videoOutputList', []): try: profile_name = fmt['profile'] + if profile_name == 'AUDIO': + continue query.update({ 'profile': profile_name, 'fields': '-*,url', @@ -115,7 +109,8 @@ 
class KakaoIE(InfoExtractor): 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), 'format_note': fmt.get('label'), - 'filesize': int_or_none(fmt.get('filesize')) + 'filesize': int_or_none(fmt.get('filesize')), + 'tbr': int_or_none(fmt.get('kbps')), }) except KeyError: pass From 20cc7c082b82e82050a4e1f1bb815fee51f6c1c2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 16:36:35 +0100 Subject: [PATCH 069/154] [go90] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/go90.py | 149 ----------------------------- 2 files changed, 150 deletions(-) delete mode 100644 youtube_dl/extractor/go90.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5d20ba863..e9b59ce52 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -422,7 +422,6 @@ from .globo import ( GloboArticleIE, ) from .go import GoIE -from .go90 import Go90IE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py deleted file mode 100644 index c3ea717bc..000000000 --- a/youtube_dl/extractor/go90.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) - - -class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'https://www.go90.com/videos/84BUqjLpf9D', - 'md5': 'efa7670dbbbf21a7b07b360652b24a32', - 'info_dict': { - 'id': '84BUqjLpf9D', - 'ext': 'mp4', - 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention', - 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn 
stance. Then, VICE Sports explains NFL contracts.', - 'timestamp': 1491868800, - 'upload_date': '20170411', - 'age_limit': 14, - } - }, { - 'url': 'https://www.go90.com/embed/261MflWkD3N', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - headers = self.geo_verification_headers() - headers.update({ - 'Content-Type': 'application/json; charset=utf-8', - }) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, video_id, - headers=headers, data=b'{"client":"web","device_type":"pc"}') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - message = self._parse_json(e.cause.read().decode(), None)['error']['message'] - if 'region unavailable' in message: - self.raise_geo_restricted(countries=['US']) - raise ExtractorError(message, expected=True) - raise - - if video_data.get('requires_drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - main_video_asset = video_data['main_video_asset'] - - episode_number = int_or_none(video_data.get('episode_number')) - series = None - season = None - season_id = None - season_number = None - for metadata in video_data.get('__children', {}).get('Item', {}).values(): - if metadata.get('type') == 'show': - series = metadata.get('title') - elif metadata.get('type') == 'season': - season = metadata.get('title') - season_id = metadata.get('id') - season_number = int_or_none(metadata.get('season_number')) - - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - thumbnails = [] - formats = [] - subtitles = {} - for asset in video_data.get('assets'): - if asset.get('id') == main_video_asset: - for source in asset.get('sources', []): - source_location = source.get('location') - if not source_location: - continue - source_type = source.get('type') - if source_type == 'hls': - m3u8_formats = 
self._extract_m3u8_formats( - source_location, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - for f in m3u8_formats: - mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url']) - if mobj: - height, tbr = mobj.groups() - height = int_or_none(height) - f.update({ - 'height': f.get('height') or height, - 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None), - 'tbr': f.get('tbr') or int_or_none(tbr), - }) - formats.extend(m3u8_formats) - elif source_type == 'dash': - formats.extend(self._extract_mpd_formats( - source_location, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': source.get('name'), - 'url': source_location, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('bitrate')), - }) - - for caption in asset.get('caption_metadata', []): - caption_url = caption.get('source_url') - if not caption_url: - continue - subtitles.setdefault(caption.get('language', 'en'), []).append({ - 'url': caption_url, - 'ext': determine_ext(caption_url, 'vtt'), - }) - elif asset.get('type') == 'image': - asset_location = asset.get('location') - if not asset_location: - continue - thumbnails.append({ - 'url': asset_location, - 'width': int_or_none(asset.get('width')), - 'height': int_or_none(asset.get('height')), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': video_data.get('short_description'), - 'like_count': int_or_none(video_data.get('like_count')), - 'timestamp': parse_iso8601(video_data.get('released_at')), - 'series': series, - 'episode': episode, - 'season': season, - 'season_id': season_id, - 'season_number': season_number, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'age_limit': parse_age_limit(video_data.get('rating')), - } From 152f22920d73bb0dc24fa357d5904a8dd97a5bf6 Mon Sep 17 00:00:00 2001 From: Remita Amine 
Date: Fri, 1 Nov 2019 17:44:34 +0100 Subject: [PATCH 070/154] [wistia] reduce embed extraction false positives and support inline embeds(closes #22931) --- youtube_dl/extractor/wistia.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index fa142b974..0fbc888ec 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]{10})' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -43,25 +43,26 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + # https://wistia.com/support/embed-and-share/video-on-your-website @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage) if match: return unescapeHTML(match.group('url')) - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) - if match: - return 'wistia:%s' % match.group('id') - match = re.search( r'''(?sx) ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
- ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]{10})\b.*?\2 ''', webpage) if match: return 'wistia:%s' % match.group('id') + match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) From 4c95fcf9e8fa2ed113698d13df55df4aaecd8433 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 21:16:47 +0100 Subject: [PATCH 071/154] [bambuser] remove extractor https://web.archive.org/web/20190808014227/https://go.bambuser.com/shutdown-announcement --- youtube_dl/extractor/bambuser.py | 142 ----------------------------- youtube_dl/extractor/extractors.py | 1 - 2 files changed, 143 deletions(-) delete mode 100644 youtube_dl/extractor/bambuser.py diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py deleted file mode 100644 index 4400ff9c1..000000000 --- a/youtube_dl/extractor/bambuser.py +++ /dev/null @@ -1,142 +0,0 @@ -from __future__ import unicode_literals - -import re -import itertools - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - sanitized_Request, - urlencode_postdata, -) - - -class BambuserIE(InfoExtractor): - IE_NAME = 'bambuser' - _VALID_URL = r'https?://bambuser\.com/v/(?P\d+)' - _API_KEY = '005f64509e19a868399060af746a00aa' - _LOGIN_URL = 'https://bambuser.com/user' - _NETRC_MACHINE = 'bambuser' - - _TEST = { - 'url': 'http://bambuser.com/v/4050584', - # MD5 seems to be flaky, see https://travis-ci.org/ytdl-org/youtube-dl/jobs/14051016#L388 - # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641', - 'info_dict': { - 'id': '4050584', - 'ext': 'flv', - 'title': 'Education engineering days - lightning talks', - 'duration': 3741, - 'uploader': 'pixelversity', - 'uploader_id': '344706', - 'timestamp': 1382976692, - 'upload_date': 
'20131028', - 'view_count': int, - }, - 'params': { - # It doesn't respect the 'Range' header, it would download the whole video - # caused the travis builds to fail: https://travis-ci.org/ytdl-org/youtube-dl/jobs/14493845#L59 - 'skip_download': True, - }, - } - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_form = { - 'form_id': 'user_login', - 'op': 'Log in', - 'name': username, - 'pass': password, - } - - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) - response = self._download_webpage( - request, None, 'Logging in') - - login_error = self._html_search_regex( - r'(?s)
(.+?)
', - response, 'login error', default=None) - if login_error: - raise ExtractorError( - 'Unable to login: %s' % login_error, expected=True) - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s' - % (self._API_KEY, video_id), video_id) - - error = info.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - result = info['result'] - - return { - 'id': video_id, - 'title': result['title'], - 'url': result['url'], - 'thumbnail': result.get('preview'), - 'duration': int_or_none(result.get('length')), - 'uploader': result.get('username'), - 'uploader_id': compat_str(result.get('owner', {}).get('uid')), - 'timestamp': int_or_none(result.get('created')), - 'fps': float_or_none(result.get('framerate')), - 'view_count': int_or_none(result.get('views_total')), - 'comment_count': int_or_none(result.get('comment_count')), - } - - -class BambuserChannelIE(InfoExtractor): - IE_NAME = 'bambuser:channel' - _VALID_URL = r'https?://bambuser\.com/channel/(?P.*?)(?:/|#|\?|$)' - # The maximum number we can get with each request - _STEP = 50 - _TEST = { - 'url': 'http://bambuser.com/channel/pixelversity', - 'info_dict': { - 'title': 'pixelversity', - }, - 'playlist_mincount': 60, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - urls = [] - last_id = '' - for i in itertools.count(1): - req_url = ( - 'http://bambuser.com/xhr-api/index.php?username={user}' - '&sort=created&access_mode=0%2C1%2C2&limit={count}' - '&method=broadcast&format=json&vid_older_than={last}' - ).format(user=user, count=self._STEP, last=last_id) - req = sanitized_Request(req_url) - # Without setting this header, we wouldn't get any result - req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) - data = self._download_json( - 
req, user, 'Downloading page %d' % i) - results = data['result'] - if not results: - break - last_id = results[-1]['vid'] - urls.extend(self.url_result(v['page'], 'Bambuser') for v in results) - - return { - '_type': 'playlist', - 'title': user, - 'entries': urls, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e9b59ce52..af3fff601 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -80,7 +80,6 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE -from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, From 836bfcb54e4d1664815ebffb753a9dc7c9c7d72c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 11:08:51 +0100 Subject: [PATCH 072/154] [flipagram] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/flipagram.py | 115 ----------------------------- 2 files changed, 116 deletions(-) delete mode 100644 youtube_dl/extractor/flipagram.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index af3fff601..33fb461a0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -355,7 +355,6 @@ from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .flickr import FlickrIE -from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py deleted file mode 100644 index b7be40f1b..000000000 --- a/youtube_dl/extractor/flipagram.py +++ /dev/null @@ -1,115 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - float_or_none, - try_get, - 
unified_timestamp, -) - - -class FlipagramIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://flipagram.com/f/nyvTSJMKId', - 'md5': '888dcf08b7ea671381f00fab74692755', - 'info_dict': { - 'id': 'nyvTSJMKId', - 'ext': 'mp4', - 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', - 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', - 'duration': 35.571, - 'timestamp': 1461244995, - 'upload_date': '20160421', - 'uploader': 'kitty juria', - 'uploader_id': 'sjuria101', - 'creator': 'kitty juria', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - 'comments': list, - 'formats': 'mincount:2', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), - video_id) - - flipagram = video_data['flipagram'] - video = flipagram['video'] - - json_ld = self._search_json_ld(webpage, video_id, default={}) - title = json_ld.get('title') or flipagram['captionText'] - description = json_ld.get('description') or flipagram.get('captionText') - - formats = [{ - 'url': video['url'], - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'filesize': int_or_none(video_data.get('size')), - }] - - preview_url = try_get( - flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) - if preview_url: - formats.append({ - 'url': preview_url, - 'ext': 'm4a', - 'vcodec': 'none', - }) - - self._sort_formats(formats) - - counts = flipagram.get('counts', {}) - user = flipagram.get('user', {}) - video_data = flipagram.get('video', {}) - - thumbnails = [{ - 'url': self._proto_relative_url(cover['url']), - 'width': int_or_none(cover.get('width')), - 'height': int_or_none(cover.get('height')), - 'filesize': int_or_none(cover.get('size')), - } for 
cover in flipagram.get('covers', []) if cover.get('url')] - - # Note that this only retrieves comments that are initially loaded. - # For videos with large amounts of comments, most won't be retrieved. - comments = [] - for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): - text = comment.get('comment') - if not text or not isinstance(text, list): - continue - comments.append({ - 'author': comment.get('user', {}).get('name'), - 'author_id': comment.get('user', {}).get('username'), - 'id': comment.get('id'), - 'text': text[0], - 'timestamp': unified_timestamp(comment.get('created')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': float_or_none(flipagram.get('duration'), 1000), - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), - 'uploader': user.get('name'), - 'uploader_id': user.get('username'), - 'creator': user.get('name'), - 'view_count': int_or_none(counts.get('plays')), - 'like_count': int_or_none(counts.get('likes')), - 'repost_count': int_or_none(counts.get('reflips')), - 'comment_count': int_or_none(counts.get('comments')), - 'comments': comments, - 'formats': formats, - } From 79b35e7c15f4a285525b5ec52035ff0f8fc6150d Mon Sep 17 00:00:00 2001 From: geditorit <52565706+geditorit@users.noreply.github.com> Date: Sat, 2 Nov 2019 18:32:49 +0700 Subject: [PATCH 073/154] [gameone] Remove extractor (#21778) --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/gameone.py | 134 ----------------------------- 2 files changed, 138 deletions(-) delete mode 100644 youtube_dl/extractor/gameone.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 33fb461a0..dce08e077 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -401,10 +401,6 @@ from .fusion import FusionIE from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import 
GameInformerIE -from .gameone import ( - GameOneIE, - GameOnePlaylistIE, -) from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py deleted file mode 100644 index a07d69841..000000000 --- a/youtube_dl/extractor/gameone.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - xpath_with_ns, - parse_iso8601, - float_or_none, - int_or_none, -) - -NAMESPACE_MAP = { - 'media': 'http://search.yahoo.com/mrss/', -} - -# URL prefix to download the mp4 files directly instead of streaming via rtmp -# Credits go to XBox-Maniac -# http://board.jdownloader.org/showpost.php?p=185835&postcount=31 -RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/' - - -class GameOneIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P\d+)' - _TESTS = [ - { - 'url': 'http://www.gameone.de/tv/288', - 'md5': '136656b7fb4c9cb4a8e2d500651c499b', - 'info_dict': { - 'id': '288', - 'ext': 'mp4', - 'title': 'Game One - Folge 288', - 'duration': 1238, - 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg', - 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1', - 'age_limit': 16, - 'upload_date': '20140513', - 'timestamp': 1399980122, - } - }, - { - 'url': 'http://gameone.de/tv/220', - 'md5': '5227ca74c4ae6b5f74c0510a7c48839e', - 'info_dict': { - 'id': '220', - 'ext': 'mp4', - 'upload_date': '20120918', - 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker', - 'timestamp': 1347971451, - 'title': 'Game One - Folge 220', - 'duration': 896.62, - 'age_limit': 16, - } - } - - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - og_video = 
self._og_search_video_url(webpage, secure=False) - description = self._html_search_meta('description', webpage) - age_limit = int( - self._search_regex( - r'age=(\d+)', - self._html_search_meta( - 'age-de-meta-label', - webpage), - 'age_limit', - '0')) - mrss_url = self._search_regex(r'mrss=([^&]+)', og_video, 'mrss') - - mrss = self._download_xml(mrss_url, video_id, 'Downloading mrss') - title = mrss.find('.//item/title').text - thumbnail = mrss.find('.//item/image').get('url') - timestamp = parse_iso8601(mrss.find('.//pubDate').text, delimiter=' ') - content = mrss.find(xpath_with_ns('.//media:content', NAMESPACE_MAP)) - content_url = content.get('url') - - content = self._download_xml( - content_url, - video_id, - 'Downloading media:content') - rendition_items = content.findall('.//rendition') - duration = float_or_none(rendition_items[0].get('duration')) - formats = [ - { - 'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text), - 'width': int_or_none(r.get('width')), - 'height': int_or_none(r.get('height')), - 'tbr': int_or_none(r.get('bitrate')), - } - for r in rendition_items - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'description': description, - 'age_limit': age_limit, - 'timestamp': timestamp, - } - - -class GameOnePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$' - IE_NAME = 'gameone:playlist' - _TEST = { - 'url': 'http://www.gameone.de/tv', - 'info_dict': { - 'title': 'GameOne', - }, - 'playlist_mincount': 294, - } - - def _real_extract(self, url): - webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') - max_id = max(map(int, re.findall(r' Date: Sat, 2 Nov 2019 13:09:44 +0100 Subject: [PATCH 074/154] [keek] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/keek.py | 39 ------------------------------ 2 files changed, 40 deletions(-) delete mode 
100644 youtube_dl/extractor/keek.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dce08e077..08facf8d3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -515,7 +515,6 @@ from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .kinopoisk import KinoPoiskIE -from .keek import KeekIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py deleted file mode 100644 index 94a03d277..000000000 --- a/youtube_dl/extractor/keek.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class KeekIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P\w+)' - IE_NAME = 'keek' - _TEST = { - 'url': 'https://www.keek.com/keek/NODfbab', - 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83', - 'info_dict': { - 'id': 'NODfbab', - 'ext': 'mp4', - 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896', - 'uploader': 'ytdl', - 'uploader_id': 'eGT5bab', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - return { - 'id': video_id, - 'url': self._og_search_video_url(webpage), - 'ext': 'mp4', - 'title': self._og_search_description(webpage).strip(), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._search_regex( - r'data-username=(["\'])(?P.+?)\1', webpage, - 'uploader', fatal=False, group='uploader'), - 'uploader_id': self._search_regex( - r'data-user-id=(["\'])(?P.+?)\1', webpage, - 'uploader id', fatal=False, group='uploader_id'), - } From 5e36b63486794750aca0ee6b9b83f27abf6332dc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 13:25:39 +0100 Subject: [PATCH 075/154] [iconosquare] remove extractor --- 
youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/iconosquare.py | 85 ----------------------------- 2 files changed, 86 deletions(-) delete mode 100644 youtube_dl/extractor/iconosquare.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 08facf8d3..dd5f68ca3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -455,7 +455,6 @@ from .hungama import ( HungamaSongIE, ) from .hypem import HypemIE -from .iconosquare import IconosquareIE from .ign import ( IGNIE, OneUPIE, diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py deleted file mode 100644 index a39f422e9..000000000 --- a/youtube_dl/extractor/iconosquare.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - get_element_by_id, - remove_end, -) - - -class IconosquareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P[^/]+)' - _TEST = { - 'url': 'http://statigr.am/p/522207370455279102_24101272', - 'md5': '6eb93b882a3ded7c378ee1d6884b1814', - 'info_dict': { - 'id': '522207370455279102_24101272', - 'ext': 'mp4', - 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)', - 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d', - 'timestamp': 1376471991, - 'upload_date': '20130814', - 'uploader': 'aguynamedpatrick', - 'uploader_id': '24101272', - 'comment_count': int, - 'like_count': int, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - media = self._parse_json( - get_element_by_id('mediaJson', webpage), - video_id) - - formats = [{ - 'url': f['url'], - 'format_id': format_id, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')) - } for format_id, f in media['videos'].items()] - self._sort_formats(formats) - - title = 
remove_end(self._og_search_title(webpage), ' - via Iconosquare') - - timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time')) - description = media.get('caption', {}).get('text') - - uploader = media.get('user', {}).get('username') - uploader_id = media.get('user', {}).get('id') - - comment_count = int_or_none(media.get('comments', {}).get('count')) - like_count = int_or_none(media.get('likes', {}).get('count')) - - thumbnails = [{ - 'url': t['url'], - 'id': thumbnail_id, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')) - } for thumbnail_id, t in media.get('images', {}).items()] - - comments = [{ - 'id': comment.get('id'), - 'text': comment['text'], - 'timestamp': int_or_none(comment.get('created_time')), - 'author': comment.get('from', {}).get('full_name'), - 'author_id': comment.get('from', {}).get('username'), - } for comment in media.get('comments', {}).get('data', []) if 'text' in comment] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'comment_count': comment_count, - 'like_count': like_count, - 'formats': formats, - 'comments': comments, - } From e54924c46fac6a9745868424dc14011da2572178 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 18:13:31 +0100 Subject: [PATCH 076/154] [stv] fix extraction(closes #22928) --- youtube_dl/extractor/stv.py | 89 +++++++++++++------------------------ 1 file changed, 31 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py index ccb074cd4..bae8b71f4 100644 --- a/youtube_dl/extractor/stv.py +++ b/youtube_dl/extractor/stv.py @@ -4,15 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse -) from ..utils import ( - extract_attributes, + 
compat_str, float_or_none, int_or_none, - str_or_none, ) @@ -20,20 +15,20 @@ class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' _TEST = { - 'url': 'https://player.stv.tv/video/7srz/victoria/interview-with-the-cast-ahead-of-new-victoria/', - 'md5': '2ad867d4afd641fa14187596e0fbc91b', + 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', + 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { - 'id': '6016487034001', + 'id': '5333973339001', 'ext': 'mp4', - 'upload_date': '20190321', - 'title': 'Interview with the cast ahead of new Victoria', - 'description': 'Nell Hudson and Lily Travers tell us what to expect in the new season of Victoria.', - 'timestamp': 1553179628, + 'upload_date': '20170301', + 'title': '60 seconds on set with Laura Norton', + 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!", + 'timestamp': 1488388054, 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', } - _PUBLISHER_ID = '1486976045' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', 'video': 'shortform', @@ -41,54 +36,32 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) + resp = self._download_json( + 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), + video_id) - qs = compat_parse_qs(compat_urllib_parse_urlparse(self._search_regex( - r'itemprop="embedURL"[^>]+href="([^"]+)', - webpage, 'embed URL', default=None)).query) - publisher_id = qs.get('publisherID', [None])[0] or self._PUBLISHER_ID + result = resp['results'] + video = result['video'] + video_id = compat_str(video['id']) - player_attr = extract_attributes(self._search_regex( - 
r'(<[^>]+class="bcplayer"[^>]+>)', webpage, 'player', default=None)) or {} + subtitles = {} + _subtitles = result.get('_subtitles') or {} + for ext, sub_url in _subtitles.items(): + subtitles.setdefault('en', []).append({ + 'ext': 'vtt' if ext == 'webvtt' else ext, + 'url': sub_url, + }) - info = {} - duration = ref_id = series = video_id = None - api_ref_id = player_attr.get('data-player-api-refid') - if api_ref_id: - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], api_ref_id), - api_ref_id, fatal=False) - if resp: - result = resp.get('results') or {} - video = result.get('video') or {} - video_id = str_or_none(video.get('id')) - ref_id = video.get('guid') - duration = video.get('length') - programme = result.get('programme') or {} - series = programme.get('name') or programme.get('shortName') - subtitles = {} - _subtitles = result.get('_subtitles') or {} - for ext, sub_url in _subtitles.items(): - subtitles.setdefault('en', []).append({ - 'ext': 'vtt' if ext == 'webvtt' else ext, - 'url': sub_url, - }) - info.update({ - 'description': result.get('summary'), - 'subtitles': subtitles, - 'view_count': int_or_none(result.get('views')), - }) - if not video_id: - video_id = qs.get('videoId', [None])[0] or self._search_regex( - r' Date: Sat, 2 Nov 2019 22:33:51 +0100 Subject: [PATCH 077/154] [bellmedia] add support for marilyn.ca videos(#22193) --- youtube_dl/extractor/bellmedia.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index f36a2452d..485173774 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -22,7 +22,8 @@ class BellMediaIE(InfoExtractor): bravo| mtv| space| - etalk + etalk| + marilyn )\.ca| much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' @@ -70,6 +71,7 @@ class BellMediaIE(InfoExtractor): 'animalplanet': 'aniplan', 'etalk': 'ctv', 'bnnbloomberg': 'bnn', + 
'marilyn': 'ctv_marilyn', } def _real_extract(self, url): From 564275e26fc963fb920236e37c6c19e8e2b046f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Nov 2019 22:04:03 +0100 Subject: [PATCH 078/154] [telegraaf] fix extraction --- youtube_dl/extractor/telegraaf.py | 75 ++++++++++++++++++------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 0f576c1ab..2dc020537 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -4,21 +4,25 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, - remove_end, + int_or_none, + parse_iso8601, + try_get, ) class TelegraafIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P\d+)/[^/]+\.html' + _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/video/(?P\d+)' _TEST = { - 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', + 'url': 'https://www.telegraaf.nl/video/734366489/historisch-scheepswrak-slaat-na-100-jaar-los', 'info_dict': { - 'id': '24353229', + 'id': 'gaMItuoSeUg2', 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, + 'title': 'Historisch scheepswrak slaat na 100 jaar los', + 'description': 'md5:6f53b7c4f55596722ac24d6c0ec00cfb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 55, + 'timestamp': 1572805527, + 'upload_date': '20191103', }, 'params': { # m3u8 download @@ -27,23 +31,30 @@ class TelegraafIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + article_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_id = self._download_json( + 'https://www.telegraaf.nl/graphql', article_id, query={ + 'query': '''{ + article(uid: %s) { + videos { + videoId + } + } 
+}''' % article_id, + })['data']['article']['videos'][0]['videoId'] - player_url = self._html_search_regex( - r']+src="([^"]+")', webpage, 'player URL') - player_page = self._download_webpage( - player_url, video_id, note='Download player webpage') - playlist_url = self._search_regex( - r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') - playlist_data = self._download_json(playlist_url, video_id) + item = self._download_json( + 'https://content.tmgvideo.nl/playlist/item=%s/playlist.json' % video_id, + video_id)['items'][0] + title = item['title'] - item = playlist_data['items'][0] formats = [] - locations = item['locations'] + locations = item.get('locations') or {} for location in locations.get('adaptive', []): - manifest_url = location['src'] + manifest_url = location.get('src') + if not manifest_url: + continue ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -54,25 +65,25 @@ class TelegraafIE(InfoExtractor): else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): + src = try_get(location, lambda x: x['sources'][0]['src']) + if not src: + continue + label = location.get('label') formats.append({ - 'url': location['sources'][0]['src'], - 'width': location.get('width'), - 'height': location.get('height'), - 'format_id': 'http-%s' % location['label'], + 'url': src, + 'width': int_or_none(location.get('width')), + 'height': int_or_none(location.get('height')), + 'format_id': 'http' + ('-%s' % label if label else ''), }) self._sort_formats(formats) - title = remove_end(self._og_search_title(webpage), ' - VIDEO') - description = self._og_search_description(webpage) - duration = item.get('duration') - thumbnail = item.get('poster') - return { 'id': video_id, 'title': title, - 'description': description, + 'description': item.get('description'), 'formats': formats, - 'duration': duration, - 'thumbnail': thumbnail, + 'duration': 
int_or_none(item.get('duration')), + 'thumbnail': item.get('poster'), + 'timestamp': parse_iso8601(item.get('datecreated'), ' '), } From a6e6673e825f6225c3a316b164ddca03fd20b5d2 Mon Sep 17 00:00:00 2001 From: Manu Cornet Date: Sun, 3 Nov 2019 21:23:27 +0000 Subject: [PATCH 079/154] [README.md] Also read permission to the binary in how to update section (#22903) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c39b13616..01f975958 100644 --- a/README.md +++ b/README.md @@ -752,8 +752,8 @@ As a last resort, you can also uninstall the version installed by your package m Afterwards, simply follow [our manual installation instructions](https://ytdl-org.github.io/youtube-dl/download.html): ``` -sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl -sudo chmod a+x /usr/local/bin/youtube-dl +sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl +sudo chmod a+rx /usr/local/bin/youtube-dl hash -r ``` From ef382405c6dc79d2b7e3f81a527232941e2c0b2d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 02:01:01 +0100 Subject: [PATCH 080/154] [mediaset] extract unprotected M3U and MPD manifests(closes #17204) --- youtube_dl/extractor/mediaset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index df3748798..fcbc064ff 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -62,7 +62,6 @@ class MediasetIE(ThePlatformBaseIE): 'uploader': 'Canale 5', 'uploader_id': 'C5', }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', @@ -109,6 +108,11 @@ class MediasetIE(ThePlatformBaseIE): entries.append(embed_url) return entries + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, 
f4m_params=None, transform_rtmp_url=None): + for video in smil.findall(self._xpath_ns('.//video', namespace)): + video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) + return super()._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + def _real_extract(self, url): guid = self._match_id(url) tp_path = 'PR1GhC/media/guid/2702976343/' + guid @@ -118,14 +122,15 @@ class MediasetIE(ThePlatformBaseIE): subtitles = {} first_e = None for asset_type in ('SD', 'HD'): - for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + # TODO: fixup ISM+none manifest URLs + for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): try: tp_formats, tp_subtitles = self._extract_theplatform_smil( update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { 'mbr': 'true', 'formats': f, 'assetTypes': asset_type, - }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type)) except ExtractorError as e: if not first_e: first_e = e From bf45295c5387d0d90b97ca34d377cdaa07c71bcb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 11:13:14 +0100 Subject: [PATCH 081/154] [mediaset] relax URL guid matching(closes #18352) --- youtube_dl/extractor/mediaset.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index fcbc064ff..f976506f4 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -27,7 +27,7 @@ class MediasetIE(ThePlatformBaseIE): (?:video|on-demand)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) - )(?P[0-9A-Z]{16}) + )(?P[0-9A-Z]{16,}) ''' _TESTS = [{ # full episode @@ -77,6 +77,18 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, + }, { + 'url': 
'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', + 'only_matching': True, }] @staticmethod From e452345fc5cee5e79d2cad6be575da563987a4ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 15:43:52 +0100 Subject: [PATCH 082/154] [jamendo] improve extraction - fix album extraction(closes #18564) - improve metadata extraction(closes #18565)(closes #21379) --- youtube_dl/extractor/jamendo.py | 162 +++++++++++++++++++------------- 1 file changed, 99 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index c21827618..12e21eb6f 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -1,38 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import hashlib +import random -from ..compat import compat_urlparse +from ..compat import compat_str from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + clean_html, + int_or_none, + try_get, +) -class JamendoBaseIE(InfoExtractor): - def _extract_meta(self, webpage, fatal=True): - title = self._og_search_title( - webpage, default=None) or self._search_regex( - r'([^<]+)', webpage, - 'title', default=None) - if title: - title = self._search_regex( - r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None) - if not title: - title = self._html_search_meta( - 'name', webpage, 'title', 
fatal=fatal) - mobj = re.search(r'(.+) - (.+)', title or '') - artist, second = mobj.groups() if mobj else [None] * 2 - return title, artist, second - - -class JamendoIE(JamendoBaseIE): +class JamendoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: licensing\.jamendo\.com/[^/]+| (?:www\.)?jamendo\.com ) - /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) + /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))? ''' _TESTS = [{ 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', @@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE): 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, - 'thumbnail': r're:^https?://.*\.jpg' + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1217438117, + 'upload_date': '20080730', } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', @@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE): }] def _real_extract(self, url): - mobj = self._VALID_URL_RE.match(url) - track_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage( - 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), - display_id) - - title, artist, track = self._extract_meta(webpage) + track_id, display_id = self._VALID_URL_RE.match(url).groups() + webpage = self._download_webpage(url, track_id) + models = self._parse_json(self._html_search_regex( + r"data-bundled-models='([^']+)", + webpage, 'bundled models'), track_id) + track = models['track']['models'][0] + title = track_name = track['name'] + get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + artist = get_model('artist') + artist_name = artist.get('name') + if artist_name: + title = '%s - %s' % (artist_name, title) + album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE): ))] self._sort_formats(formats) - thumbnail = self._html_search_meta( - 'image', webpage, 'thumbnail', 
fatal=False) - duration = parse_duration(self._search_regex( - r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', - webpage, 'duration', fatal=False)) + urls = [] + thumbnails = [] + for _, covers in track.get('cover', {}).items(): + for cover_id, cover_url in covers.items(): + if not cover_url or cover_url in urls: + continue + urls.append(cover_url) + size = int_or_none(cover_id.lstrip('size')) + thumbnails.append({ + 'id': cover_id, + 'url': cover_url, + 'width': size, + 'height': size, + }) + + tags = [] + for tag in track.get('tags', []): + tag_name = tag.get('name') + if not tag_name: + continue + tags.append(tag_name) + + stats = track.get('stats') or {} return { 'id': track_id, 'display_id': display_id, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'title': title, - 'duration': duration, - 'artist': artist, - 'track': track, - 'formats': formats + 'description': track.get('description'), + 'duration': int_or_none(track.get('duration')), + 'artist': artist_name, + 'track': track_name, + 'album': album.get('name'), + 'formats': formats, + 'license': '-'.join(track.get('licenseCC', [])) or None, + 'timestamp': int_or_none(track.get('dateCreated')), + 'view_count': int_or_none(stats.get('listenedAll')), + 'like_count': int_or_none(stats.get('favorited')), + 'average_rating': int_or_none(stats.get('averageNote')), + 'tags': tags, } -class JamendoAlbumIE(JamendoBaseIE): - _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' +class JamendoAlbumIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)' _TEST = { 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', - 'title': 'Shearer - Duck On Cover' + 'title': 'Duck On Cover', + 'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239', }, 'playlist': [{ 'md5': 'e1a2fcb42bda30dfac990212924149a8', @@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE): 'title': 'Shearer - Warmachine', 
'artist': 'Shearer', 'track': 'Warmachine', + 'timestamp': 1368089771, + 'upload_date': '20130509', } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', @@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE): 'title': 'Shearer - Without Your Ghost', 'artist': 'Shearer', 'track': 'Without Your Ghost', + 'timestamp': 1368089771, + 'upload_date': '20130509', } }], 'params': { @@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE): } } + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): - mobj = self._VALID_URL_RE.match(url) - album_id = mobj.group('id') + album_id = self._match_id(url) + album = self._call_api('album', album_id) + album_name = album.get('name') - webpage = self._download_webpage(url, mobj.group('display_id')) + entries = [] + for track in album.get('tracks', []): + track_id = track.get('id') + if not track_id: + continue + track_id = compat_str(track_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'https://www.jamendo.com/track/' + track_id, + 'ie_key': JamendoIE.ie_key(), + 'id': track_id, + 'album': album_name, + }) - title, artist, album = self._extract_meta(webpage, fatal=False) - - entries = [{ - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, m.group('path')), - 'ie_key': JamendoIE.ie_key(), - 'id': self._search_regex( - r'/track/(\d+)', m.group('path'), 'track id', default=None), - 'artist': artist, - 'album': album, - } for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', - webpage)] - - return self.playlist_result(entries, album_id, title) + return self.playlist_result( + entries, album_id, album_name, + clean_html(try_get(album, lambda x: 
x['description']['en'], compat_str))) From 2349255abdf822e0bb9508d510db926cae777f8c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 15:51:44 +0100 Subject: [PATCH 083/154] [jamendo] restore track url modification --- youtube_dl/extractor/jamendo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 12e21eb6f..490efa8fb 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -44,7 +44,8 @@ class JamendoIE(InfoExtractor): def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage(url, track_id) + webpage = self._download_webpage( + 'https://www.jamendo.com/track/' + track_id, track_id) models = self._parse_json(self._html_search_regex( r"data-bundled-models='([^']+)", webpage, 'bundled models'), track_id) From 3e4908360417bc29e1446bfa85145193fa2c8462 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 20:05:27 +0100 Subject: [PATCH 084/154] [myspass] fix video URL extraction and improve metadata extraction(closes #22448) --- youtube_dl/extractor/myspass.py | 75 +++++++++++++-------------------- 1 file changed, 29 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 2afe535b5..db7ebc94c 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,73 +1,56 @@ +# coding: utf-8 from __future__ import unicode_literals -import os.path + +import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( - ExtractorError, + int_or_none, + parse_duration, + xpath_text, ) class MySpassIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?myspass\.de/.*' + _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)' _TEST = { 
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', 'info_dict': { 'id': '11741', 'ext': 'mp4', - 'description': 'Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?', - 'title': 'Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2', + 'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?', + 'title': '17.02.2013 - Die Highlights, Teil 2', }, } def _real_extract(self, url): - META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' + video_id = self._match_id(url) - # video id is the last path element of the URL - # usually there is a trailing slash, so also try the second but last - url_path = compat_urllib_parse_urlparse(url).path - url_parent_path, video_id = os.path.split(url_path) - if not video_id: - _, video_id = os.path.split(url_parent_path) - - # get metadata - metadata_url = META_DATA_URL_TEMPLATE % video_id metadata = self._download_xml( - metadata_url, video_id, transform_source=lambda s: s.strip()) + 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id, + video_id) - # extract values from metadata - url_flv_el = metadata.find('url_flv') - if url_flv_el is None: - raise ExtractorError('Unable to extract download url') - video_url = url_flv_el.text - title_el = metadata.find('title') - if title_el is None: - raise ExtractorError('Unable to extract title') - title = title_el.text - format_id_el = metadata.find('format_id') - if format_id_el is None: - format = 'mp4' - else: - format = 
format_id_el.text - description_el = metadata.find('description') - if description_el is not None: - description = description_el.text - else: - description = None - imagePreview_el = metadata.find('imagePreview') - if imagePreview_el is not None: - thumbnail = imagePreview_el.text - else: - thumbnail = None + title = xpath_text(metadata, 'title', fatal=True) + video_url = xpath_text(metadata, 'url_flv', 'download url', True) + video_id_int = int(video_id) + for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + group_int = int(group) + if group_int > video_id_int: + video_url = video_url.replace( + group, compat_str(group_int // video_id_int)) return { 'id': video_id, 'url': video_url, 'title': title, - 'format': format, - 'thumbnail': thumbnail, - 'description': description, + 'thumbnail': xpath_text(metadata, 'imagePreview'), + 'description': xpath_text(metadata, 'description'), + 'duration': parse_duration(xpath_text(metadata, 'duration')), + 'series': xpath_text(metadata, 'format'), + 'season_number': int_or_none(xpath_text(metadata, 'season')), + 'season_id': xpath_text(metadata, 'season_id'), + 'episode': title, + 'episode_number': int_or_none(xpath_text(metadata, 'episode')), } From c69e71733d9619cb1a2bee769b9a381b52901de3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 22:21:00 +0100 Subject: [PATCH 085/154] [msn] add support for Vidible and AOL embeds(closes #22195)(closes #22227) --- youtube_dl/extractor/msn.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 0460cf4d5..0c3813dda 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -41,6 +41,14 @@ class MSNIE(InfoExtractor): }, { 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', 'only_matching': True, + }, { + # 
Vidible(AOL) Embed + 'url': 'https://www.msn.com/en-us/video/animals/yellowstone-park-staffers-catch-deer-engaged-in-behavior-they-cant-explain/vi-AAGfdg1', + 'only_matching': True, + }, { + # Dailymotion Embed + 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L', + 'only_matching': True, }] def _real_extract(self, url): @@ -61,6 +69,18 @@ class MSNIE(InfoExtractor): webpage, 'error', group='error')) raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + player_name = video.get('playerName') + if player_name: + provider_id = video.get('providerId') + if provider_id: + if player_name == 'AOL': + return self.url_result( + 'aol-video:' + provider_id, 'Aol', provider_id) + elif player_name == 'Dailymotion': + return self.url_result( + 'https://www.dailymotion.com/video/' + provider_id, + 'Dailymotion', provider_id) + title = video['title'] formats = [] From 20218040db2b1e063191cc470ce403d35d394e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:21:16 +0700 Subject: [PATCH 086/154] [scte] Add extractor (closes #22975) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/scte.py | 144 +++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 youtube_dl/extractor/scte.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dd5f68ca3..9f43b284d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -980,6 +980,10 @@ from .sbs import SBSIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .scrippsnetworks import ScrippsNetworksWatchIE +from .scte import ( + SCTEIE, + SCTECourseIE, +) from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE diff --git a/youtube_dl/extractor/scte.py 
b/youtube_dl/extractor/scte.py new file mode 100644 index 000000000..ca1de63b6 --- /dev/null +++ b/youtube_dl/extractor/scte.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + urlencode_postdata, +) + + +class SCTEBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx' + _NETRC_MACHINE = 'scte' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_popup = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']welcome\b', r'>Sign Out<')) + + # already logged in + if is_logged(login_popup): + return + + login_form = self._hidden_inputs(login_popup) + + login_form.update({ + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on', + }) + + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + if '|pageRedirect|' not in response and not is_logged(response): + error = self._html_search_regex( + r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class SCTEIE(SCTEBaseIE): + _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484', + 'info_dict': { + 'title': 'Introduction to DOCSIS Engineering 
Professional', + 'id': '31484', + }, + 'playlist_count': 5, + 'skip': 'Requires account credentials', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title') + + context_id = self._search_regex(r'context-(\d+)', webpage, video_id) + content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id + context = decode_packed_codes(self._download_webpage( + '%smobile/data.js' % content_base, video_id)) + + data = self._parse_xml( + self._search_regex( + r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"), + video_id) + + entries = [] + for asset in data.findall('.//asset'): + asset_url = asset.get('url') + if not asset_url or not asset_url.endswith('.mp4'): + continue + asset_id = self._search_regex( + r'video_([^_]+)_', asset_url, 'asset id', default=None) + if not asset_id: + continue + entries.append({ + 'id': asset_id, + 'title': title, + 'url': content_base + asset_url, + }) + + return self.playlist_result(entries, video_id, title) + + +class SCTECourseIE(SCTEBaseIE): + _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3639', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3073', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + title = self._search_regex( + r'<h1>(.+?)</h1>', webpage, 'title', default=None) + + entries = [] + for mobj in re.finditer( + r'''(?x) + <a[^>]+ + href=(["\']) + (?P<url> + https?://learning\.scte\.org/mod/ + (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*? 
+ \bid=\d+ + ) + ''', + webpage): + item_url = mobj.group('url') + if item_url == url: + continue + ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm' + else SCTECourseIE.ie_key()) + entries.append(self.url_result(item_url, ie=ie)) + + return self.playlist_result(entries, course_id, title) From 1a4e4b0bfeb83b24755f80630d1e7f3427a5bf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:31:40 +0700 Subject: [PATCH 087/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/ChangeLog b/ChangeLog index fcab1102c..338dd456b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,47 @@ +version <unreleased> + +Extractors ++ [scte] Add support for learning.scte.org (#22975) ++ [msn] Add support for Vidible and AOL embeds (#22195, #22227) +* [myspass] Fix video URL extraction and improve metadata extraction (#22448) +* [jamendo] Improve extraction + * Fix album extraction (#18564) + * Improve metadata extraction (#18565, #21379) +* [mediaset] Relax URL guid matching (#18352) ++ [mediaset] Extract unprotected M3U and MPD manifests (#17204) +* [telegraaf] Fix extraction ++ [bellmedia] Add support for marilyn.ca videos (#22193) +* [stv] Fix extraction (#22928) +- [iconosquare] Remove extractor +- [keek] Remove extractor +- [gameone] Remove extractor (#21778) +- [flipagram] Remove extractor +- [bambuser] Remove extractor +* [wistia] Reduce embed extraction false positives ++ [wistia] Add support for inline embeds (#22931) +- [go90] Remove extractor +* [kakao] Remove raw request ++ [kakao] Extract format total bitrate +* [daum] Fix VOD and Clip extracton (#15015) +* [kakao] Improve extraction + + Add support for embed URLs + + Add support for Kakao Legacy vid based embed URLs + * Only extract fields used for extraction + * Strip description and extract tags +* [mixcloud] Fix cloudcast data extraction (#22821) +* [yahoo] Improve 
extraction + + Add support for live streams (#3597, #3779, #22178) + * Bypass cookie consent page for european domains (#16948, #22576) + + Add generic support for embeds (#20332) +* [tv2] Fix and improve extraction (#22787) ++ [tv2dk] Add support for TV2 DK sites +* [onet] Improve extraction … + + Add support for onet100.vod.pl + + Extract m3u8 formats + * Correct audio only format info +* [fox9] Fix extraction + + version 2019.10.29 Core From ea07412ebf6fff7c17bcac9960cfe4e92ed62f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:32:56 +0700 Subject: [PATCH 088/154] release 2019.11.05 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 12 ++++-------- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 22 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f82502bd1..12de9add2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.29 + [debug] youtube-dl version 2019.11.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5ef983d43..8a6202cf6 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 8f05aa79f..83f91d5fe 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. 
Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index e90900d8d..be8e70f1e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.29 + [debug] youtube-dl version 2019.11.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 7021d7397..7544d171c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 338dd456b..d46d20082 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.11.05 Extractors + [scte] Add support for learning.scte.org (#22975) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index af905db5a..536b87479 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -76,8 +76,6 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 - - **bambuser** - - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -284,12 +282,12 @@ - **FiveThirtyEight** - **FiveTV** - **Flickr** - - **Flipagram** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Formula1** - **FOX** - **FOX9** + - **FOX9News** - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** @@ -315,8 +313,6 @@ - **FXNetworks** - **Gaia** - **GameInformer** - - **GameOne** - - **gameone:playlist** - **GameSpot** - **GameStar** - **Gaskrank** @@ -331,7 +327,6 @@ - **Globo** - **GloboArticle** - **Go** - - **Go90** - **GodTube** - **Golem** - **GoogleDrive** @@ -366,7 +361,6 @@ - **Hungama** - **HungamaSong** - **Hypem** - - **Iconosquare** - **ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists @@ -406,7 +400,6 @@ - **Kankan** - **Karaoketv** - **KarriereVideos** - - **keek** - **KeezMovies** - **Ketnet** - **KhanAcademy** @@ -777,6 +770,8 @@ - **Screencast** - **ScreencastOMatic** - **scrippsnetworks:watch** + - **SCTE** + - 
**SCTECourse** - **Seeker** - **SenateISVP** - **SendtoNews** @@ -926,6 +921,7 @@ - **TV2** - **tv2.hu** - **TV2Article** + - **TV2DK** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 924f26ca8..8012a66db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.29' +__version__ = '2019.11.05' From e9b95167af3f9cacd16e379a40bacb27999840b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 10:03:38 +0100 Subject: [PATCH 089/154] [roosterteeth] fix login request(closes #16094)(closes #22689) --- youtube_dl/extractor/roosterteeth.py | 55 +++++++++++----------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8d88ee499..8883639b2 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -18,7 +16,6 @@ from ..utils import ( class RoosterTeethIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' - _LOGIN_URL = 'https://roosterteeth.com/login' _NETRC_MACHINE = 'roosterteeth' _TESTS = [{ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', @@ -53,48 +50,40 @@ class RoosterTeethIE(InfoExtractor): 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'only_matching': True, }] + _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/' def _login(self): username, password = self._get_login_info() if username is None: return - login_page = self._download_webpage( - self._LOGIN_URL, None, - 
note='Downloading login page', - errnote='Unable to download login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, - }) - - login_request = self._download_webpage( - self._LOGIN_URL, None, - note='Logging in', - data=urlencode_postdata(login_form), - headers={ - 'Referer': self._LOGIN_URL, - }) - - if not any(re.search(p, login_request) for p in ( - r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', - r'>Sign Out<')): - error = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', - login_request, 'alert', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) def _real_initialize(self): + if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): + return self._login() def _real_extract(self, url): display_id = self._match_id(url) - api_episode_url = 'https://svod-be.roosterteeth.com/api/v1/episodes/%s' % display_id + api_episode_url = self._EPISODE_BASE_URL + display_id try: m3u8_url = self._download_json( From b77c3949e899902de78b140f6e444dc55bac824f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 14:04:17 +0100 
Subject: [PATCH 090/154] [patreon] minimize reponse size and extract uploader_id and filesize --- youtube_dl/extractor/patreon.py | 52 +++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 426dd8121..761a4b1de 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -6,7 +6,11 @@ from ..utils import ( clean_html, determine_ext, int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, parse_iso8601, + str_or_none, + try_get, ) @@ -24,6 +28,7 @@ class PatreonIE(InfoExtractor): 'thumbnail': 're:^https?://.*$', 'timestamp': 1406473987, 'upload_date': '20140727', + 'uploader_id': '87145', }, }, { 'url': 'http://www.patreon.com/creation?hid=754133', @@ -90,7 +95,13 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) post = self._download_json( - 'https://www.patreon.com/api/posts/' + video_id, video_id) + 'https://www.patreon.com/api/posts/' + video_id, video_id, query={ + 'fields[media]': 'download_url,mimetype,size_bytes', + 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title', + 'fields[user]': 'full_name,url', + 'json-api-use-default-includes': 'false', + 'include': 'media,user', + }) attributes = post['data']['attributes'] title = attributes['title'].strip() image = attributes.get('image') or {} @@ -104,33 +115,42 @@ class PatreonIE(InfoExtractor): 'comment_count': int_or_none(attributes.get('comment_count')), } - def add_file(file_data): - file_url = file_data.get('url') - if file_url: - info.update({ - 'url': file_url, - 'ext': determine_ext(file_data.get('name'), 'mp3'), - }) - for i in post.get('included', []): i_type = i.get('type') - if i_type == 'attachment': - add_file(i.get('attributes') or {}) + if i_type == 'media': + media_attributes = i.get('attributes') or {} + download_url = media_attributes.get('download_url') + ext = 
mimetype2ext(media_attributes.get('mimetype')) + if download_url and ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'filesize': int_or_none(media_attributes.get('size_bytes')), + 'url': download_url, + }) elif i_type == 'user': user_attributes = i.get('attributes') if user_attributes: info.update({ 'uploader': user_attributes.get('full_name'), + 'uploader_id': str_or_none(i.get('id')), 'uploader_url': user_attributes.get('url'), }) if not info.get('url'): - add_file(attributes.get('post_file') or {}) + embed_url = try_get(attributes, lambda x: x['embed']['url']) + if embed_url: + info.update({ + '_type': 'url', + 'url': embed_url, + }) if not info.get('url'): - info.update({ - '_type': 'url', - 'url': attributes['embed']['url'], - }) + post_file = attributes['post_file'] + ext = determine_ext(post_file.get('name')) + if ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'url': post_file['url'], + }) return info From 2318629b2b79cad5fcab743bce86233a7592ed46 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 14:04:50 +0100 Subject: [PATCH 091/154] [dplay] minimize response size --- youtube_dl/extractor/dplay.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index d9c3d59cd..a7b9db568 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -146,6 +146,11 @@ class DPlayIE(InfoExtractor): video = self._download_json( disco_base + 'content/videos/' + display_id, display_id, headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', 'include': 'images,primaryChannel,show,tags' }) video_id = video['data']['id'] @@ -226,7 +231,6 @@ class DPlayIE(InfoExtractor): 'series': series, 'season_number': int_or_none(info.get('seasonNumber')), 
'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), 'creator': creator, 'tags': tags, 'thumbnails': thumbnails, From b6139cb0c3635eb96e39973ab288c17a9f104067 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 22:56:25 +0100 Subject: [PATCH 092/154] [common] pass headers to _extract_(m3u8|mpd)_formats methods --- youtube_dl/extractor/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 50d48c40d..2688b19e4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1586,12 +1586,12 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): + fatal=True, live=False, headers=None): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal) + fatal=fatal, headers=headers) if res is False: return [] @@ -2009,12 +2009,12 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers=None): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal) + fatal=fatal, headers=None) if res is False: return [] mpd_doc, urlh = res From d7def23d0539430f5d816f1cfd733e436f62c257 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 23:08:42 +0100 Subject: [PATCH 093/154] [hotstar] pass Referer header to format requests(closes #22836) --- 
youtube_dl/extractor/hotstar.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index f9f7c5a64..f97eefa3d 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -118,6 +118,7 @@ class HotStarIE(HotStarBaseIE): if video_data.get('drmProtected'): raise ExtractorError('This video is DRM protected.', expected=True) + headers = {'Referer': url} formats = [] geo_restricted = False playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets'] @@ -137,10 +138,11 @@ class HotStarIE(HotStarBaseIE): if 'package:hls' in tags or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls')) + entry_protocol='m3u8_native', + m3u8_id='hls', headers=headers)) elif 'package:dash' in tags or ext == 'mpd': formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash')) + format_url, video_id, mpd_id='dash', headers=headers)) elif ext == 'f4m': # produce broken files pass @@ -158,6 +160,9 @@ class HotStarIE(HotStarBaseIE): self.raise_geo_restricted(countries=['IN']) self._sort_formats(formats) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + return { 'id': video_id, 'title': title, From 57033e35e58e1d57ab3be5ffe5df5a80a5dbcf83 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 23:41:57 +0100 Subject: [PATCH 094/154] [common] fix typo --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2688b19e4..1e6b66d25 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2014,7 +2014,7 @@ class InfoExtractor(object): mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal, headers=None) + 
fatal=fatal, headers=headers) if res is False: return [] mpd_doc, urlh = res From 3ec86619e33a3d1e29c14ec053d7e420ac8b62ae Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 07:18:29 +0100 Subject: [PATCH 095/154] [common] initialize headers param with empty dict --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1e6b66d25..4a683f6d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1586,7 +1586,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, headers=None): + fatal=True, live=False, headers={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -2009,7 +2009,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers=None): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', From d64ec1242e9dec03ea2aa86b6e913db78c8619e0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 10:44:19 +0100 Subject: [PATCH 096/154] [onionstudios] fix extraction --- youtube_dl/extractor/onionstudios.py | 78 ++++++++++++++++------------ 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index c6e3d5640..7f8c6f0d3 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -5,10 +5,11 @@ import re from .common import InfoExtractor from ..utils 
import ( - determine_ext, + compat_str, int_or_none, - float_or_none, - mimetype2ext, + js_to_json, + parse_iso8601, + try_get, ) @@ -17,14 +18,16 @@ class OnionStudiosIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': '719d1f8c32094b8c33902c17bcae5e34', + 'md5': '5a118d466d62b5cd03647cf2c593977f', 'info_dict': { 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', + 'description': 'md5:545299bda6abf87e5ec666548c6a9448', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'The A.V. Club', - 'uploader_id': 'the-av-club', + 'uploader': 'a.v. club', + 'upload_date': '20150619', + 'timestamp': 1434728546, }, }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -44,38 +47,49 @@ class OnionStudiosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://onionstudios.com/embed/dc94dc2899fe644c0e7241fa04c1b732.js', + video_id) + mcp_id = compat_str(self._parse_json(self._search_regex( + r'window\.mcpMapping\s*=\s*({.+?});', webpage, + 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) video_data = self._download_json( - 'http://www.onionstudios.com/video/%s.json' % video_id, video_id) - - title = video_data['title'] - + 'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, + mcp_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + mcp_id, query={'mcpids': mcp_id})['data'][0] formats = [] - for source in video_data.get('sources', []): - source_url = source.get('url') - if not source_url: - continue - ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url) - if ext == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - tbr = int_or_none(source.get('bitrate')) - formats.append({ - 'format_id': ext + ('-%d' % tbr if tbr else ''), - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'tbr': tbr, - 'ext': ext, - }) + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, mcp_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': video_data.get('poster_url'), - 'uploader': video_data.get('channel_name'), - 'uploader_id': video_data.get('channel_slug'), - 'duration': float_or_none(video_data.get('duration', 1000)), - 'tags': video_data.get('tags'), + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), } From 55adb63e5412fa5556be22e97d61b8d27c7a5e67 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 19:56:10 +0100 Subject: [PATCH 097/154] [kinja] add support for Kinja embeds closes #5756 closes #11282 closes #22237 closes #22384 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 17 ++- youtube_dl/extractor/kinja.py | 221 +++++++++++++++++++++++++++ youtube_dl/extractor/onionstudios.py | 54 +------ 4 files changed, 241 insertions(+), 52 deletions(-) create mode 100644 
youtube_dl/extractor/kinja.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f43b284d..9e3b554fa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -513,6 +513,7 @@ from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c0780e98..3d919f656 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE class GenericIE(InfoExtractor): @@ -1487,16 +1488,18 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # OnionStudios embed + # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'info_dict': { - 'id': '2855', + 'id': '106351', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? 
This Man Will Mumble An Explanation At You', + 'description': 'Migrated from OnionStudios', 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', + 'uploader': 'clickhole', + 'upload_date': '20150527', + 'timestamp': 1432744860, } }, # SnagFilms embed @@ -2894,6 +2897,12 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') + # Look for Kinja embeds + kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) + if kinja_embed_urls: + return self.playlist_from_matches( + kinja_embed_urls, video_id, video_title) + # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py new file mode 100644 index 000000000..79e3026d2 --- /dev/null +++ b/youtube_dl/extractor/kinja.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + unescapeHTML, + urljoin, +) + + +class KinjaEmbedIE(InfoExtractor): + IENAME = 'kinja:embed' + _DOMAIN_REGEX = r'''(?:[^.]+\.)? 
+ (?: + avclub| + clickhole| + deadspin| + gizmodo| + jalopnik| + jezebel| + kinja| + kotaku| + lifehacker| + splinternews| + the(?:inventory|onion|root|takeout) + )\.com''' + _COMMON_REGEX = r'''/ + (?: + ajax/inset| + embed/video + )/iframe\?.*?\bid=''' + _VALID_URL = r'''(?x)https?://%s%s + (?P<type> + fb| + imgur| + instagram| + jwp(?:layer)?-video| + kinjavideo| + mcp| + megaphone| + ooyala| + soundcloud(?:-playlist)?| + tumblr-post| + twitch-stream| + twitter| + ustream-channel| + vimeo| + vine| + youtube-(?:list|video) + )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _TESTS = [{ + 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', + 'only_matching': True, + }, { + 'url': 
'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', + 'only_matching': True, + }] + _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') + _PROVIDER_MAP = { + 'fb': ('facebook.com/video.php?v=', 'Facebook'), + 'imgur': ('imgur.com/', 'Imgur'), + 'instagram': ('instagram.com/p/', 'Instagram'), + 'jwplayer-video': _JWPLATFORM_PROVIDER, + 'jwp-video': _JWPLATFORM_PROVIDER, + 'megaphone': ('player.megaphone.fm/', 'Generic'), + 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), + 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), + 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), + 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), + 'twitch-stream': ('twitch.tv/', 'TwitchStream'), + 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), + 'ustream-channel': ('ustream.tv/embed/', 'Ustream'), + 'vimeo': ('vimeo.com/', 'Vimeo'), + 'vine': ('vine.co/v/', 'Vine'), + 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), + 'youtube-video': ('youtube.com/embed/', 'Youtube'), + } + + @staticmethod + def _extract_urls(webpage, url): + return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( + r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), + webpage)] + + def _real_extract(self, url): + video_type, video_id = re.match(self._VALID_URL, url).groups() + + provider = self._PROVIDER_MAP.get(video_type) + if provider: + video_id = compat_urllib_parse_unquote(video_id) + if video_type == 'tumblr-post': + video_id, blog = video_id.split('-', 1) + result_url = provider[0] % (blog, video_id) + elif video_type == 'youtube-list': + video_id, playlist_id = video_id.split('/') + result_url = provider[0] % (video_id, playlist_id) + else: + if 
video_type == 'ooyala': + video_id = video_id.split('/')[0] + result_url = provider[0] + video_id + return self.url_result('http://' + result_url, provider[1]) + + if video_type == 'kinjavideo': + data = self._download_json( + 'https://kinja.com/api/core/video/views/videoById', + video_id, query={'videoId': video_id})['data'] + title = data['title'] + + formats = [] + for k in ('signedPlaylist', 'streaming'): + m3u8_url = data.get(k + 'Url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + thumbnail = None + poster = data.get('poster') or {} + poster_id = poster.get('id') + if poster_id: + thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'formats': formats, + 'tags': data.get('tags'), + 'timestamp': int_or_none(try_get( + data, lambda x: x['postInfo']['publishTimeMillis']), 1000), + 'thumbnail': thumbnail, + 'uploader': data.get('network'), + } + else: + video_data = self._download_json( + 'https://api.vmh.univision.com/metadata/v1/content/' + video_id, + video_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + video_id, query={'mcpids': video_id})['data'][0] + formats = [] + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': 
int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), + 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), + } diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 7f8c6f0d3..cf5c39e66 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,13 +4,8 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - compat_str, - int_or_none, - js_to_json, - parse_iso8601, - try_get, -) +from ..compat import compat_str +from ..utils import js_to_json class OnionStudiosIE(InfoExtractor): @@ -20,7 +15,7 @@ class OnionStudiosIE(InfoExtractor): 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', 'md5': '5a118d466d62b5cd03647cf2c593977f', 'info_dict': { - 'id': '2937', + 'id': '3459881', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', 'description': 'md5:545299bda6abf87e5ec666548c6a9448', @@ -53,43 +48,6 @@ class OnionStudiosIE(InfoExtractor): mcp_id = compat_str(self._parse_json(self._search_regex( r'window\.mcpMapping\s*=\s*({.+?});', webpage, 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) - video_data = self._download_json( - 'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, - mcp_id)['videoMetadata'] - iptc = video_data['photoVideoMetadataIPTC'] - title = iptc['title']['en'] - fmg = video_data.get('photoVideoMetadata_fmg') or {} - tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' - data = 
self._download_json( - tvss_domain + '/api/v3/video-auth/url-signature-tokens', - mcp_id, query={'mcpids': mcp_id})['data'][0] - formats = [] - - rendition_url = data.get('renditionUrl') - if rendition_url: - formats = self._extract_m3u8_formats( - rendition_url, mcp_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - - fallback_rendition_url = data.get('fallbackRenditionUrl') - if fallback_rendition_url: - formats.append({ - 'format_id': 'fallback', - 'tbr': int_or_none(self._search_regex( - r'_(\d+)\.mp4', fallback_rendition_url, - 'bitrate', default=None)), - 'url': fallback_rendition_url, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), - 'uploader': fmg.get('network'), - 'duration': int_or_none(iptc.get('fileDuration')), - 'formats': formats, - 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), - 'timestamp': parse_iso8601(iptc.get('dateReleased')), - } + return self.url_result( + 'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id, + 'KinjaEmbed', mcp_id) From 5d92b407e0ea856e3dbadfef35e5258e94e0bb23 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 20:41:49 +0100 Subject: [PATCH 098/154] [mixcloud] improve extraction - improve metadata extraction(closes #11721) - fix playlist extraction(closes #22378) - fix user mixes extraction(closes #15197)(closes #17865) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/mixcloud.py | 498 +++++++++++++---------------- 2 files changed, 225 insertions(+), 274 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9e3b554fa..2f9ba6893 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -639,7 +639,6 @@ from .mixcloud import ( MixcloudIE, MixcloudUserIE, MixcloudPlaylistIE, - MixcloudStreamIE, ) from .mlb import MLBIE from .mnet import 
MnetIE diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index e5f631506..9759560f1 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import functools import itertools import re @@ -11,28 +10,37 @@ from ..compat import ( compat_ord, compat_str, compat_urllib_parse_unquote, - compat_urlparse, compat_zip ) from ..utils import ( - clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, - str_to_int, + parse_iso8601, + strip_or_none, try_get, - urljoin, ) -class MixcloudIE(InfoExtractor): +class MixcloudBaseIE(InfoExtractor): + def _call_api(self, object_type, object_fields, display_id, username, slug=None): + lookup_key = object_type + 'Lookup' + return self._download_json( + 'https://www.mixcloud.com/graphql', display_id, query={ + 'query': '''{ + %s(lookup: {username: "%s"%s}) { + %s + } +}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields) + })['data'][lookup_key] + + +class MixcloudIE(MixcloudBaseIE): _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { - 'id': 'dholbach-cryptkeeper', + 'id': 'dholbach_cryptkeeper', 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', @@ -40,11 +48,13 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, + 'timestamp': 1321359578, + 'upload_date': '20111115', }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { - 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', + 'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat', 'ext': 'mp3', 'title': 'Caribou 7 inch Vinyl 
Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', @@ -52,11 +62,14 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*', 'view_count': int, + 'timestamp': 1422987057, + 'upload_date': '20150203', }, }, { 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', 'only_matching': True, }] + _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD' @staticmethod def _decrypt_xor_cipher(key, ciphertext): @@ -66,177 +79,193 @@ class MixcloudIE(InfoExtractor): for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader = mobj.group(1) - cloudcast_name = mobj.group(2) - track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name))) + username, slug = re.match(self._VALID_URL, url).groups() + username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug) + track_id = '%s_%s' % (username, slug) - webpage = self._download_webpage(url, track_id) + cloudcast = self._call_api('cloudcast', '''audioLength + comments(first: 100) { + edges { + node { + comment + created + user { + displayName + username + } + } + } + totalCount + } + description + favorites { + totalCount + } + featuringArtistList + isExclusive + name + owner { + displayName + url + username + } + picture(width: 1024, height: 1024) { + url + } + plays + publishDate + reposts { + totalCount + } + streamInfo { + dashUrl + hlsUrl + url + } + tags { + tag { + name + } + }''', track_id, username, slug) - # Legacy path - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + title = cloudcast['name'] - if encrypted_play_info is not None: - # Decode - encrypted_play_info = compat_b64decode(encrypted_play_info) - else: - # New path - full_info_json = self._parse_json(self._html_search_regex( - r'<script id="relay-data" 
type="text/x-mixcloud">([^<]+)</script>', - webpage, 'play info'), 'play info') - for item in full_info_json: - item_data = try_get(item, [ - lambda x: x['cloudcast']['data']['cloudcastLookup'], - lambda x: x['cloudcastLookup']['data']['cloudcastLookup'], - ], dict) - if try_get(item_data, lambda x: x['streamInfo']['url']): - info_json = item_data - break - else: - raise ExtractorError('Failed to extract matching stream info') + stream_info = cloudcast['streamInfo'] + formats = [] - message = self._html_search_regex( - r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', - webpage, 'error message', default=None) - - js_url = self._search_regex( - r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', - webpage, 'js url') - js = self._download_webpage(js_url, track_id, 'Downloading JS') - # Known plaintext attack - if encrypted_play_info: - kps = ['{"stream_url":'] - kpa_target = encrypted_play_info - else: - kps = ['https://', 'http://'] - kpa_target = compat_b64decode(info_json['streamInfo']['url']) - for kp in kps: - partial_key = self._decrypt_xor_cipher(kpa_target, kp) - for quote in ["'", '"']: - key = self._search_regex( - r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), - js, 'encryption key', default=None) - if key is not None: - break - else: + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: continue - break - else: - raise ExtractorError('Failed to extract encryption key') + decrypted = self._decrypt_xor_cipher( + self._DECRYPTION_KEY, compat_b64decode(format_url)) + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 
'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, + }) - if encrypted_play_info is not None: - play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - song_url = play_info['stream_url'] - formats = [{ - 'format_id': 'normal', - 'url': song_url - }] + if not formats and cloudcast.get('isExclusive'): + self.raise_login_required() - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', - r'/listeners/?">([0-9,.]+)</a>', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + self._sort_formats(formats) - else: - title = info_json['name'] - thumbnail = urljoin( - 'https://thumbnailer.mixcloud.com/unsafe/600x600/', - try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) - uploader = try_get(info_json, lambda x: x['owner']['displayName']) - uploader_id = try_get(info_json, lambda x: x['owner']['username']) - description = try_get(info_json, lambda x: x['description']) - view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) + comments = [] + for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): + node = edge.get('node') or {} + text = strip_or_none(node.get('comment')) + if not text: + continue + user = node.get('user') or {} + comments.append({ + 
'author': user.get('displayName'), + 'author_id': user.get('username'), + 'text': text, + 'timestamp': parse_iso8601(node.get('created')), + }) - stream_info = info_json['streamInfo'] - formats = [] + tags = [] + for t in cloudcast.get('tags'): + tag = try_get(t, lambda x: x['tag']['name'], compat_str) + if not tag: + tags.append(tag) - def decrypt_url(f_url): - for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): - decrypted_url = self._decrypt_xor_cipher(k, f_url) - if re.search(r'^https?://[0-9A-Za-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): - return decrypted_url + get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - for url_key in ('url', 'hlsUrl', 'dashUrl'): - format_url = stream_info.get(url_key) - if not format_url: - continue - decrypted = decrypt_url(compat_b64decode(format_url)) - if not decrypted: - continue - if url_key == 'hlsUrl': - formats.extend(self._extract_m3u8_formats( - decrypted, track_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif url_key == 'dashUrl': - formats.extend(self._extract_mpd_formats( - decrypted, track_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': 'http', - 'url': decrypted, - 'downloader_options': { - # Mixcloud starts throttling at >~5M - 'http_chunk_size': 5242880, - }, - }) - self._sort_formats(formats) + owner = cloudcast.get('owner') or {} return { 'id': track_id, 'title': title, 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, + 'description': cloudcast.get('description'), + 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str), + 'uploader': owner.get('displayName'), + 'timestamp': parse_iso8601(cloudcast.get('publishDate')), + 'uploader_id': owner.get('username'), + 'uploader_url': owner.get('url'), + 'duration': int_or_none(cloudcast.get('audioLength')), + 'view_count': 
int_or_none(cloudcast.get('plays')), + 'like_count': get_count('favorites'), + 'repost_count': get_count('reposts'), + 'comment_count': get_count('comments'), + 'comments': comments, + 'tags': tags, + 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, } -class MixcloudPlaylistBaseIE(InfoExtractor): - _PAGE_SIZE = 24 +class MixcloudPlaylistBaseIE(MixcloudBaseIE): + def _get_cloudcast(self, node): + return node - def _find_urls_in_page(self, page): - for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page): - yield self.url_result( - compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)), - MixcloudIE.ie_key()) + def _get_playlist_title(self, title, slug): + return title - def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None): - real_page_number = real_page_number or current_page + 1 - return self._download_webpage( - 'https://www.mixcloud.com/%s/' % path, video_id, - note='Download %s (page %d)' % (page_name, current_page + 1), - errnote='Unable to download %s' % page_name, - query={'page': real_page_number, 'list': 'main', '_ajax': '1'}, - headers={'X-Requested-With': 'XMLHttpRequest'}) + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + username = compat_urllib_parse_unquote(username) + if not slug: + slug = 'uploads' + else: + slug = compat_urllib_parse_unquote(slug) + playlist_id = '%s_%s' % (username, slug) - def _tracks_page_func(self, page, video_id, page_name, current_page): - resp = self._fetch_tracks_page(page, video_id, page_name, current_page) + is_playlist_type = self._ROOT_TYPE == 'playlist' + playlist_type = 'items' if is_playlist_type else slug + list_filter = '' - for item in self._find_urls_in_page(resp): - yield item + has_next_page = True + entries = [] + while has_next_page: + playlist = self._call_api( + self._ROOT_TYPE, '''%s + %s + %s(first: 100%s) { + edges { + node { + %s + } + } + pageInfo { + endCursor + 
hasNextPage + } + }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE), + playlist_id, username, slug if is_playlist_type else None) - def _get_user_description(self, page_content): - return self._html_search_regex( - r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>', - page_content, 'user description', fatal=False) + items = playlist.get(playlist_type) or {} + for edge in items.get('edges', []): + cloudcast = self._get_cloudcast(edge.get('node') or {}) + cloudcast_url = cloudcast.get('url') + if not cloudcast_url: + continue + entries.append(self.url_result( + cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) + + page_info = items['pageInfo'] + has_next_page = page_info['hasNextPage'] + list_filter = ', after: "%s"' % page_info['endCursor'] + + return self.playlist_result( + entries, playlist_id, + self._get_playlist_title(playlist[self._TITLE_KEY], slug), + playlist.get(self._DESCRIPTION_KEY)) class MixcloudUserIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$' + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$' IE_NAME = 'mixcloud:user' _TESTS = [{ @@ -244,68 +273,58 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'playlist_mincount': 11, + 'playlist_mincount': 36, }, { 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'playlist_mincount': 11, + 'playlist_mincount': 36, }, { 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', 'title': 
'Daniel Holbach (favorites)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'params': { - 'playlist_items': '1-100', - }, - 'playlist_mincount': 100, + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 396, }, { 'url': 'http://www.mixcloud.com/dholbach/listens/', 'info_dict': { 'id': 'dholbach_listens', 'title': 'Daniel Holbach (listens)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'params': { - 'playlist_items': '1-100', + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 1623, + 'skip': 'Large list', + }, { + 'url': 'https://www.mixcloud.com/FirstEar/stream/', + 'info_dict': { + 'id': 'FirstEar_stream', + 'title': 'First Ear (stream)', + 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', }, - 'playlist_mincount': 100, + 'playlist_mincount': 271, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user') - list_type = mobj.group('type') + _TITLE_KEY = 'displayName' + _DESCRIPTION_KEY = 'biog' + _ROOT_TYPE = 'user' + _NODE_TEMPLATE = '''slug + url''' - # if only a profile URL was supplied, default to download all uploads - if list_type is None: - list_type = 'uploads' - - video_id = '%s_%s' % (user_id, list_type) - - profile = self._download_webpage( - 'https://www.mixcloud.com/%s/' % user_id, video_id, - note='Downloading user profile', - errnote='Unable to download user profile') - - username = self._og_search_title(profile) - description = self._get_user_description(profile) - - entries = OnDemandPagedList( - functools.partial( - self._tracks_page_func, - '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type), - self._PAGE_SIZE) - - return self.playlist_result( - entries, video_id, '%s (%s)' % (username, list_type), description) + def _get_playlist_title(self, title, slug): + return '%s (%s)' 
% (title, slug) class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): @@ -313,87 +332,20 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): IE_NAME = 'mixcloud:playlist' _TESTS = [{ - 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/', - 'info_dict': { - 'id': 'RedBullThre3style_tokyo-finalists-2015', - 'title': 'National Champions 2015', - 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3', - }, - 'playlist_mincount': 16, - }, { 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user') - playlist_id = mobj.group('playlist') - video_id = '%s_%s' % (user_id, playlist_id) - - webpage = self._download_webpage( - url, user_id, - note='Downloading playlist page', - errnote='Unable to download playlist page') - - title = self._html_search_regex( - r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)', - webpage, 'playlist title', - default=None) or self._og_search_title(webpage, fatal=False) - description = self._get_user_description(webpage) - - entries = OnDemandPagedList( - functools.partial( - self._tracks_page_func, - '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'), - self._PAGE_SIZE) - - return self.playlist_result(entries, video_id, title, description) - - -class MixcloudStreamIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$' - IE_NAME = 'mixcloud:stream' - - _TEST = { - 'url': 'https://www.mixcloud.com/FirstEar/stream/', 'info_dict': { - 'id': 'FirstEar', - 'title': 'First Ear', - 'description': 'Curators of good music\nfirstearmusic.com', + 'id': 'maxvibes_jazzcat-on-ness-radio', + 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 192, - } + 'playlist_mincount': 59, + }] + _TITLE_KEY = 'name' + _DESCRIPTION_KEY = 'description' + _ROOT_TYPE = 'playlist' + _NODE_TEMPLATE = '''cloudcast { + 
slug + url + }''' - def _real_extract(self, url): - user_id = self._match_id(url) - - webpage = self._download_webpage(url, user_id) - - entries = [] - prev_page_url = None - - def _handle_page(page): - entries.extend(self._find_urls_in_page(page)) - return self._search_regex( - r'm-next-page-url="([^"]+)"', page, - 'next page URL', default=None) - - next_page_url = _handle_page(webpage) - - for idx in itertools.count(0): - if not next_page_url or prev_page_url == next_page_url: - break - - prev_page_url = next_page_url - current_page = int(self._search_regex( - r'\?page=(\d+)', next_page_url, 'next page number')) - - next_page_url = _handle_page(self._fetch_tracks_page( - '%s/stream' % user_id, user_id, 'stream', idx, - real_page_number=current_page)) - - username = self._og_search_title(webpage) - description = self._get_user_description(webpage) - - return self.playlist_result(entries, user_id, username, description) + def _get_cloudcast(self, node): + return node.get('cloudcast') or {} From d4f53af482cc47b0473a3576da7ad902bea4ac39 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 23:14:26 +0100 Subject: [PATCH 099/154] [lnkgo] fix extraction(closes #16834) --- youtube_dl/extractor/lnkgo.py | 100 ++++++++++++---------------------- 1 file changed, 36 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py index cfec0d3d0..3e71852aa 100644 --- a/youtube_dl/extractor/lnkgo.py +++ b/youtube_dl/extractor/lnkgo.py @@ -5,24 +5,27 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + compat_str, int_or_none, - unified_strdate, + parse_iso8601, ) class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnkgo\.(?:alfa\.)?lt/visi-video/(?P<show>[^/]+)/ziurek-(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?' 
_TESTS = [{ - 'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162', + 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', 'info_dict': { - 'id': '46712', + 'id': '10809', 'ext': 'mp4', - 'title': 'Yra kaip yra', - 'upload_date': '20150107', - 'description': 'md5:d82a5e36b775b7048617f263a0e3475e', - 'age_limit': 7, - 'duration': 3019, - 'thumbnail': r're:^https?://.*\.jpg$' + 'title': "Put'ka: Trys Klausimai", + 'upload_date': '20161216', + 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', + 'age_limit': 18, + 'duration': 117, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1481904000, }, 'params': { 'skip_download': True, # HLS download @@ -30,20 +33,21 @@ class LnkGoIE(InfoExtractor): }, { 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', 'info_dict': { - 'id': '47289', + 'id': '10467', 'ext': 'mp4', 'title': 'Nėrdas: Kompiuterio Valymas', 'upload_date': '20150113', 'description': 'md5:7352d113a242a808676ff17e69db6a69', 'age_limit': 18, 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1421164800, }, 'params': { 'skip_download': True, # HLS download }, }, { - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', + 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', 'only_matching': True, }] _AGE_LIMITS = { @@ -51,66 +55,34 @@ class LnkGoIE(InfoExtractor): 'N-14': 14, 'S': 18, } + _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' def _real_extract(self, url): - display_id = self._match_id(url) + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - url, display_id, 'Downloading player webpage') - - video_id = self._search_regex( - r'data-ep="([^"]+)"', webpage, 'video ID') - title = self._og_search_title(webpage) - description = 
self._og_search_description(webpage) - upload_date = unified_strdate(self._search_regex( - r'class="[^"]*meta-item[^"]*air-time[^"]*">.*?<strong>([^<]+)</strong>', webpage, 'upload date', fatal=False)) - - thumbnail_w = int_or_none( - self._og_search_property('image:width', webpage, 'thumbnail width', fatal=False)) - thumbnail_h = int_or_none( - self._og_search_property('image:height', webpage, 'thumbnail height', fatal=False)) - thumbnail = { - 'url': self._og_search_thumbnail(webpage), - } - if thumbnail_w and thumbnail_h: - thumbnail.update({ - 'width': thumbnail_w, - 'height': thumbnail_h, - }) - - config = self._parse_json(self._search_regex( - r'episodePlayer\((\{.*?\}),\s*\{', webpage, 'sources'), video_id) - - if config.get('pGeo'): - self.report_warning( - 'This content might not be available in your country due to copyright reasons') - - formats = [{ - 'format_id': 'hls', - 'ext': 'mp4', - 'url': config['EpisodeVideoLink_HLS'], - }] - - m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$', config['EpisodeVideoLink']) - if m: - formats.append({ - 'format_id': 'rtmp', - 'ext': 'flv', - 'url': m.group('url'), - 'play_path': m.group('play_path'), - 'page_url': url, - }) + video_info = self._download_json( + 'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'), + display_id)['videoConfig']['videoInfo'] + video_id = compat_str(video_info['id']) + title = video_info['title'] + prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' + formats = self._extract_m3u8_formats( + self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), + video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) + poster_image = video_info.get('posterImage') + return { 'id': video_id, 'display_id': display_id, 'title': title, 'formats': formats, - 'thumbnails': [thumbnail], - 'duration': int_or_none(config.get('VideoTime')), - 'description': description, - 'age_limit': 
self._AGE_LIMITS.get(config.get('PGRating'), 0), - 'upload_date': upload_date, + 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None, + 'duration': int_or_none(video_info.get('duration')), + 'description': clean_html(video_info.get('htmlDescription')), + 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), + 'timestamp': parse_iso8601(video_info.get('airDate')), + 'view_count': int_or_none(video_info.get('viewsCount')), } From 0b16b3c2d35d1706ec5c55e5b06352c753127368 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 9 Nov 2019 09:22:24 +0100 Subject: [PATCH 100/154] [twitch] add support for Clip embed URLs --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ca7676fe2..a5681409c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -644,7 +644,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -667,6 +667,9 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', 'only_matching': True, + }, { + 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', + 'only_matching': True, }] def _real_extract(self, url): From 18ca61c5e153d1c1cb8b9a2de3c8b9dfdaa69b0e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 9 Nov 2019 09:23:20 +0100 Subject: [PATCH 101/154] [twitter] improve extraction - add support for generic embeds(closes #22168) - always extract http formats for native 
videos(closes #14934) - add support for Twitter Broadcasts(closes #21369) - extract more metadata - improve VMap format extraction - unify extraction code for both twitter statuses and cards --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/periscope.py | 80 ++-- youtube_dl/extractor/twitter.py | 570 +++++++++++++++-------------- 3 files changed, 344 insertions(+), 307 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f9ba6893..598006061 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1241,6 +1241,7 @@ from .twitter import ( TwitterCardIE, TwitterIE, TwitterAmplifyIE, + TwitterBroadcastIE, ) from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b337a56c0..c02e34aba 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -17,12 +17,54 @@ class PeriscopeBaseIE(InfoExtractor): 'https://api.periscope.tv/api/v2/%s' % method, item_id, query=query) + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast['status'] + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + is_live = broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), 
int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return m3u8_formats + class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' - # Alive example URLs can be found here http://onperiscope.com/ + # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', @@ -61,21 +103,9 @@ class PeriscopeIE(PeriscopeBaseIE): 'accessVideoPublic', {'broadcast_id': token}, token) broadcast = stream['broadcast'] - title = broadcast['status'] + info = self._parse_broadcast_data(broadcast, token) - uploader = broadcast.get('user_display_name') or broadcast.get('username') - uploader_id = (broadcast.get('user_id') or broadcast.get('username')) - - title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() - if state == 'running': - title = self._live_title(title) - timestamp = parse_iso8601(broadcast.get('created_at')) - - thumbnails = [{ - 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - width = int_or_none(broadcast.get('width')) height = int_or_none(broadcast.get('height')) @@ -92,32 +122,20 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) 
if format_id != 'rtmp': - m3u8_formats = self._extract_m3u8_formats( - video_url, token, 'mp4', - entry_protocol='m3u8_native' - if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False) - if len(m3u8_formats) == 1: - add_width_and_height(m3u8_formats[0]) + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) formats.extend(m3u8_formats) continue rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } - add_width_and_height(rtmp_format) + self._add_width_and_height(rtmp_format) formats.append(rtmp_format) self._sort_formats(formats) - return { - 'id': broadcast.get('id') or token, - 'title': title, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnails': thumbnails, - 'formats': formats, - } + info['formats'] = formats + return info class PeriscopeUserIE(PeriscopeBaseIE): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index cebb6238c..5f8d90fb4 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,32 +4,67 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( - determine_ext, dict_get, ExtractorError, float_or_none, int_or_none, - remove_end, try_get, + strip_or_none, + unified_timestamp, + update_url_query, xpath_text, ) -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeBaseIE, + PeriscopeIE, +) class TwitterBaseIE(InfoExtractor): + _API_BASE = 'https://api.twitter.com/1.1/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _GUEST_TOKEN = None + + def _extract_variant_formats(self, variant, video_id): + variant_url = variant.get('url') + if not variant_url: + return [] + elif '.m3u8' in 
variant_url: + return self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + else: + tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None + f = { + 'url': variant_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + } + self._search_dimensions_in_video_url(f, variant_url) + return [f] + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() - if determine_ext(video_url) == 'm3u8': - return self._extract_m3u8_formats( - video_url, video_id, ext='mp4', m3u8_id='hls', - entry_protocol='m3u8_native') - return [{ - 'url': video_url, - }] + formats = [] + urls = [] + for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): + video_variant.attrib['url'] = compat_urllib_parse_unquote( + video_variant.attrib['url']) + urls.append(video_variant.attrib['url']) + formats.extend(self._extract_variant_formats( + video_variant.attrib, video_id)) + video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) + if video_url not in urls: + formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) + return formats @staticmethod def _search_dimensions_in_video_url(a_format, video_url): @@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor): 'height': int(m.group('height')), }) + def _call_api(self, path, video_id, query={}): + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + } + if not self._GUEST_TOKEN: + self._GUEST_TOKEN = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = self._GUEST_TOKEN + try: + return self._download_json( + self._API_BASE + path, video_id, 
headers=headers, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), + video_id)['errors'][0]['message'], expected=True) + raise -class TwitterCardIE(TwitterBaseIE): + +class TwitterCardIE(InfoExtractor): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", + 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.033, + 'timestamp': 1422366112, + 'upload_date': '20150127', }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', - 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'md5': '7137eca597f72b9abbe61e5ae0161399', 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*$', + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. 
https://t.co/BJYgOjSeGA", + 'uploader': 'NASA', + 'uploader_id': 'NASA', + 'timestamp': 1437408129, + 'upload_date': '20150720', }, }, { @@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Ubuntu 11.10 Overview', 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', - 'uploader': 'OMG! Ubuntu!', + 'uploader': 'OMG! UBUNTU!', 'uploader_id': 'omgubuntu', }, 'add_ie': ['Youtube'], @@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", + 'uploader': 'Brent Yarina', + 'uploader_id': 'BTNBrentYarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, + 'skip': 'This content is no longer available.', }, { 'url': 'https://twitter.com/i/videos/752274308186120192', 'only_matching': True, }, ] - _API_BASE = 'https://api.twitter.com/1.1' - - def _parse_media_info(self, media_info, video_id): - formats = [] - for media_variant in media_info.get('variants', []): - media_url = media_variant['url'] - if media_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) - elif media_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) - else: - tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) - a_format = { - 'url': media_url, - 'format_id': 'http-%d' % tbr if tbr else 'http', - 'tbr': tbr, - } - # Reported bitRate may be zero - if not a_format['tbr']: - del a_format['tbr'] - - self._search_dimensions_in_video_url(a_format, media_url) - - formats.append(a_format) - return formats - - def _extract_mobile_formats(self, username, video_id): - 
webpage = self._download_webpage( - 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), - video_id, 'Downloading mobile webpage', - headers={ - # A recent mobile UA is necessary for `gt` cookie - 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', - }) - main_script_url = self._html_search_regex( - r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') - main_script = self._download_webpage( - main_script_url, video_id, 'Downloading main script') - bearer_token = self._search_regex( - r'BEARER_TOKEN\s*:\s*"([^"]+)"', - main_script, 'bearer token') - # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id - api_data = self._download_json( - '%s/statuses/show/%s.json' % (self._API_BASE, video_id), - video_id, 'Downloading API data', - headers={ - 'Authorization': 'Bearer ' + bearer_token, - }) - media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} - return self._parse_media_info(media_info, video_id) - def _real_extract(self, url): - path, video_id = re.search(self._VALID_URL, url).groups() - - config = None - formats = [] - duration = None - - urls = [url] - if path.startswith('cards/'): - urls.append('https://twitter.com/i/videos/' + video_id) - - for u in urls: - webpage = self._download_webpage( - u, video_id, headers={'Referer': 'https://twitter.com/'}) - - iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) - - config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, - 'data player config', default='{}'), - video_id) - - if config.get('source_type') == 'vine': - return self.url_result(config['player_url'], 'Vine') - - periscope_url = PeriscopeIE._extract_url(webpage) - if periscope_url: - 
return self.url_result(periscope_url, PeriscopeIE.ie_key()) - - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') - - if video_url: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) - else: - f = { - 'url': video_url, - } - - self._search_dimensions_in_video_url(f, video_url) - - formats.append(f) - - vmap_url = config.get('vmapUrl') or config.get('vmap_url') - if vmap_url: - formats.extend( - self._extract_formats_from_vmap_url(vmap_url, video_id)) - - media_info = None - - for entity in config.get('status', {}).get('entities', []): - if 'mediaInfo' in entity: - media_info = entity['mediaInfo'] - - if media_info: - formats.extend(self._parse_media_info(media_info, video_id)) - duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) - - username = config.get('user', {}).get('screen_name') - if username: - formats.extend(self._extract_mobile_formats(username, video_id)) - - if formats: - title = self._search_regex(r'<title>([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - break - - if not formats: - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - 'Referer': url, - } - ct0 = self._get_cookies(url).get('ct0') - if ct0: - headers['csrf_token'] = ct0.value - guest_token = self._download_json( - '%s/guest/activate.json' % self._API_BASE, video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = guest_token - self._set_cookie('api.twitter.com', 'gt', guest_token) - config = self._download_json( - '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), - video_id, headers=headers) - track = config['track'] - vmap_url = track.get('vmapUrl') - 
if vmap_url: - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) - else: - playback_url = track['playbackUrl'] - if determine_ext(playback_url) == 'm3u8': - formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - else: - formats = [{ - 'url': playback_url, - }] - title = 'Twitter web player' - thumbnail = config.get('posterImage') - duration = float_or_none(track.get('durationMs'), scale=1000) - - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + status_id = self._match_id(url) + return self.url_result( + 'https://twitter.com/statuses/' + status_id, + TwitterIE.ie_key(), status_id) -class TwitterIE(InfoExtractor): +class TwitterIE(TwitterBaseIE): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' - _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P\d+)' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -291,10 +195,13 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'description': 'FTN supporters on Hollywood Blvd today! 
http://t.co/c7jHH749xJ', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', 'duration': 12.922, + 'timestamp': 1442188653, + 'upload_date': '20150913', + 'age_limit': 18, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -316,19 +223,23 @@ class TwitterIE(InfoExtractor): 'id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', - 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', 'uploader': 'Star Wars', + 'timestamp': 1447395772, + 'upload_date': '20151113', }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', - 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", 'uploader_id': 'BTNBrentYarina', 'uploader': 'Brent Yarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 @@ -340,12 +251,14 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'JG', - 'uploader_id': 'jaydingeer', + 'uploader': 'Simon Vertugo', + 'uploader_id': 'simonvertugo', 'duration': 30.0, + 'timestamp': 1455777459, + 'upload_date': '20160218', }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -353,10 +266,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'Vince Mancini - Vine of the day', - 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'Vince Mancini', - 'uploader_id': 'Filmdrunk', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', }, @@ -367,21 +279,22 @@ class TwitterIE(InfoExtractor): 'id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', - 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', - 'uploader_id': 'captainamerica', + 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. 
https://t.co/GpgYi9xMJI', + 'uploader_id': 'CaptainAmerica', 'uploader': 'Captain America', 'duration': 3.17, + 'timestamp': 1460483005, + 'upload_date': '20160412', }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'info_dict': { 'id': '1zqKVVlkqLaKB', 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', - 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', 'upload_date': '20160923', - 'uploader_id': 'OPP_HSD', - 'uploader': 'Sgt Kerry Schmidt', + 'uploader_id': '1PmKqpJdOJQoY', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, }, 'add_ie': ['Periscope'], @@ -392,10 +305,12 @@ class TwitterIE(InfoExtractor): 'id': '852138619213144067', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', - 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', 'duration': 277.4, + 'timestamp': 1492000653, + 'upload_date': '20170412', }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -404,10 +319,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. 
Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo', 'uploader': 'Préfet de Guadeloupe', 'uploader_id': 'Prefet971', 'duration': 47.48, + 'timestamp': 1505803395, + 'upload_date': '20170919', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -420,10 +337,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', 'uploader': 'Lis Power', 'uploader_id': 'LisPower1', 'duration': 111.278, + 'timestamp': 1527623489, + 'upload_date': '20180529', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -435,88 +354,163 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:66d493500c013e3e2d434195746a7f78', + 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', 'uploader': 'Twitter', 'uploader_id': 'Twitter', 'duration': 61.567, + 'timestamp': 1548184644, + 'upload_date': '20190122', }, + }, { + # not available in Periscope + 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', + 'info_dict': { + 'id': '1vOGwqejwoWxB', + 'ext': 'mp4', + 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', + 'uploader': 'Vivi', + 'uploader_id': '1eVjYOLGkGrQL', + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Twitch Clip Embed + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - twid = mobj.group('id') - - webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_STATUSES_URL % twid, twid) - - if 'twitter.com/account/suspended' in urlh.geturl(): - raise ExtractorError('Account suspended by Twitter.', expected=True) - - user_id = None - - redirect_mobj = re.match(self._VALID_URL, urlh.geturl()) - if redirect_mobj: - user_id = redirect_mobj.group('user_id') - - if not user_id: - user_id = mobj.group('user_id') - - username = remove_end(self._og_search_title(webpage), ' on Twitter') - - title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') + twid = self._match_id(url) + status = self._call_api( + 'statuses/show/%s.json' % twid, twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }) + title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) + user = status.get('user') or {} + uploader = user.get('name') + if uploader: + 
title = '%s - %s' % (uploader, title) + uploader_id = user.get('screen_name') + + tags = [] + for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): + hashtag_text = hashtag.get('text') + if not hashtag_text: + continue + tags.append(hashtag_text) info = { - 'uploader_id': user_id, - 'uploader': username, - 'webpage_url': url, - 'description': '%s on Twitter: "%s"' % (username, description), - 'title': username + ' - ' + title, + 'id': twid, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'like_count': int_or_none(status.get('favorite_count')), + 'repost_count': int_or_none(status.get('retweet_count')), + 'comment_count': int_or_none(status.get('reply_count')), + 'age_limit': 18 if status.get('possibly_sensitive') else 0, + 'tags': tags, } - mobj = re.search(r'''(?x) - ]+class="animated-gif"(?P[^>]+)>\s* - ]+video-src="(?P[^"]+)" - ''', webpage) + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + video_info = media.get('video_info') or {} + + formats = [] + for variant in video_info.get('variants', []): + formats.extend(self._extract_variant_formats(variant, twid)) + self._sort_formats(formats) + + thumbnails = [] + media_url = media.get('media_url_https') or media.get('media_url') + if media_url: + def add_thumbnail(name, size): + thumbnails.append({ + 'id': name, + 'url': update_url_query(media_url, {'name': name}), + 'width': int_or_none(size.get('w') or size.get('width')), + 'height': int_or_none(size.get('h') or size.get('height')), + }) + for name, size in media.get('sizes', {}).items(): + add_thumbnail(name, size) + add_thumbnail('orig', media.get('original_info') or {}) - if mobj: - more_info = mobj.group('more_info') - height = int_or_none(self._search_regex( - 
r'data-height="(\d+)"', more_info, 'height', fatal=False)) - width = int_or_none(self._search_regex( - r'data-width="(\d+)"', more_info, 'width', fatal=False)) - thumbnail = self._search_regex( - r'poster="([^"]+)"', more_info, 'poster', fatal=False) info.update({ - 'id': twid, - 'url': mobj.group('url'), - 'height': height, - 'width': width, - 'thumbnail': thumbnail, + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) - return info - - twitter_card_url = None - if 'class="PlayableMedia' in webpage: - twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) else: - twitter_card_iframe_url = self._search_regex( - r'data-full-card-iframe-url=([\'"])(?P(?:(?!\1).)+)\1', - webpage, 'Twitter card iframe URL', default=None, group='url') - if twitter_card_iframe_url: - twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) + card = status.get('card') + if card: + binding_values = card['binding_values'] - if twitter_card_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': twitter_card_url, - }) - return info + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) - raise ExtractorError('There\'s no video in this tweet.') + card_name = card['name'].split(':')[-1] + if card_name == 'amplify': + formats = self._extract_formats_from_vmap_url( + get_binding_value('amplify_url_vmap'), + get_binding_value('amplify_content_id') or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': 
int_or_none(image.get('height')), + }) + + info.update({ + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + }) + elif card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + else: + raise ExtractorError('Unsupported Twitter Card.') + else: + expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) + if not expanded_url: + raise ExtractorError("There's no video in this tweet.") + info.update({ + '_type': 'url', + 'url': expanded_url, + }) + return info class TwitterAmplifyIE(TwitterBaseIE): @@ -573,3 +567,27 @@ class TwitterAmplifyIE(TwitterBaseIE): 'formats': formats, 'thumbnails': thumbnails, } + + +class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): + IE_NAME = 'twitter:broadcast' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P[0-9a-zA-Z]{13})' + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + broadcast = self._call_api( + 'broadcasts/show.json', broadcast_id, + {'ids': broadcast_id})['broadcasts'][broadcast_id] + info = self._parse_broadcast_data(broadcast, broadcast_id) + media_key = broadcast['media_key'] + source = self._call_api( + 'live_video_stream/status/' + media_key, media_key)['source'] + m3u8_url = source.get('noRedirectPlaybackUrl') or source['location'] + if '/live_video_stream/geoblocked/' in m3u8_url: + self.raise_geo_restricted() + m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( + m3u8_url).query).get('type', [None])[0] + state, width, height = self._extract_common_format_info(broadcast) + info['formats'] = 
self._extract_pscp_m3u8_formats( + m3u8_url, broadcast_id, m3u8_id, state, width, height) + return info From ce112a8c19ebcc9d401ff26a5cdcf58ba565901c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 11:01:07 +0100 Subject: [PATCH 102/154] [twitch] fix video comments URL(#18593)(closes #15828) --- youtube_dl/extractor/twitch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a5681409c..8c0d70010 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -344,9 +344,8 @@ class TwitchVodIE(TwitchItemBaseIE): info['subtitles'] = { 'rechat': [{ 'url': update_url_query( - 'https://rechat.twitch.tv/rechat-messages', { - 'video_id': 'v%s' % item_id, - 'start': info['timestamp'], + 'https://api.twitch.tv/v5/videos/%s/comments' % item_id, { + 'client_id': self._CLIENT_ID, }), 'ext': 'json', }], From f81dd65ba2c1e7be549e5c8cfe6cbf0f0829edfe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 13:11:59 +0100 Subject: [PATCH 103/154] [extractor/common] clean jwplayer description HTML tags --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4a683f6d6..4c2f9303e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2689,7 +2689,7 @@ class InfoExtractor(object): entry = { 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), + 'description': clean_html(video_data.get('description')), 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), From 8fbf5d2f87fbfe0441bc20cf69d506109b2810bc Mon Sep 17 00:00:00 2001 From: Remita 
Amine Date: Sat, 9 Nov 2019 13:14:23 +0100 Subject: [PATCH 104/154] [seeker] remove Revision3 extractors and fix extraction --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/revision3.py | 170 ----------------------------- youtube_dl/extractor/seeker.py | 45 ++++---- 3 files changed, 23 insertions(+), 196 deletions(-) delete mode 100644 youtube_dl/extractor/revision3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 598006061..8df9d95b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -932,10 +932,6 @@ from .rentv import ( from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE -from .revision3 import ( - Revision3EmbedIE, - Revision3IE, -) from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py deleted file mode 100644 index 833d8a2f0..000000000 --- a/youtube_dl/extractor/revision3.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_iso8601, - unescapeHTML, - qualities, -) - - -class Revision3EmbedIE(InfoExtractor): - IE_NAME = 'revision3:embed' - _VALID_URL = r'(?:revision3:(?:(?P[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P\d+)' - _TEST = { - 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', - 'info_dict': { - 'id': '67558', - 'ext': 'mp4', - 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader_id': 'dnews', - 'uploader': 'DNews', - } - } - _API_KEY = 
'ba9c741bce1b9d8e3defcc22193f3651b8867e62' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('playlist_id') - playlist_type = mobj.group('playlist_type') or 'video_id' - video_data = self._download_json( - 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ - 'api_key': self._API_KEY, - 'codecs': 'h264,vp8,theora', - playlist_type: playlist_id, - })['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], playlist_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) - - return { - 'id': playlist_id, - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, - } - - -class Revision3IE(InfoExtractor): - IE_NAME = 'revision' - _VALID_URL = r'https?://(?:www\.)?(?P(?:revision3|animalist)\.com)/(?P[^/]+(?:/[^/?#]+)?)' - _TESTS = [{ - 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', - 'md5': 'd94a72d85d0a829766de4deb8daaf7df', - 'info_dict': { - 'id': '71089', - 'display_id': 'technobuffalo/5-google-predictions-for-2016', - 'ext': 'webm', - 'title': '5 Google Predictions for 2016', - 'description': 'Google had a great 2015, but it\'s already time to look ahead. 
Here are our five predictions for 2016.', - 'upload_date': '20151228', - 'timestamp': 1451325600, - 'duration': 187, - 'uploader': 'TechnoBuffalo', - 'uploader_id': 'technobuffalo', - } - }, { - # Show - 'url': 'http://revision3.com/variant', - 'only_matching': True, - }, { - # Tag - 'url': 'http://revision3.com/vr', - 'only_matching': True, - }] - _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - - def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[0] - page_info = self._download_json( - self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) - - page_data = page_info['data'] - page_type = page_data['type'] - if page_type in ('episode', 'embed'): - show_data = page_data['show']['data'] - page_id = compat_str(page_data['id']) - video_id = compat_str(page_data['video']['data']['id']) - - preference = qualities(['mini', 'small', 'medium', 'large']) - thumbnails = [{ - 'url': image_url, - 'id': image_id, - 'preference': preference(image_id) - } for image_id, image_url in page_data.get('images', {}).items()] - - info = { - 'id': page_id, - 'display_id': display_id, - 'title': unescapeHTML(page_data['name']), - 'description': unescapeHTML(page_data.get('summary')), - 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), - 'author': page_data.get('author'), - 'uploader': show_data.get('name'), - 'uploader_id': show_data.get('slug'), - 'thumbnails': thumbnails, - 'extractor_key': site, - } - - if page_type == 'embed': - info.update({ - '_type': 'url_transparent', - 'url': page_data['video']['data']['embed'], - }) - return info - - info.update({ - '_type': 'url_transparent', - 'url': 'revision3:%s' % video_id, - }) - return info - else: - list_data = page_info[page_type]['data'] - episodes_data = page_info['episodes']['data'] - num_episodes = page_info['meta']['totalEpisodes'] - processed_episodes = 0 - entries = [] - page_num = 1 - while True: - entries.extend([{ - 
'_type': 'url', - 'url': 'http://%s%s' % (domain, episode['path']), - 'id': compat_str(episode['id']), - 'ie_key': 'Revision3', - 'extractor_key': site, - } for episode in episodes_data]) - processed_episodes += len(episodes_data) - if processed_episodes == num_episodes: - break - page_num += 1 - episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % ( - domain, display_id + '/' + compat_str(page_num), domain), - display_id)['episodes']['data'] - - return self.playlist_result( - entries, compat_str(list_data['id']), - list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py index 3b9c65e7e..7872dc80d 100644 --- a/youtube_dl/extractor/seeker.py +++ b/youtube_dl/extractor/seeker.py @@ -4,34 +4,37 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) class SeekerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' _TESTS = [{ - # player.loadRevision3Item 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', - 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'md5': '897d44bbe0d8986a2ead96de565a92db', 'info_dict': { - 'id': '76243', - 'ext': 'webm', + 'id': 'Elrn3gnY', + 'ext': 'mp4', 'title': 'Should Trump Be Required To Release His Tax Returns?', - 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. 
So what can we learn if he decides to release them?', - 'uploader': 'Seeker Daily', - 'uploader_id': 'seekerdaily', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', } }, { 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', 'playlist': [ { - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'md5': '0497b9f20495174be73ae136949707d2', 'info_dict': { - 'id': '67558', + 'id': 'FihYQ8AE', 'ext': 'mp4', 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader': 'DNews', - 'uploader_id': 'dnews', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', }, } ], @@ -45,13 +48,11 @@ class SeekerIE(InfoExtractor): def _real_extract(self, url): display_id, article_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) - if mobj: - playlist_type, playlist_id = mobj.groups() - return self.url_result( - 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) - else: - entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( - r']+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] - return self.playlist_result( - entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage)) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or 
self._og_search_description(webpage)) From 20baa17c0180c7254644abea968792abcf0743cb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 16:00:12 +0100 Subject: [PATCH 105/154] [daisuki] remove extractor --- youtube_dl/extractor/daisuki.py | 154 ----------------------------- youtube_dl/extractor/extractors.py | 4 - 2 files changed, 158 deletions(-) delete mode 100644 youtube_dl/extractor/daisuki.py diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py deleted file mode 100644 index dbc1aa5d4..000000000 --- a/youtube_dl/extractor/daisuki.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import unicode_literals - -import base64 -import json -import random -import re - -from .common import InfoExtractor -from ..aes import ( - aes_cbc_decrypt, - aes_cbc_encrypt, -) -from ..compat import compat_b64decode -from ..utils import ( - bytes_to_intlist, - bytes_to_long, - extract_attributes, - ExtractorError, - intlist_to_bytes, - js_to_json, - int_or_none, - long_to_bytes, - pkcs1pad, -) - - -class DaisukiMottoIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P[0-9a-zA-Z]{3})' - - _TEST = { - 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428', - 'info_dict': { - 'id': 'V2e', - 'ext': 'mp4', - 'title': '#117 SHOWDOWN OF LOVE! 
ANDROIDS VS UNIVERSE 2!!', - 'subtitles': { - 'mul': [{ - 'ext': 'ttml', - }], - }, - }, - 'params': { - 'skip_download': True, # AES-encrypted HLS stream - }, - } - - # The public key in PEM format can be found in clientlibs_anime_watch.min.js - _RSA_KEY = (0xc5524c25e8e14b366b3754940beeb6f96cb7e2feef0b932c7659a0c5c3bf173d602464c2df73d693b513ae06ff1be8f367529ab30bf969c5640522181f2a0c51ea546ae120d3d8d908595e4eff765b389cde080a1ef7f1bbfb07411cc568db73b7f521cedf270cbfbe0ddbc29b1ac9d0f2d8f4359098caffee6d07915020077d, 65537) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - flashvars = self._parse_json(self._search_regex( - r'(?s)var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), - video_id, transform_source=js_to_json) - - iv = [0] * 16 - - data = {} - for key in ('device_cd', 'mv_id', 'ss1_prm', 'ss2_prm', 'ss3_prm', 'ss_id'): - data[key] = flashvars.get(key, '') - - encrypted_rtn = None - - # Some AES keys are rejected. 
Try it with different AES keys - for idx in range(5): - aes_key = [random.randint(0, 254) for _ in range(32)] - padded_aeskey = intlist_to_bytes(pkcs1pad(aes_key, 128)) - - n, e = self._RSA_KEY - encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) - init_data = self._download_json( - 'http://motto.daisuki.net/fastAPI/bgn/init/', - video_id, query={ - 's': flashvars.get('s', ''), - 'c': flashvars.get('ss3_prm', ''), - 'e': url, - 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( - bytes_to_intlist(json.dumps(data)), - aes_key, iv))).decode('ascii'), - 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), - }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) - - if 'rtn' in init_data: - encrypted_rtn = init_data['rtn'] - break - - self._sleep(5, video_id) - - if encrypted_rtn is None: - raise ExtractorError('Failed to fetch init data') - - rtn = self._parse_json( - intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( - compat_b64decode(encrypted_rtn)), - aes_key, iv)).decode('utf-8').rstrip('\0'), - video_id) - - title = rtn['title_str'] - - formats = self._extract_m3u8_formats( - rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') - - subtitles = {} - caption_url = rtn.get('caption_url') - if caption_url: - # mul: multiple languages - subtitles['mul'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - } - - -class DaisukiMottoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/(?Pinformation)/' - - _TEST = { - 'url': 'http://motto.daisuki.net/information/', - 'info_dict': { - 'title': 'DRAGON BALL SUPER', - }, - 'playlist_mincount': 117, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [] - for li in re.findall(r'(]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage): - attr = 
extract_attributes(li) - ad_id = attr.get('data-ad_id') - product_id = attr.get('data-product_id') - if ad_id and product_id: - episode_id = attr.get('data-chapter') - entries.append({ - '_type': 'url_transparent', - 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id), - 'episode_id': episode_id, - 'episode_number': int_or_none(episode_id), - 'ie_key': 'DaisukiMotto', - }) - - return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8df9d95b1..e2ebe8f95 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -254,10 +254,6 @@ from .dailymotion import ( DailymotionPlaylistIE, DailymotionUserIE, ) -from .daisuki import ( - DaisukiMottoIE, - DaisukiMottoPlaylistIE, -) from .daum import ( DaumIE, DaumClipIE, From 88b87b08b1ed06940053ee018547de051bf8d986 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:01:21 +0100 Subject: [PATCH 106/154] [minhateca] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/minhateca.py | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/minhateca.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e2ebe8f95..dfd0ef198 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -625,7 +625,6 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) -from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py deleted file mode 100644 index dccc54249..000000000 --- a/youtube_dl/extractor/minhateca.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - 
-from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - parse_filesize, - sanitized_Request, - urlencode_postdata, -) - - -class MinhatecaIE(InfoExtractor): - _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P[0-9]+)\.' - _TEST = { - 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)', - 'info_dict': { - 'id': '125848331', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - 'thumbnail': r're:^https?://.*\.jpg$', - 'filesize_approx': 1530000, - 'duration': 9, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - token = self._html_search_regex( - r'(.*?)', webpage, 'title') - title, _, ext = title_str.rpartition('.') - filesize_approx = parse_filesize(self._html_search_regex( - r'

(.*?)

', - webpage, 'file size approximation', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)

.*?class="bold">(.*?)<', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'

([0-9]+)

', - webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'ext': ext, - 'filesize_approx': filesize_approx, - 'duration': duration, - 'view_count': view_count, - 'thumbnail': self._og_search_thumbnail(webpage), - } From 9e46d1f8aadd38f6de7c2b921b294e67ed2267eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:15:15 +0100 Subject: [PATCH 107/154] [addanime] remove extractor --- youtube_dl/extractor/addanime.py | 95 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 96 deletions(-) delete mode 100644 youtube_dl/extractor/addanime.py diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py deleted file mode 100644 index 5e7c0724e..000000000 --- a/youtube_dl/extractor/addanime.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - qualities, -) - - -class AddAnimeIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' - _TESTS = [{ - 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - 'md5': '72954ea10bc979ab5e2eb288b21425a0', - 'info_dict': { - 'id': '24MR3YO5SAS9', - 'ext': 'mp4', - 'description': 'One Piece 606', - 'title': 'One Piece 606', - }, - 'skip': 'Video is gone', - }, { - 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage(url, video_id) - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) or \ - ee.cause.code != 503: - raise - - redir_webpage = ee.cause.read().decode('utf-8') - action = self._search_regex( - r'
', - redir_webpage, 'redirect vc value') - av = re.search( - r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', - redir_webpage) - if av is None: - raise ExtractorError('Cannot find redirect math task') - av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) - - parsed_url = compat_urllib_parse_urlparse(url) - av_val = av_res + len(parsed_url.netloc) - confirm_url = ( - parsed_url.scheme + '://' + parsed_url.netloc - + action + '?' - + compat_urllib_parse_urlencode({ - 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) - self._download_webpage( - confirm_url, video_id, - note='Confirming after redirect') - webpage = self._download_webpage(url, video_id) - - FORMATS = ('normal', 'hq') - quality = qualities(FORMATS) - formats = [] - for format_id in FORMATS: - rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) - video_url = self._search_regex(rex, webpage, 'video file URLx', - fatal=False) - if not video_url: - continue - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'quality': quality(format_id), - }) - self._sort_formats(formats) - video_title = self._og_search_title(webpage) - video_description = self._og_search_description(webpage) - - return { - '_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': video_title, - 'description': video_description - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dfd0ef198..d96f0d284 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -18,7 +18,6 @@ from .acast import ( ACastIE, ACastChannelIE, ) -from .addanime import AddAnimeIE from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( From 433e0710585e2414697cff6d444204e1db950bd7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 10 Nov 2019 17:02:47 +0100 Subject: [PATCH 108/154] [facebook] fix posts video data extraction(closes #22473) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index c723726b7..ce64e2683 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -334,7 +334,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json( self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) From 2e9ad59a4d6dfd82b34a965cfc5b8c5a647d1598 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Nov 2019 09:53:04 +0100 Subject: [PATCH 109/154] [soundcloud] check if the soundtrack has downloads left(closes #23045) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 875b9d887..e8ffb2cbe 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -276,7 +276,7 @@ class SoundcloudIE(InfoExtractor): if secret_token: query['secret_token'] = secret_token - if info.get('downloadable'): + if info.get('downloadable') and info.get('has_downloads_left'): format_url = update_url_query( info.get('download_url') or track_base_url + '/download', query) format_urls.add(format_url) From 48970d5cc8838ac404a64462d175b248401e2bd2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Nov 2019 10:51:54 +0100 Subject: [PATCH 110/154] [teamcoco] add support for new videos(closes #23054) --- youtube_dl/extractor/teamcoco.py | 68 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git 
a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7640cf00a..5793b711f 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -84,6 +84,19 @@ class TeamcocoIE(TurnerBaseIE): 'only_matching': True, } ] + _RECORD_TEMPL = '''id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken''' def _graphql_call(self, query_template, object_type, object_id): find_object = 'find' + object_type @@ -98,36 +111,36 @@ class TeamcocoIE(TurnerBaseIE): display_id = self._match_id(url) response = self._graphql_call('''{ - %s(slug: "%s") { + %%s(slug: "%%s") { ... on RecordSlug { record { + %s + } + } + ... on PageSlug { + child { id - title - teaser - publishOn - thumb { - preview - } - file { - url - } - tags { - name - } - duration - turnerMediaId - turnerMediaAuthToken } } ... on NotFoundSlug { status } } -}''', 'Slug', display_id) +}''' % self._RECORD_TEMPL, 'Slug', display_id) if response.get('status'): raise ExtractorError('This video is no longer available.', expected=True) - record = response['record'] + child = response.get('child') + if child: + record = self._graphql_call('''{ + %%s(id: "%%s") { + ... 
on Video { + %s + } + } +}''' % self._RECORD_TEMPL, 'Record', child['id']) + else: + record = response['record'] video_id = record['id'] info = { @@ -150,25 +163,21 @@ class TeamcocoIE(TurnerBaseIE): 'accessTokenType': 'jws', })) else: - d = self._download_json( + video_sources = self._download_json( 'https://teamcoco.com/_truman/d/' + video_id, - video_id, fatal=False) or {} - video_sources = d.get('meta') or {} - if not video_sources: - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} + video_id)['meta']['src'] + if isinstance(video_sources, dict): + video_sources = video_sources.values() formats = [] get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): + for src in video_sources: if not isinstance(src, dict): continue src_url = src.get('src') if not src_url: continue + format_id = src.get('label') ext = determine_ext(src_url, mimetype2ext(src.get('type'))) if format_id == 'hls' or ext == 'm3u8': # compat_urllib_parse.urljoin does not work here @@ -190,9 +199,6 @@ class TeamcocoIE(TurnerBaseIE): 'format_id': format_id, 'quality': get_quality(format_id), }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) self._sort_formats(formats) info['formats'] = formats From eb22d1b55744b69d5ec3556529868acfba6c217f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Nov 2019 19:09:32 +0100 Subject: [PATCH 111/154] [nexx] Add support for Multi Player JS Setup(closes #23052) --- youtube_dl/extractor/nexx.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index f9aad83c4..586c1b7eb 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -108,7 +108,7 @@ class NexxIE(InfoExtractor): @staticmethod def _extract_domain_id(webpage): mobj = re.search( - 
r']+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P\d+)', + r']+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P\d+)', webpage) return mobj.group('id') if mobj else None @@ -123,7 +123,7 @@ class NexxIE(InfoExtractor): domain_id = NexxIE._extract_domain_id(webpage) if domain_id: for video_id in re.findall( - r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', + r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', webpage): entries.append( 'https://api.nexx.cloud/v3/%s/videos/byid/%s' @@ -410,8 +410,8 @@ class NexxIE(InfoExtractor): class NexxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P[^/?#&]+)' + _TESTS = [{ 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'md5': '16746bfc28c42049492385c989b26c4a', 'info_dict': { @@ -420,7 +420,6 @@ class NexxEmbedIE(InfoExtractor): 'title': 'Nervenkitzel Achterbahn', 'alt_title': 'Karussellbauer in Deutschland', 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2761, @@ -431,7 +430,10 @@ class NexxEmbedIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, - } + }, { + 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): From 5709d661a2509fab0c9f3412239ecbe7a621f45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Nov 2019 01:45:04 +0700 Subject: [PATCH 112/154] [drtv] Add support for new URL schema (closes #23059) --- youtube_dl/extractor/drtv.py | 57 ++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 218f10209..390e79f8c 100644 --- 
a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -17,6 +17,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_get, unified_timestamp, update_url_query, url_or_none, @@ -24,7 +25,14 @@ from ..utils import ( class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + ) + (?P[\da-z_-]+) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['DK'] IE_NAME = 'drtv' @@ -83,6 +91,26 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', + 'info_dict': { + 'id': '00951930010', + 'ext': 'mp4', + 'title': 'Bonderøven (1:8)', + 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', + 'timestamp': 1546542000, + 'upload_date': '20190103', + 'duration': 2576.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', + 'only_matching': True, + }, { + 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,13 +128,32 @@ class DRTVIE(InfoExtractor): webpage, 'video id', default=None) if not video_id: - video_id = compat_urllib_parse_unquote(self._search_regex( + video_id = self._search_regex( r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', - webpage, 'urn')) + webpage, 'urn', default=None) + if video_id: + video_id = compat_urllib_parse_unquote(video_id) + + _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' + query = {'expanded': 'true'} + + if video_id: + programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) + else: + programcard_url = _PROGRAMCARD_BASE + page = 
self._parse_json( + self._search_regex( + r'data\s*=\s*({.+?})\s*(?:;| Date: Thu, 14 Nov 2019 06:38:55 +0100 Subject: [PATCH 113/154] [comcarcoff] remove extractor --- youtube_dl/extractor/comcarcoff.py | 74 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 75 deletions(-) delete mode 100644 youtube_dl/extractor/comcarcoff.py diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py deleted file mode 100644 index 588aad0d9..000000000 --- a/youtube_dl/extractor/comcarcoff.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, -) - - -class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' - _TESTS = [{ - 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', - 'info_dict': { - 'id': '2494164', - 'ext': 'mp4', - 'upload_date': '20141127', - 'timestamp': 1417107600, - 'duration': 1232, - 'title': 'Happy Thanksgiving Miranda', - 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', - }, - 'params': { - 'skip_download': 'requires ffmpeg', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - if not display_id: - display_id = 'comediansincarsgettingcoffee.com' - webpage = self._download_webpage(url, display_id) - - full_data = self._parse_json( - self._search_regex( - r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), - display_id)['videoData'] - - display_id = full_data['activeVideo']['video'] - video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] - - video_id = compat_str(video_data['mediaId']) - title = video_data['title'] - formats = 
self._extract_m3u8_formats( - video_data['mediaUrl'], video_id, 'mp4') - self._sort_formats(formats) - - thumbnails = [{ - 'url': video_data['images']['thumb'], - }, { - 'url': video_data['images']['poster'], - }] - - timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( - video_data.get('pubDate')) - duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( - video_data.get('duration')) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': timestamp, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episode')), - 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d96f0d284..cf4bb8f20 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -222,7 +222,6 @@ from .comedycentral import ( ComedyCentralTVIE, ToshIE, ) -from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( MmsIE, From 656c20010f53851c1b01e839744f7fe48497c03f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Nov 2019 21:17:47 +0100 Subject: [PATCH 114/154] [ivi] fix format extraction(closes #21991) --- youtube_dl/extractor/ivi.py | 56 ++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 86c014b07..efdc3cc98 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -18,6 +18,8 @@ class IviIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] + 
_LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' + _LIGHT_URL = 'https://api.ivi.ru/light/' _TESTS = [ # Single movie @@ -78,25 +80,41 @@ class IviIE(InfoExtractor): 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + except ImportError: + raise ExtractorError('pycrypto not found. Please install it.', expected=True) + video_id = self._match_id(url) - data = { + timestamp = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode())['result'] + + data = json.dumps({ 'method': 'da.content.get', 'params': [ video_id, { - 'site': 's183', + 'site': 's353', 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, 'contentid': video_id } ] - } + }).encode() video_json = self._download_json( - 'http://api.digitalaccess.ru/api/json/', video_id, - 'Downloading video JSON', data=json.dumps(data)) + self._LIGHT_URL, video_id, + 'Downloading video JSON', data=data, query={ + 'ts': timestamp, + 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), + }) - if 'error' in video_json: - error = video_json['error'] + error = video_json.get('error') + if error: origin = error['origin'] if origin == 'NotAllowedForLocation': self.raise_geo_restricted( @@ -108,20 +126,24 @@ class IviIE(InfoExtractor): expected=True) result = video_json['result'] + title = result['title'] quality = qualities(self._KNOWN_FORMATS) - formats = [{ - 'url': x['url'], - 'format_id': x.get('content_format'), - 'quality': quality(x.get('content_format')), - } for x in result['files'] if x.get('url')] - + formats = [] + for f in result.get('files', []): + f_url = f.get('url') + content_format = f.get('content_format') + if not f_url or '-MDRM-' in content_format or '-FPS-' in content_format: + continue + formats.append({ + 'url': f_url, + 'format_id': content_format, 
+ 'quality': quality(content_format), + 'filesize': int_or_none(f.get('size_in_bytes')), + }) self._sort_formats(formats) - title = result['title'] - - duration = int_or_none(result.get('duration')) compilation = result.get('compilation') episode = title if compilation else None @@ -158,7 +180,7 @@ class IviIE(InfoExtractor): 'episode_number': episode_number, 'thumbnails': thumbnails, 'description': description, - 'duration': duration, + 'duration': int_or_none(result.get('duration')), 'formats': formats, } From 1bba88efc7e1f82095f7ae38348e56026db4bf3c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Nov 2019 23:46:31 +0100 Subject: [PATCH 115/154] [ivi] sign content request only when pycryptodome is available --- youtube_dl/extractor/ivi.py | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index efdc3cc98..1dcb17c9b 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -80,38 +80,42 @@ class IviIE(InfoExtractor): 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): - try: - from Crypto.Cipher import Blowfish - from Crypto.Hash import CMAC - except ImportError: - raise ExtractorError('pycrypto not found. 
Please install it.', expected=True) - video_id = self._match_id(url) - timestamp = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading timestamp JSON', data=json.dumps({ - 'method': 'da.timestamp.get', - 'params': [] - }).encode())['result'] - data = json.dumps({ 'method': 'da.content.get', 'params': [ video_id, { - 'site': 's353', + 'site': 's%d', 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, 'contentid': video_id } ] }).encode() - video_json = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading video JSON', data=data, query={ + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + + timestamp = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode())['result'] + + data = data % 353 + query = { 'ts': timestamp, 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), - }) + } + except ImportError: + data = data % 183 + query = {} + + video_json = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading video JSON', data=data, query=query) error = video_json.get('error') if error: @@ -121,6 +125,8 @@ class IviIE(InfoExtractor): msg=error['message'], countries=self._GEO_COUNTRIES) elif origin == 'NoRedisValidData': raise ExtractorError('Video %s does not exist' % video_id, expected=True) + elif origin == 'NotAllowedError': + raise ExtractorError('pycryptodome not found. 
Please install it.', expected=True) raise ExtractorError( 'Unable to download video %s: %s' % (video_id, error['message']), expected=True) From 7360c06facfd96ee603ad4fc27f5903d3f8f6694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Nov 2019 05:44:14 +0700 Subject: [PATCH 116/154] [extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use --- youtube_dl/extractor/common.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4c2f9303e..04d676378 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1455,14 +1455,14 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): + fatal=True, m3u8_id=None, data=None, headers={}, query={}): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if manifest is False: return [] @@ -1586,12 +1586,13 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, headers={}): + fatal=True, live=False, data=None, headers={}, + query={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal, headers=headers) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] @@ -2009,12 +2010,12 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal, headers=headers) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] mpd_doc, urlh = res @@ -2317,12 +2318,12 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', - 
fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] ism_doc, urlh = res From 6c79785bb0c96d6fc22d942946196f0842d70a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Nov 2019 07:47:23 +0700 Subject: [PATCH 117/154] [travis] Add python 3.8 build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 6d16c2955..14d95fa84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,12 @@ matrix: - python: 3.7 dist: xenial env: YTDL_TEST_SET=download + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=download - python: 3.8-dev dist: xenial env: YTDL_TEST_SET=core From 9e4e864639bf606a1931a684f130e219e869adfd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 16 Nov 2019 01:51:31 +0100 Subject: [PATCH 118/154] [ivi] improve error detection --- youtube_dl/extractor/ivi.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 1dcb17c9b..7f1146d95 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -119,17 +119,20 @@ class IviIE(InfoExtractor): error = video_json.get('error') if error: - origin = error['origin'] + origin = error.get('origin') + message = error.get('message') or error.get('user_message') + extractor_msg = 'Unable to download video %s' if origin == 'NotAllowedForLocation': - self.raise_geo_restricted( - msg=error['message'], countries=self._GEO_COUNTRIES) + self.raise_geo_restricted(message, self._GEO_COUNTRIES) elif origin == 'NoRedisValidData': - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - elif origin == 'NotAllowedError': - raise ExtractorError('pycryptodome not found. 
Please install it.', expected=True) - raise ExtractorError( - 'Unable to download video %s: %s' % (video_id, error['message']), - expected=True) + extractor_msg = 'Video %s does not exist' + elif message: + if 'недоступен для просмотра на площадке s183' in message: + raise ExtractorError( + 'pycryptodome not found. Please install it.', + expected=True) + extractor_msg += ': ' + message + raise ExtractorError(extractor_msg % video_id, expected=True) result = video_json['result'] title = result['title'] From 7e70620a342c57746812d4a8fae6f436bd90cf57 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 18 Nov 2019 12:51:25 +0100 Subject: [PATCH 119/154] [vk] fix wall audio thumbnails extraction(closes #23135) --- youtube_dl/extractor/vk.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 4c8ca4f41..195875938 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -634,14 +634,15 @@ class VKWallPostIE(VKBaseIE): if not a.url: continue title = unescapeHTML(a.title) + performer = unescapeHTML(a.performer) entries.append({ 'id': '%s_%s' % (a.owner_id, a.id), 'url': self._unmask_url(a.url, a.ads['vk_id']), - 'title': '%s - %s' % (a.performer, title) if a.performer else title, - 'thumbnail': a.cover_url.split(',') if a.cover_url else None, - 'duration': a.duration, + 'title': '%s - %s' % (performer, title) if performer else title, + 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, + 'duration': int_or_none(a.duration), 'uploader': uploader, - 'artist': a.performer, + 'artist': performer, 'track': title, 'ext': 'mp4', 'protocol': 'm3u8', From f9c4a4521068a02c583803ea422c6fedfa7598e3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 18 Nov 2019 21:40:53 +0100 Subject: [PATCH 120/154] [ntvru] add support for non relative file URLs(closes #23140) --- youtube_dl/extractor/ntvru.py | 49 +++++++++++++++++------------------ 1 
file changed, 24 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 4f9cedb84..c47d1dfa4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - clean_html, - xpath_text, int_or_none, + strip_or_none, + unescapeHTML, + xpath_text, ) @@ -47,10 +48,10 @@ class NTVRuIE(InfoExtractor): 'duration': 1496, }, }, { - 'url': 'http://www.ntv.ru/kino/Koma_film', - 'md5': 'f825770930937aa7e5aca0dc0d29319a', + 'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/', + 'md5': 'e9c7cde24d9d3eaed545911a04e6d4f4', 'info_dict': { - 'id': '1007609', + 'id': '1126480', 'ext': 'mp4', 'title': 'Остросюжетный фильм «Кома»', 'description': 'Остросюжетный фильм «Кома»', @@ -68,6 +69,10 @@ class NTVRuIE(InfoExtractor): 'thumbnail': r're:^http://.*\.jpg', 'duration': 2590, }, + }, { + # Schemeless file URL + 'url': 'https://www.ntv.ru/video/1797442', + 'only_matching': True, }] _VIDEO_ID_REGEXES = [ @@ -96,37 +101,31 @@ class NTVRuIE(InfoExtractor): 'http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML') - title = clean_html(xpath_text(player, './data/title', 'title', fatal=True)) - description = clean_html(xpath_text(player, './data/description', 'description')) + title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True))) video = player.find('./data/video') - video_id = xpath_text(video, './id', 'video id') - thumbnail = xpath_text(video, './splash', 'thumbnail') - duration = int_or_none(xpath_text(video, './totaltime', 'duration')) - view_count = int_or_none(xpath_text(video, './views', 'view count')) - - token = self._download_webpage( - 'http://stat.ntv.ru/services/access/token', - video_id, 'Downloading access token') formats = [] for format_id in ['', 'hi', 'webm']: - file_ = video.find('./%sfile' % format_id) - if file_ is None: + 
file_ = xpath_text(video, './%sfile' % format_id) + if not file_: continue - size = video.find('./%ssize' % format_id) + if file_.startswith('//'): + file_ = self._proto_relative_url(file_) + elif not file_.startswith('http'): + file_ = 'http://media.ntv.ru/vod/' + file_ formats.append({ - 'url': 'http://media2.ntv.ru/vod/%s&tok=%s' % (file_.text, token), - 'filesize': int_or_none(size.text if size is not None else None), + 'url': file_, + 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)), }) self._sort_formats(formats) return { - 'id': video_id, + 'id': xpath_text(video, './id'), 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, + 'description': strip_or_none(unescapeHTML(xpath_text(player, './data/description'))), + 'thumbnail': xpath_text(video, './splash'), + 'duration': int_or_none(xpath_text(video, './totaltime')), + 'view_count': int_or_none(xpath_text(video, './views')), 'formats': formats, } From 76d9eca43dd4fd7698d138b90ab6b2dd159559e0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Nov 2019 20:16:31 +0100 Subject: [PATCH 121/154] [ivi] fallback to old extraction method for unknown error codes --- youtube_dl/extractor/ivi.py | 79 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 7f1146d95..0db023622 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -93,46 +93,57 @@ class IviIE(InfoExtractor): ] }).encode() - try: - from Crypto.Cipher import Blowfish - from Crypto.Hash import CMAC + for site in (353, 183): + content_data = data % site + if site == 353: + try: + from Cryptodome.Cipher import Blowfish + from Cryptodome.Hash import CMAC + pycryptodomex_found = True + except ImportError: + pycryptodomex_found = False + continue - timestamp = self._download_json( + timestamp = (self._download_json( + self._LIGHT_URL, 
video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode(), fatal=False) or {}).get('result') + if not timestamp: + continue + + query = { + 'ts': timestamp, + 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(), + } + else: + query = {} + + video_json = self._download_json( self._LIGHT_URL, video_id, - 'Downloading timestamp JSON', data=json.dumps({ - 'method': 'da.timestamp.get', - 'params': [] - }).encode())['result'] + 'Downloading video JSON', data=content_data, query=query) - data = data % 353 - query = { - 'ts': timestamp, - 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), - } - except ImportError: - data = data % 183 - query = {} - - video_json = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading video JSON', data=data, query=query) - - error = video_json.get('error') - if error: - origin = error.get('origin') - message = error.get('message') or error.get('user_message') - extractor_msg = 'Unable to download video %s' - if origin == 'NotAllowedForLocation': - self.raise_geo_restricted(message, self._GEO_COUNTRIES) - elif origin == 'NoRedisValidData': - extractor_msg = 'Video %s does not exist' - elif message: - if 'недоступен для просмотра на площадке s183' in message: + error = video_json.get('error') + if error: + origin = error.get('origin') + message = error.get('message') or error.get('user_message') + extractor_msg = 'Unable to download video %s' + if origin == 'NotAllowedForLocation': + self.raise_geo_restricted(message, self._GEO_COUNTRIES) + elif origin == 'NoRedisValidData': + extractor_msg = 'Video %s does not exist' + elif site == 353: + continue + elif not pycryptodomex_found: raise ExtractorError( 'pycryptodome not found. 
Please install it.', expected=True) - extractor_msg += ': ' + message - raise ExtractorError(extractor_msg % video_id, expected=True) + elif message: + extractor_msg += ': ' + message + raise ExtractorError(extractor_msg % video_id, expected=True) + else: + break result = video_json['result'] title = result['title'] From f0f6a7e73f55b6227c40af17c6fcab44b5a2df79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:21:03 +0700 Subject: [PATCH 122/154] [chaturbate] Fix extraction (closes #23010, closes #23012) --- youtube_dl/extractor/chaturbate.py | 42 +++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 656e715ae..a459dcb8d 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + lowercase_escape, + url_or_none, +) class ChaturbateIE(InfoExtractor): @@ -38,12 +42,31 @@ class ChaturbateIE(InfoExtractor): 'https://chaturbate.com/%s/' % video_id, video_id, headers=self.geo_verification_headers()) - m3u8_urls = [] + found_m3u8_urls = [] - for m in re.finditer( - r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): - m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group( - 'url').replace('_fast', '') + data = self._parse_json( + self._search_regex( + r'initialRoomDossier\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data', default='{}', group='value'), + video_id, transform_source=lowercase_escape, fatal=False) + if data: + m3u8_url = url_or_none(data.get('hls_source')) + if m3u8_url: + found_m3u8_urls.append(m3u8_url) + + if not found_m3u8_urls: + for m in re.finditer( + r'(\\u002[27])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(lowercase_escape(m.group('url'))) + + if not found_m3u8_urls: + for m in 
re.finditer( + r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(m.group('url')) + + m3u8_urls = [] + for found_m3u8_url in found_m3u8_urls: + m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '') for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): if m3u8_url not in m3u8_urls: m3u8_urls.append(m3u8_url) @@ -63,7 +86,12 @@ class ChaturbateIE(InfoExtractor): formats = [] for m3u8_url in m3u8_urls: - m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow' + for known_id in ('fast', 'slow'): + if '_%s' % known_id in m3u8_url: + m3u8_id = known_id + break + else: + m3u8_id = None formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', # ffmpeg skips segments for fast m3u8 From 25d3f770e6ef518a4230ad41bd4ea69dd2e851af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:22:59 +0700 Subject: [PATCH 123/154] [ivi] Ask for pycryptodomex instead of pycryptodome See discussion at https://github.com/ytdl-org/youtube-dl/commit/1bba88efc7e1f82095f7ae38348e56026db4bf3c#r35982110 --- youtube_dl/extractor/ivi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 0db023622..52b53bfeb 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -137,7 +137,7 @@ class IviIE(InfoExtractor): continue elif not pycryptodomex_found: raise ExtractorError( - 'pycryptodome not found. Please install it.', + 'pycryptodomex not found. 
Please install it.', expected=True) elif message: extractor_msg += ': ' + message From f8015c15746e83394ecc395c6a13823d20971772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:38:39 +0700 Subject: [PATCH 124/154] [ivi] Fix python 3.4 support --- youtube_dl/extractor/ivi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 52b53bfeb..315ea03fa 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -91,10 +91,10 @@ class IviIE(InfoExtractor): 'contentid': video_id } ] - }).encode() + }) for site in (353, 183): - content_data = data % site + content_data = (data % site).encode() if site == 353: try: from Cryptodome.Cipher import Blowfish From 80a51fc2ef3ebb7d3e3d5fd0b6e9942bb4be6f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:10:24 +0700 Subject: [PATCH 125/154] [ivi] Skip s353 for bundled exe See https://github.com/Legrandin/pycryptodome/issues/228 --- youtube_dl/extractor/ivi.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 315ea03fa..a502e8806 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json +import re +import sys from .common import InfoExtractor from ..utils import ( @@ -93,9 +94,13 @@ class IviIE(InfoExtractor): ] }) + bundled = hasattr(sys, 'frozen') + for site in (353, 183): content_data = (data % site).encode() if site == 353: + if bundled: + continue try: from Cryptodome.Cipher import Blowfish from Cryptodome.Hash import CMAC @@ -135,6 +140,10 @@ class IviIE(InfoExtractor): extractor_msg = 'Video %s does not exist' elif site == 353: continue + elif bundled: + raise ExtractorError( + 'This feature does not work from bundled exe. 
Run youtube-dl from sources.', + expected=True) elif not pycryptodomex_found: raise ExtractorError( 'pycryptodomex not found. Please install it.', From fb8dfc5a2772ca35dd65bad7b7565ad6ec1ad4dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:21:00 +0700 Subject: [PATCH 126/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ChangeLog b/ChangeLog index d46d20082..acee2a75a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,49 @@ +version + +Core ++ [extractor/common] Clean jwplayer description HTML tags ++ [extractor/common] Add data, headers and query to all major extract formats + methods + +Extractors +* [chaturbate] Fix extraction (#23010, #23012) ++ [ntvru] Add support for non relative file URLs (#23140) +* [vk] Fix wall audio thumbnails extraction (#23135) +* [ivi] Fix format extraction (#21991) +- [comcarcoff] Remove extractor ++ [drtv] Add support for new URL schema (#23059) ++ [nexx] Add support for Multi Player JS Setup (#23052) ++ [teamcoco] Add support for new videos (#23054) +* [soundcloud] Check if the soundtrack has downloads left (#23045) +* [facebook] Fix posts video data extraction (#22473) +- [addanime] Remove extractor +- [minhateca] Remove extractor +- [daisuki] Remove extractor +* [seeker] Fix extraction +- [revision3] Remove extractors +* [twitch] Fix video comments URL (#18593, #15828) +* [twitter] Improve extraction + + Add support for generic embeds (#22168) + * Always extract http formats for native videos (#14934) + + Add support for Twitter Broadcasts (#21369) + + Extract more metadata + * Improve VMap format extraction + * Unify extraction code for both twitter statuses and cards ++ [twitch] Add support for Clip embed URLs +* [lnkgo] Fix extraction (#16834) +* [mixcloud] Improve extraction + * Improve metadata extraction (#11721) + * Fix playlist extraction (#22378) + * Fix user mixes extraction 
(#15197, #17865) ++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) +* [onionstudios] Fix extraction ++ [hotstar] Pass Referer header to format requests (#22836) +* [dplay] Minimize response size ++ [patreon] Extract uploader_id and filesize +* [patreon] Minimize response size +* [roosterteeth] Fix login request (#16094, #22689) + + version 2019.11.05 Extractors From 0de9fd24dc8723c78a90cb546e4a05818304521e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:24:27 +0700 Subject: [PATCH 127/154] release 2019.11.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 10 ++-------- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 22 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 12de9add2..d3e11cdcf 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + [debug] 
youtube-dl version 2019.11.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8a6202cf6..51bf4db3b 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 83f91d5fe..19025ff25 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index be8e70f1e..a381b6979 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive and 
playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + [debug] youtube-dl version 2019.11.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 7544d171c..9c945d5ec 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index acee2a75a..daaff3eef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.11.22 Core + [extractor/common] Clean jwplayer description HTML tags diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 536b87479..3dcb026c5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -26,7 +26,6 @@ - **AcademicEarth:Course** - **acast** - **acast:channel** - - **AddAnime** - **ADN**: Anime Digital Network - **AdobeConnect** - **AdobeTV** @@ -175,7 +174,6 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** - **ComedyCentralShortname** @@ -203,8 +201,6 @@ - **dailymotion** - 
**dailymotion:playlist** - **dailymotion:user** - - **DaisukiMotto** - - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -404,6 +400,7 @@ - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** - **kontrtube**: KontrTube.ru - Труба зовёт @@ -485,14 +482,12 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** - - **Minhateca** - **MinistryGrid** - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** - **mixcloud:playlist** - - **mixcloud:stream** - **mixcloud:user** - **Mixer:live** - **Mixer:vod** @@ -723,8 +718,6 @@ - **Restudy** - **Reuters** - **ReverbNation** - - **revision** - - **revision3:embed** - **RICE** - **RMCDecouverte** - **RockstarGames** @@ -958,6 +951,7 @@ - **twitch:vod** - **twitter** - **twitter:amplify** + - **twitter:broadcast** - **twitter:card** - **udemy** - **udemy:course** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8012a66db..361809681 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.11.05' +__version__ = '2019.11.22' From cf3c9eafad5e6b83788e15a605aa6804b1ab307c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 00:03:51 +0700 Subject: [PATCH 128/154] [soundcloud] Update client id (closes #23214) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e8ffb2cbe..988dec4fa 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -245,7 +245,7 @@ class SoundcloudIE(InfoExtractor): _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' - _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' + _CLIENT_ID = 'UW9ajvMgVdMMW3cdeBi8lPfN6dvOVGji' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' 
_ARTWORK_MAP = { From 9d30c2132acf2d12bfa8e559987c341c76d9cd24 Mon Sep 17 00:00:00 2001 From: InfernalUnderling <42065091+InfernalUnderling@users.noreply.github.com> Date: Tue, 26 Nov 2019 17:08:37 +0000 Subject: [PATCH 129/154] [utils] Handle rd-suffixed day parts in unified_strdate (#23199) --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 3920542bb..0db37d9d8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -340,6 +340,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') + self.assertEqual(unified_strdate('November 3rd, 2019'), '20191103') + self.assertEqual(unified_strdate('October 23rd, 2005'), '20051023') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index aed988b88..0d30075aa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1718,13 +1718,16 @@ DATE_FORMATS = ( '%B %d %Y', '%B %dst %Y', '%B %dnd %Y', + '%B %drd %Y', '%B %dth %Y', '%b %d %Y', '%b %dst %Y', '%b %dnd %Y', + '%b %drd %Y', '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', + '%b %drd %Y %I:%M', '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', From 6ddd4bf6ac04ae0b8ba39fb4124e844afc49b5a9 Mon Sep 17 00:00:00 2001 From: InfernalUnderling <42065091+InfernalUnderling@users.noreply.github.com> Date: Tue, 26 Nov 2019 17:20:39 +0000 Subject: [PATCH 130/154] [bitchute] Extract upload date (closes #22990) (#23193) --- youtube_dl/extractor/bitchute.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 430663fbf..0c773e66e 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -7,6 
+7,7 @@ import re from .common import InfoExtractor from ..utils import ( orderedSet, + unified_strdate, urlencode_postdata, ) @@ -23,6 +24,7 @@ class BitChuteIE(InfoExtractor): 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Victoria X Rave', + 'upload_date': '20170813', }, }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', @@ -74,12 +76,17 @@ class BitChuteIE(InfoExtractor): r'(?s)]+\bclass=["\']video-author[^>]+>(.+?)

'), webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', + webpage, 'upload date', fatal=False)) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'upload_date': upload_date, 'formats': formats, } From 1ced222120c00854865c5b16e89838235ed549ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 02:26:42 +0700 Subject: [PATCH 131/154] [utils] Add generic caesar cipher and rot47 --- test/test_utils.py | 16 ++++++++++++++++ youtube_dl/utils.py | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 0db37d9d8..e83c8ea11 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,6 +19,7 @@ from youtube_dl.utils import ( age_restricted, args_to_str, encode_base_n, + caesar, clean_html, date_from_str, DateRange, @@ -69,6 +70,7 @@ from youtube_dl.utils import ( remove_start, remove_end, remove_quotes, + rot47, shell_quote, smuggle_url, str_to_int, @@ -1369,6 +1371,20 @@ Line 1 self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_caesar(self): + self.assertEqual(caesar('ace', 'abcdef', 2), 'cea') + self.assertEqual(caesar('cea', 'abcdef', -2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', -2), 'eac') + self.assertEqual(caesar('eac', 'abcdef', 2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', 0), 'ace') + self.assertEqual(caesar('xyz', 'abcdef', 2), 'xyz') + self.assertEqual(caesar('abc', 'acegik', 2), 'ebg') + self.assertEqual(caesar('ebg', 'acegik', -2), 'abc') + + def test_rot47(self): + self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=') + self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{') + def test_urshift(self): self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) diff --git 
a/youtube_dl/utils.py b/youtube_dl/utils.py index 0d30075aa..b14603d8a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -5383,6 +5383,19 @@ def decode_packed_codes(code): obfucasted_code) +def caesar(s, alphabet, shift): + if shift == 0: + return s + l = len(alphabet) + return ''.join( + alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c + for c in s) + + +def rot47(s): + return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) + + def parse_m3u8_attributes(attrib): info = {} for (key, val) in re.findall(r'(?P[A-Z0-9-]+)=(?P"[^"]+"|[^",]+)(?:,|$)', attrib): From edc2a1f68b267abc6b4c94991da4ad83fd8374bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 02:28:06 +0700 Subject: [PATCH 132/154] [vivo] Fix extraction (closes #22328, closes #22279) --- youtube_dl/extractor/shared.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index ff575f592..02295d1a4 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,13 +1,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_b64decode +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote_plus, +) from ..utils import ( determine_ext, ExtractorError, int_or_none, + js_to_json, KNOWN_EXTENSIONS, parse_filesize, + rot47, url_or_none, urlencode_postdata, ) @@ -112,16 +117,22 @@ class VivoIE(SharedBaseIE): webpage, 'filesize', fatal=False)) def _extract_video_url(self, webpage, video_id, url): - def decode_url(encoded_url): + def decode_url_old(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') - stream_url = url_or_none(decode_url(self._search_regex( + stream_url = self._search_regex( r'data-stream\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'stream url', default=None, 
group='url'))) + 'stream url', default=None, group='url') + if stream_url: + stream_url = url_or_none(decode_url_old(stream_url)) if stream_url: return stream_url - return self._parse_json( + + def decode_url(encoded_url): + return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + + return decode_url(self._parse_json( self._search_regex( - r'InitializeStream\s*\(\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'stream', group='url'), - video_id, transform_source=decode_url)[0] + r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, + 'stream'), + video_id, transform_source=js_to_json)['source']) From df65a4a1ed3096b8210c097c77d00f0391f78503 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 21:53:51 +0100 Subject: [PATCH 133/154] [corus] improve extraction - add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com and disneylachaine.ca(closes #20861) - add support for self hosted videos(closes #22075) - detect DRM protection(closes #14910)(closes #9164) --- youtube_dl/extractor/corus.py | 169 ++++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py index a1b251804..e11aadf14 100644 --- a/youtube_dl/extractor/corus.py +++ b/youtube_dl/extractor/corus.py @@ -4,7 +4,12 @@ from __future__ import unicode_literals import re from .theplatform import ThePlatformFeedIE -from ..utils import int_or_none +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, +) class CorusIE(ThePlatformFeedIE): @@ -12,24 +17,49 @@ class CorusIE(ThePlatformFeedIE): https?:// (?:www\.)? 
(?P - (?:globaltv|etcanada)\.com| - (?:hgtv|foodnetwork|slice|history|showcase|bigbrothercanada)\.ca + (?: + globaltv| + etcanada| + seriesplus| + wnetwork| + ytv + )\.com| + (?: + hgtv| + foodnetwork| + slice| + history| + showcase| + bigbrothercanada| + abcspark| + disney(?:channel|lachaine) + )\.ca + ) + /(?:[^/]+/)* + (?: + video\.html\?.*?\bv=| + videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)? + ) + (?P + [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}| + (?:[A-Z]{4})?\d{12,20} ) - /(?:video/(?:[^/]+/)?|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=)) - (?P\d+) ''' _TESTS = [{ 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', - 'md5': '05dcbca777bf1e58c2acbb57168ad3a6', 'info_dict': { 'id': '870923331648', 'ext': 'mp4', 'title': 'Movie Night Popcorn with Bryan', 'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.', - 'uploader': 'SHWM-NEW', 'upload_date': '20170206', 'timestamp': 1486392197, }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], }, { 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', 'only_matching': True, @@ -48,58 +78,83 @@ class CorusIE(ThePlatformFeedIE): }, { 'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/', 'only_matching': True + }, { + 'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/', + 'only_matching': True + }, { + 'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/', + 'only_matching': True }] - - _TP_FEEDS = { - 'globaltv': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, - 'etcanada': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, - 'hgtv': { - 'feed_id': 'L0BMHXi2no43', - 'account_id': 2414428465, - 
}, - 'foodnetwork': { - 'feed_id': 'ukK8o58zbRmJ', - 'account_id': 2414429569, - }, - 'slice': { - 'feed_id': '5tUJLgV2YNJ5', - 'account_id': 2414427935, - }, - 'history': { - 'feed_id': 'tQFx_TyyEq4J', - 'account_id': 2369613659, - }, - 'showcase': { - 'feed_id': '9H6qyshBZU3E', - 'account_id': 2414426607, - }, - 'bigbrothercanada': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, + _GEO_BYPASS = False + _SITE_MAP = { + 'globaltv': 'series', + 'etcanada': 'series', + 'foodnetwork': 'food', + 'bigbrothercanada': 'series', + 'disneychannel': 'disneyen', + 'disneylachaine': 'disneyfr', } def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() - feed_info = self._TP_FEEDS[domain.split('.')[0]] - return self._extract_feed_info('dtjsEC', feed_info['feed_id'], 'byId=' + video_id, video_id, lambda e: { - 'episode_number': int_or_none(e.get('pl1$episode')), - 'season_number': int_or_none(e.get('pl1$season')), - 'series': e.get('pl1$show'), - }, { - 'HLS': { - 'manifest': 'm3u', - }, - 'DesktopHLS Default': { - 'manifest': 'm3u', - }, - 'MP4 MBR': { - 'manifest': 'm3u', - }, - }, feed_info['account_id']) + site = domain.split('.')[0] + path = self._SITE_MAP.get(site, site) + if path != 'series': + path = 'migration/' + path + video = self._download_json( + 'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path, + video_id, query={'byId': video_id}, + headers={'Accept': 'application/json'})[0] + title = video['title'] + + formats = [] + for source in video.get('sources', []): + smil_url = source.get('file') + if not smil_url: + continue + source_type = source.get('type') + note = 'Downloading%s smil file' % (' ' + source_type if source_type else '') + resp = self._download_webpage( + smil_url, video_id, note, fatal=False, + headers=self.geo_verification_headers()) + if not resp: + continue + error = self._parse_json(resp, video_id, fatal=False) + if error: + if error.get('exception') == 'GeoLocationBlocked': 
+ self.raise_geo_restricted(countries=['CA']) + raise ExtractorError(error['description']) + smil = self._parse_xml(resp, video_id, fatal=False) + if smil is None: + continue + namespace = self._parse_smil_namespace(smil) + formats.extend(self._parse_smil_formats( + smil, smil_url, video_id, namespace)) + if not formats and video.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) + + subtitles = {} + for track in video.get('tracks', []): + track_url = track.get('file') + if not track_url: + continue + lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en' + subtitles.setdefault(lang, []).append({'url': track_url}) + + metadata = video.get('metadata') or {} + get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')), + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('availableDate'), 1000), + 'subtitles': subtitles, + 'duration': float_or_none(metadata.get('duration')), + 'series': dict_get(video, ('show', 'pl1$show')), + 'season_number': get_number('season'), + 'episode_number': get_number('episode'), + } From 5ef62fc4ce1f255343d67b70f3cee2f2240cdfba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 22:01:34 +0100 Subject: [PATCH 134/154] [dailymotion] improve extraction - extract http formats included in m3u8 manifest - fix user extraction(closes #3553)(closes #21415) - add suport for User Authentication(closes #11491) - fix password protected videos extraction(closes #23176) - respect age limit option and family filter cookie value(closes #18437) - handle video url playlist query param - report alowed countries for geo-restricted videos --- youtube_dl/extractor/common.py | 13 + youtube_dl/extractor/dailymotion.py | 559 +++++++++++----------------- 
youtube_dl/extractor/vk.py | 3 +- 3 files changed, 234 insertions(+), 341 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 04d676378..eaae5e484 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1766,6 +1766,19 @@ class InfoExtractor(object): # the same GROUP-ID f['acodec'] = 'none' formats.append(f) + + # for DailyMotion + progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') + if progressive_uri: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': progressive_uri, + }) + formats.append(http_f) + last_stream_inf = {} return formats diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 745971900..327fdb04a 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,50 +1,93 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import functools -import hashlib -import itertools import json -import random import re -import string from .common import InfoExtractor -from ..compat import compat_struct_pack +from ..compat import compat_HTTPError from ..utils import ( - determine_ext, - error_to_compat_str, + age_restricted, + clean_html, ExtractorError, int_or_none, - mimetype2ext, OnDemandPagedList, - parse_iso8601, - sanitized_Request, - str_to_int, try_get, unescapeHTML, - update_url_query, - url_or_none, urlencode_postdata, ) class DailymotionBaseInfoExtractor(InfoExtractor): + _FAMILY_FILTER = None + _HEADERS = { + 'Content-Type': 'application/json', + 'Origin': 'https://www.dailymotion.com', + } + _NETRC_MACHINE = 'dailymotion' + + def _get_dailymotion_cookies(self): + return self._get_cookies('https://www.dailymotion.com/') + @staticmethod - def _build_request(url): - """Build a request with the family filter disabled""" - request = sanitized_Request(url) - request.add_header('Cookie', 
'family_filter=off; ff=off') - return request + def _get_cookie_value(cookies, name): + cookie = cookies.get(name) + if cookie: + return cookie.value - def _download_webpage_handle_no_ff(self, url, *args, **kwargs): - request = self._build_request(url) - return self._download_webpage_handle(request, *args, **kwargs) + def _set_dailymotion_cookie(self, name, value): + self._set_cookie('www.dailymotion.com', name, value) - def _download_webpage_no_ff(self, url, *args, **kwargs): - request = self._build_request(url) - return self._download_webpage(request, *args, **kwargs) + def _real_initialize(self): + cookies = self._get_dailymotion_cookies() + ff = self._get_cookie_value(cookies, 'ff') + self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit')) + self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') + + def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): + if not self._HEADERS.get('Authorization'): + cookies = self._get_dailymotion_cookies() + token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') + if not token: + data = { + 'client_id': 'f1a362d288c1b98099c7', + 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', + } + username, password = self._get_login_info() + if username: + data.update({ + 'grant_type': 'password', + 'password': password, + 'username': username, + }) + else: + data['grant_type'] = 'client_credentials' + try: + token = self._download_json( + 'https://graphql.api.dailymotion.com/oauth/token', + None, 'Downloading Access Token', + data=urlencode_postdata(data))['access_token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), xid)['error_description'], expected=True) + raise + self._set_dailymotion_cookie('access_token' if username else 'client_token', token) + 
self._HEADERS['Authorization'] = 'Bearer ' + token + + resp = self._download_json( + 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ + 'query': '''{ + %s(xid: "%s"%s) { + %s + } +}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), + }).encode(), headers=self._HEADERS) + obj = resp['data'][object_type] + if not obj: + raise ExtractorError(resp['errors'][0]['message'], expected=True) + return obj class DailymotionIE(DailymotionBaseInfoExtractor): @@ -54,18 +97,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P[^/?_]+) + /(?P[^/?_]+)(?:.+?\bplaylist=(?Px[0-9a-z]+))? ''' IE_NAME = 'dailymotion' - - _FORMATS = [ - ('stream_h264_ld_url', 'ld'), - ('stream_h264_url', 'standard'), - ('stream_h264_hq_url', 'hq'), - ('stream_h264_hd_url', 'hd'), - ('stream_h264_hd1080_url', 'hd180'), - ] - _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -74,7 +108,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'ext': 'mp4', 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. 
Miller', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', 'duration': 187, 'timestamp': 1493651285, 'upload_date': '20170501', @@ -146,7 +179,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2', 'only_matching': True, + }, { + 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', + 'only_matching': True, }] + _GEO_BYPASS = False + _COMMON_MEDIA_FIELDS = '''description + geoblockedCountries { + allowed + } + xid''' @staticmethod def _extract_urls(webpage): @@ -162,264 +204,140 @@ class DailymotionIE(DailymotionBaseInfoExtractor): return urls def _real_extract(self, url): - video_id = self._match_id(url) + video_id, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage_no_ff( - 'https://www.dailymotion.com/video/%s' % video_id, video_id) + if playlist_id: + if not self._downloader.params.get('noplaylist'): + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + return self.url_result( + 'http://www.dailymotion.com/playlist/' + playlist_id, + 'DailymotionPlaylist', playlist_id) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - age_limit = self._rta_search(webpage) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') - - view_count_str = self._search_regex( - (r']+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', - r'video_views_count[^>]+>\s+([\s\d\,.]+)'), - webpage, 'view count', default=None) - if view_count_str: - view_count_str = re.sub(r'\s', '', view_count_str) - view_count = str_to_int(view_count_str) - comment_count = int_or_none(self._search_regex( - r']+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', - webpage, 'comment count', default=None)) - - player_v5 = self._search_regex( - [r'buildPlayer\(({.+?})\);\n', # See 
https://github.com/ytdl-org/youtube-dl/issues/7826 - r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', - r'buildPlayer\(({.+?})\);', - r'var\s+config\s*=\s*({.+?});', - # New layout regex (see https://github.com/ytdl-org/youtube-dl/issues/13580) - r'__PLAYER_CONFIG__\s*=\s*({.+?});'], - webpage, 'player v5', default=None) - if player_v5: - player = self._parse_json(player_v5, video_id, fatal=False) or {} - metadata = try_get(player, lambda x: x['metadata'], dict) - if not metadata: - metadata_url = url_or_none(try_get( - player, lambda x: x['context']['metadata_template_url1'])) - if metadata_url: - metadata_url = metadata_url.replace(':videoId', video_id) - else: - metadata_url = update_url_query( - 'https://www.dailymotion.com/player/metadata/video/%s' - % video_id, { - 'embedder': url, - 'integration': 'inline', - 'GK_PV5_NEON': '1', - }) - metadata = self._download_json( - metadata_url, video_id, 'Downloading metadata JSON') - - if try_get(metadata, lambda x: x['error']['type']) == 'password_protected': - password = self._downloader.params.get('videopassword') - if password: - r = int(metadata['id'][1:], 36) - us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') - t = ''.join(random.choice(string.ascii_letters) for i in range(10)) - n = us64e(compat_struct_pack('I', r)) - i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) - metadata = self._download_json( - 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) - - self._check_error(metadata) - - formats = [] - for quality, media_list in metadata['qualities'].items(): - for media in media_list: - media_url = media.get('url') - if not media_url: - continue - type_ = media.get('type') - if type_ == 'application/vnd.lumberjack.manifest': - continue - ext = mimetype2ext(type_) or determine_ext(media_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False) - for f in 
m3u8_formats: - f['url'] = f['url'].split('#')[0] - formats.append(f) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) - else: - f = { - 'url': media_url, - 'format_id': 'http-%s' % quality, - 'ext': ext, - } - m = re.search(r'H264-(?P\d+)x(?P\d+)', media_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(f) - self._sort_formats(formats) - - title = metadata['title'] - duration = int_or_none(metadata.get('duration')) - timestamp = int_or_none(metadata.get('created_time')) - thumbnail = metadata.get('poster_url') - uploader = metadata.get('owner', {}).get('screenname') - uploader_id = metadata.get('owner', {}).get('id') - - subtitles = {} - subtitles_data = metadata.get('subtitles', {}).get('data', {}) - if subtitles_data and isinstance(subtitles_data, dict): - for subtitle_lang, subtitle in subtitles_data.items(): - subtitles[subtitle_lang] = [{ - 'ext': determine_ext(subtitle_url), - 'url': subtitle_url, - } for subtitle_url in subtitle.get('urls', [])] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - 'view_count': view_count, - 'comment_count': comment_count, - 'formats': formats, - 'subtitles': subtitles, - } - - # vevo embed - vevo_id = self._search_regex( - r'[\w]*)', - webpage, 'vevo embed', default=None) - if vevo_id: - return self.url_result('vevo:%s' % vevo_id, 'Vevo') - - # fallback old player - embed_page = self._download_webpage_no_ff( - 'https://www.dailymotion.com/embed/video/%s' % video_id, - video_id, 'Downloading embed page') - - timestamp = parse_iso8601(self._html_search_meta( - 'video:release_date', webpage, 'upload date')) - - info = self._parse_json( - self._search_regex( - r'var info = ({.*?}),$', embed_page, - 
'video info', flags=re.MULTILINE), - video_id) - - self._check_error(info) - - formats = [] - for (key, format_id) in self._FORMATS: - video_url = info.get(key) - if video_url is not None: - m_size = re.search(r'H264-(\d+)x(\d+)', video_url) - if m_size is not None: - width, height = map(int_or_none, (m_size.group(1), m_size.group(2))) - else: - width, height = None, None - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'width': width, - 'height': height, - }) - self._sort_formats(formats) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, webpage) - - title = self._og_search_title(webpage, default=None) - if title is None: - title = self._html_search_regex( - r'(?s)]*>(.*?)', webpage, - 'title') - - return { - 'id': video_id, - 'formats': formats, - 'uploader': info['owner.screenname'], - 'timestamp': timestamp, - 'title': title, - 'description': description, - 'subtitles': video_subtitles, - 'thumbnail': info['thumbnail_url'], - 'age_limit': age_limit, - 'view_count': view_count, - 'duration': info['duration'] + password = self._downloader.params.get('videopassword') + media = self._call_api( + 'media', video_id, '''... on Video { + %s + stats { + likes { + total } + views { + total + } + } + } + ... 
on Live { + %s + audienceCount + isOnAir + }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', + 'password: "%s"' % self._downloader.params.get('videopassword') if password else None) + xid = media['xid'] - def _check_error(self, info): - error = info.get('error') + metadata = self._download_json( + 'https://www.dailymotion.com/player/metadata/video/' + xid, + xid, 'Downloading metadata JSON', + query={'app': 'com.dailymotion.neon'}) + + error = metadata.get('error') if error: - title = error.get('title') or error['message'] + title = error.get('title') or error['raw_message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': - self.raise_geo_restricted(msg=title) + allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list) + self.raise_geo_restricted(msg=title, countries=allowed_countries) raise ExtractorError( '%s said: %s' % (self.IE_NAME, title), expected=True) - def _get_subtitles(self, video_id, webpage): - try: - sub_list = self._download_webpage( - 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - info = json.loads(sub_list) - if (info['total'] > 0): - sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) - return sub_lang_list - self._downloader.report_warning('video doesn\'t have subtitles') - return {} + title = metadata['title'] + is_live = media.get('isOnAir') + formats = [] + for quality, media_list in metadata['qualities'].items(): + for m in media_list: + media_url = m.get('url') + media_type = m.get('type') + if not media_url or media_type == 'application/vnd.lumberjack.manifest': + continue + if media_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + media_url, 
video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + f = { + 'url': media_url, + 'format_id': 'http-' + quality, + } + m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url) + if m: + width, height, fps = map(int_or_none, m.groups()) + f.update({ + 'fps': fps, + 'height': height, + 'width': width, + }) + formats.append(f) + for f in formats: + f['url'] = f['url'].split('#')[0] + if not f.get('fps') and f['format_id'].endswith('@60'): + f['fps'] = 60 + self._sort_formats(formats) + + subtitles = {} + subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} + for subtitle_lang, subtitle in subtitles_data.items(): + subtitles[subtitle_lang] = [{ + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] + + thumbnails = [] + for height, poster_url in metadata.get('posters', {}).items(): + thumbnails.append({ + 'height': int_or_none(height), + 'id': height, + 'url': poster_url, + }) + + owner = metadata.get('owner') or {} + stats = media.get('stats') or {} + get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total'])) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(media.get('description')), + 'thumbnails': thumbnails, + 'duration': int_or_none(metadata.get('duration')) or None, + 'timestamp': int_or_none(metadata.get('created_time')), + 'uploader': owner.get('screenname'), + 'uploader_id': owner.get('id') or metadata.get('screenname'), + 'age_limit': 18 if metadata.get('explicit') else 0, + 'tags': metadata.get('tags'), + 'view_count': get_count('view') or int_or_none(media.get('audienceCount')), + 'like_count': get_count('like'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } -class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): - IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?Px[0-9a-z]+)' - _TESTS = 
[{ - 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', - 'info_dict': { - 'title': 'SPORT', - 'id': 'xv4bw', - }, - 'playlist_mincount': 20, - }] +class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor): _PAGE_SIZE = 100 - def _fetch_page(self, playlist_id, authorizaion, page): + def _fetch_page(self, playlist_id, page): page += 1 - videos = self._download_json( - 'https://graphql.api.dailymotion.com', - playlist_id, 'Downloading page %d' % page, - data=json.dumps({ - 'query': '''{ - collection(xid: "%s") { - videos(first: %d, page: %d) { - pageInfo { - hasNextPage - nextPage - } + videos = self._call_api( + self._OBJECT_TYPE, playlist_id, + '''videos(allowExplicit: %s, first: %d, page: %d) { edges { node { xid url } } - } - } -}''' % (playlist_id, self._PAGE_SIZE, page) - }).encode(), headers={ - 'Authorization': authorizaion, - 'Origin': 'https://www.dailymotion.com', - })['data']['collection']['videos'] + }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page), + 'Downloading page %d' % page)['videos'] for edge in videos['edges']: node = edge['node'] yield self.url_result( @@ -427,86 +345,49 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - api = self._parse_json(self._search_regex( - r'__PLAYER_CONFIG__\s*=\s*({.+?});', - webpage, 'player config'), playlist_id)['context']['api'] - auth = self._download_json( - api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), - playlist_id, data=urlencode_postdata({ - 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), - 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), - 'grant_type': 'client_credentials', - })) - authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) entries = OnDemandPagedList(functools.partial( - self._fetch_page, playlist_id, 
authorizaion), self._PAGE_SIZE) + self._fetch_page, playlist_id), self._PAGE_SIZE) return self.playlist_result( - entries, playlist_id, - self._og_search_title(webpage)) + entries, playlist_id) -class DailymotionUserIE(DailymotionBaseInfoExtractor): +class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:playlist' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?Px[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', + 'info_dict': { + 'id': 'xv4bw', + }, + 'playlist_mincount': 20, + }] + _OBJECT_TYPE = 'collection' + + +class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P[^/]+)' - _MORE_PAGES_INDICATOR = r'(?s)
.*?[^/]+)' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { 'id': 'nqtv', - 'title': 'Rémi Gaillard', }, - 'playlist_mincount': 100, + 'playlist_mincount': 152, }, { 'url': 'http://www.dailymotion.com/user/UnderProject', 'info_dict': { 'id': 'UnderProject', - 'title': 'UnderProject', }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], + 'playlist_mincount': 1000, 'skip': 'Takes too long time', + }, { + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + }, + 'playlist_mincount': 148, + 'params': { + 'age_limit': 0, + }, }] - - def _extract_entries(self, id): - video_ids = set() - processed_urls = set() - for pagenum in itertools.count(1): - page_url = self._PAGE_TEMPLATE % (id, pagenum) - webpage, urlh = self._download_webpage_handle_no_ff( - page_url, id, 'Downloading page %s' % pagenum) - if urlh.geturl() in processed_urls: - self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( - page_url, urlh.geturl()), id) - break - - processed_urls.add(urlh.geturl()) - - for video_id in re.findall(r'data-xid="(.+?)"', webpage): - if video_id not in video_ids: - yield self.url_result( - 'http://www.dailymotion.com/video/%s' % video_id, - DailymotionIE.ie_key(), video_id) - video_ids.add(video_id) - - if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: - break - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - webpage = self._download_webpage( - 'https://www.dailymotion.com/user/%s' % user, user) - full_user = unescapeHTML(self._html_search_regex( - r'' % re.escape(user), - webpage, 'user')) - - return { - '_type': 'playlist', - 'id': user, - 'title': full_user, - 'entries': self._extract_entries(user), - } + _OBJECT_TYPE = 'channel' diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 195875938..a5e4a3e67 100644 --- a/youtube_dl/extractor/vk.py +++ 
b/youtube_dl/extractor/vk.py @@ -216,8 +216,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', - # TODO: fix test by fixing dailymotion description extraction - 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', + 'description': 'md5:424b8e88cc873217f520e582ba28bb36', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', 'uploader_id': 'x1p5vl5', From 6471d0d3b8086b282622c84a9eea968d4edfcf9b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 23:57:37 +0100 Subject: [PATCH 135/154] [openload] remove OpenLoad related extractors(closes #11999)(closes #15406) --- youtube_dl/extractor/extractors.py | 5 - youtube_dl/extractor/generic.py | 16 -- youtube_dl/extractor/openload.py | 263 ----------------------------- youtube_dl/extractor/streamango.py | 128 -------------- 4 files changed, 412 deletions(-) delete mode 100644 youtube_dl/extractor/streamango.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cf4bb8f20..0e349b778 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -796,10 +796,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openload import ( - OpenloadIE, - VerystreamIE, -) from .ora import OraTVIE from .orf import ( ORFTVthekIE, @@ -1060,7 +1056,6 @@ from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE -from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3d919f656..743ef47db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -88,10 +88,6 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE -from 
.openload import ( - OpenloadIE, - VerystreamIE, -) from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -3048,18 +3044,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - # Look for Openload embeds - openload_urls = OpenloadIE._extract_urls(webpage) - if openload_urls: - return self.playlist_from_matches( - openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) - - # Look for Verystream embeds - verystream_urls = VerystreamIE._extract_urls(webpage) - if verystream_urls: - return self.playlist_from_matches( - verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key()) - # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 66e38cdb4..0c20d0177 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -3,21 +3,17 @@ from __future__ import unicode_literals import json import os -import re import subprocess import tempfile -from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_kwargs, ) from ..utils import ( check_executable, - determine_ext, encodeArgument, ExtractorError, - get_element_by_id, get_exe_version, is_outdated_version, std_headers, @@ -240,262 +236,3 @@ class PhantomJSwrapper(object): self._load_cookies() return (html, encodeArgument(out)) - - -class OpenloadIE(InfoExtractor): - _DOMAINS = r''' - (?: - openload\.(?:co|io|link|pw)| - oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|monster|press|pw|life|live|space|services|website|vip)| - oladblock\.(?:services|xyz|me)|openloed\.co - ) - ''' - _VALID_URL = r'''(?x) - https?:// - (?P - (?:www\.)? 
- %s - )/ - (?:f|embed)/ - (?P[a-zA-Z0-9-_]+) - ''' % _DOMAINS - _EMBED_WORD = 'embed' - _STREAM_WORD = 'f' - _REDIR_WORD = 'stream' - _URL_IDS = ('streamurl', 'streamuri', 'streamurj') - _TESTS = [{ - 'url': 'https://openload.co/f/kUEfGclsU9o', - 'md5': 'bf1c059b004ebc7a256f89408e65c36e', - 'info_dict': { - 'id': 'kUEfGclsU9o', - 'ext': 'mp4', - 'title': 'skyrim_no-audio_1080.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://openload.co/embed/rjC09fkPLYs', - 'info_dict': { - 'id': 'rjC09fkPLYs', - 'ext': 'mp4', - 'title': 'movie.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': { - 'en': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, # test subtitles only - }, - }, { - 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', - 'only_matching': True, - }, { - 'url': 'https://openload.io/f/ZAn6oz-VZGE/', - 'only_matching': True, - }, { - 'url': 'https://openload.co/f/_-ztPaZtMhM/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout - # for title and ext - 'url': 'https://openload.co/embed/Sxz5sADo82g/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available - # via https://openload.co/f/e-Ixz9ZR5L0/ - 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', - 'only_matching': True, - }, { - 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', - 'only_matching': True, - }, { - 'url': 'http://www.openload.link/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.stream/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', - 'only_matching': True, - }, { - 'url': 'https://oload.win/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.download/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.cloud/f/4ZDnBXRWiB8', - 'only_matching': True, - }, { - # Its title has not got its extension but url has it - 'url': 
'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', - 'only_matching': True, - }, { - 'url': 'https://oload.cc/embed/5NEAbI2BDSk', - 'only_matching': True, - }, { - 'url': 'https://oload.icu/f/-_i4y_F_Hs8', - 'only_matching': True, - }, { - 'url': 'https://oload.fun/f/gb6G1H4sHXY', - 'only_matching': True, - }, { - 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', - 'only_matching': True, - }, { - 'url': 'https://oload.info/f/5NEAbI2BDSk', - 'only_matching': True, - }, { - 'url': 'https://openload.pw/f/WyKgK8s94N0', - 'only_matching': True, - }, { - 'url': 'https://oload.pw/f/WyKgK8s94N0', - 'only_matching': True, - }, { - 'url': 'https://oload.live/f/-Z58UZ-GR4M', - 'only_matching': True, - }, { - 'url': 'https://oload.space/f/IY4eZSst3u8/', - 'only_matching': True, - }, { - 'url': 'https://oload.services/embed/bs1NWj1dCag/', - 'only_matching': True, - }, { - 'url': 'https://oload.online/f/W8o2UfN1vNY/', - 'only_matching': True, - }, { - 'url': 'https://oload.monster/f/W8o2UfN1vNY/', - 'only_matching': True, - }, { - 'url': 'https://oload.press/embed/drTBl1aOTvk/', - 'only_matching': True, - }, { - 'url': 'https://oload.website/embed/drTBl1aOTvk/', - 'only_matching': True, - }, { - 'url': 'https://oload.life/embed/oOzZjNPw9Dc/', - 'only_matching': True, - }, { - 'url': 'https://oload.biz/f/bEk3Gp8ARr4/', - 'only_matching': True, - }, { - 'url': 'https://oload.best/embed/kkz9JgVZeWc/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.services/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.xyz/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.me/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://openloed.co/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oload.vip/f/kUEfGclsU9o', - 'only_matching': True, - }] - - @classmethod - def _extract_urls(cls, webpage): - return re.findall( - r'(?x)]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' - % 
(cls._DOMAINS, cls._EMBED_WORD), webpage) - - def _extract_decrypted_page(self, page_url, webpage, video_id): - phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id) - return webpage - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - - url_pattern = 'https://%s/%%s/%s/' % (host, video_id) - - for path in (self._EMBED_WORD, self._STREAM_WORD): - page_url = url_pattern % path - last = path == self._STREAM_WORD - webpage = self._download_webpage( - page_url, video_id, 'Downloading %s webpage' % path, - fatal=last) - if not webpage: - continue - if 'File not found' in webpage or 'deleted by the owner' in webpage: - if not last: - continue - raise ExtractorError('File not found', expected=True, video_id=video_id) - break - - webpage = self._extract_decrypted_page(page_url, webpage, video_id) - for element_id in self._URL_IDS: - decoded_id = get_element_by_id(element_id, webpage) - if decoded_id: - break - if not decoded_id: - decoded_id = self._search_regex( - (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', - r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, - 'stream URL') - video_url = 'https://%s/%s/%s?mime=true' % (host, self._REDIR_WORD, decoded_id) - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r']+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - entries = self._parse_html5_media_entries(page_url, webpage, video_id) - entry = entries[0] if entries else {} - subtitles = entry.get('subtitles') - - return { - 'id': video_id, - 'title': title, - 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), - 
'url': video_url, - 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), - 'subtitles': subtitles, - } - - -class VerystreamIE(OpenloadIE): - IE_NAME = 'verystream' - - _DOMAINS = r'(?:verystream\.com|woof\.tube)' - _VALID_URL = r'''(?x) - https?:// - (?P - (?:www\.)? - %s - )/ - (?:stream|e)/ - (?P[a-zA-Z0-9-_]+) - ''' % _DOMAINS - _EMBED_WORD = 'e' - _STREAM_WORD = 'stream' - _REDIR_WORD = 'gettoken' - _URL_IDS = ('videolink', ) - _TESTS = [{ - 'url': 'https://verystream.com/stream/c1GWQ9ngBBx/', - 'md5': 'd3e8c5628ccb9970b65fd65269886795', - 'info_dict': { - 'id': 'c1GWQ9ngBBx', - 'ext': 'mp4', - 'title': 'Big Buck Bunny.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://verystream.com/e/c1GWQ9ngBBx/', - 'only_matching': True, - }] - - def _extract_decrypted_page(self, page_url, webpage, video_id): - return webpage # for Verystream, the webpage is already decrypted diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py deleted file mode 100644 index f1e17dd88..000000000 --- a/youtube_dl/extractor/streamango.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - js_to_json, -) - - -class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', - 'md5': 'e992787515a182f55e38fc97588d802a', - 'info_dict': { - 'id': 'clapasobsptpkdfe', - 'ext': 'mp4', - 'title': '20170315_150006.mp4', - } - }, { - # no og:title - 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', - 'info_dict': { - 'id': 'foqebrpftarclpob', - 'ext': 'mp4', - 'title': 'foqebrpftarclpob', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 
'gone', - }, { - 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', - 'only_matching': True, - }, { - 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', - 'only_matching': True, - }, { - 'url': 'https://streamcherry.com/f/clapasobsptpkdfe/', - 'only_matching': True, - }] - - def _real_extract(self, url): - def decrypt_src(encoded, val): - ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' - encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) - decoded = '' - sm = [None] * 4 - i = 0 - str_len = len(encoded) - while i < str_len: - for j in range(4): - sm[j % 4] = ALPHABET.index(encoded[i]) - i += 1 - char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val - decoded += compat_chr(char_code) - if sm[2] != 0x40: - char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) - decoded += compat_chr(char_code) - if sm[3] != 0x40: - char_code = ((sm[2] & 0x3) << 0x6) | sm[3] - decoded += compat_chr(char_code) - return decoded - - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage, default=video_id) - - formats = [] - for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): - mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) - if mobj is None: - continue - - format_ = format_.replace(mobj.group(0), '') - - video = self._parse_json( - format_, video_id, transform_source=js_to_json, - fatal=False) or {} - - mobj = re.search( - r'([\'"])(?P(?:(?!\1).)+)\1\s*,\s*(?P\d+)', - mobj.group(1)) - if mobj is None: - continue - - src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) - if not src: - continue - - ext = determine_ext(src, default_ext=None) - if video.get('type') == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': ext or 'mp4', - 'width': 
int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'tbr': int_or_none(video.get('bitrate')), - }) - - if not formats: - error = self._search_regex( - r']+\bclass=["\']lead[^>]+>(.+?)

', webpage, - 'error', default=None) - if not error and '>Sorry' in webpage: - error = 'Video %s is not available' % video_id - if error: - raise ExtractorError(error, expected=True) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'url': url, - 'title': title, - 'formats': formats, - } From 681ac7c92abbbd55be9796de86c2cc0d1d70a4c9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 27 Nov 2019 13:57:30 +0100 Subject: [PATCH 136/154] [vimeo] improve extraction - fix review extraction - fix ondemand extraction - make password protected player case as an expected error(closes #22896) - simplify channel based extractors code --- youtube_dl/extractor/vimeo.py | 177 +++++++++++++++++----------------- 1 file changed, 87 insertions(+), 90 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9abd59d98..baa46d5f3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -15,18 +15,20 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, determine_ext, + dict_get, ExtractorError, js_to_json, int_or_none, merge_dicts, - NO_DEFAULT, OnDemandPagedList, parse_filesize, RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, + str_or_none, try_get, unified_timestamp, unsmuggle_url, @@ -210,7 +212,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_uploader_url = owner.get('url') return { - 'id': video_id, + 'id': str_or_none(video_data.get('id')) or video_id, 'title': self._live_title(video_title) if is_live else video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, @@ -258,11 +260,11 @@ class VimeoIE(VimeoBaseInfoExtractor): (?: (?: www| - (?Pplayer) + player ) \. )? - vimeo(?Ppro)?\.com/ + vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? 
(?: @@ -284,7 +286,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '56015672', 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479', + 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', @@ -293,6 +295,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'license': 'by-sa', }, + 'params': { + 'format': 'best[protocol=https]', + }, }, { 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', @@ -305,8 +310,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + }, + 'params': { + 'format': 'best[protocol=https]', }, }, { @@ -323,6 +333,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 3610, 'description': None, }, + 'params': { + 'format': 'best[protocol=https]', + }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -341,6 +355,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -441,10 +456,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': '10Ft Films', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', 'uploader_id': 'tenfootfilms', + 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384', + 'upload_date': '20130830', + 'timestamp': 1377853339, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 
'http://player.vimeo.com/video/68375962', @@ -459,6 +478,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -523,7 +543,7 @@ class VimeoIE(VimeoBaseInfoExtractor): def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option') + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -552,28 +572,26 @@ class VimeoIE(VimeoBaseInfoExtractor): r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) orig_url = url - if mobj.group('pro'): + is_pro = 'vimeopro.com/' in url + is_player = '://player.vimeo.com/video/' in url + if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) - elif mobj.group('player'): + if not url: + url = 'https://vimeo.com/' + video_id + elif is_player: url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id - # Retrieve video webpage to extract further information - request = sanitized_Request(url, headers=headers) try: - webpage, urlh = self._download_webpage_handle(request, video_id) + # Retrieve video webpage to extract further information + webpage, urlh = self._download_webpage_handle( + url, video_id, headers=headers) redirect_url = compat_str(urlh.geturl()) - # Some URLs redirect to ondemand can't be extracted with - # this extractor right away thus should be passed 
through - # ondemand extractor (e.g. https://vimeo.com/73445910) - if VimeoOndemandIE.suitable(redirect_url): - return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -600,6 +618,7 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None + video_description = None # Extract the config JSON try: @@ -611,17 +630,17 @@ class VimeoIE(VimeoBaseInfoExtractor): # Sometimes new react-based page is served instead of old one that require # different config URL extraction approach (see # https://github.com/ytdl-org/youtube-dl/pull/7209) - vimeo_clip_page_config = self._search_regex( - r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, - 'vimeo clip page config') - page_config = self._parse_json(vimeo_clip_page_config, video_id) + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config'), video_id) config_url = page_config['player']['config_url'] cc_license = page_config.get('cc_license') timestamp = try_get( page_config, lambda x: x['clip']['uploaded_on'], compat_str) - config_json = self._download_webpage(config_url, video_id) - config = json.loads(config_json) + video_description = clean_html(dict_get( + page_config, ('description', 'description_html_escaped'))) + config = self._download_json(config_url, video_id) except RegexNotFoundError: # For pro videos or player.vimeo.com urls # We try to find out to which variable is assigned the config dic @@ -675,14 +694,14 @@ class VimeoIE(VimeoBaseInfoExtractor): {'force_feature_id': True}), 'Vimeo') # Extract video description - - video_description = self._html_search_regex( - r'(?s)]*>(.*?)
', - webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_regex( + r'(?s)]*>(.*?)
', + webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( 'description', webpage, default=None) - if not video_description and mobj.group('pro'): + if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, note='Downloading webpage for description', @@ -690,7 +709,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not mobj.group('player'): + if not video_description and not is_player: self._downloader.report_warning('Cannot find video description') # Extract upload date @@ -747,9 +766,9 @@ class VimeoIE(VimeoBaseInfoExtractor): return info_dict -class VimeoOndemandIE(VimeoBaseInfoExtractor): +class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -761,24 +780,32 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader': 'גם סרטים', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', + 'description': 'md5:4c027c965e439de4baab621e48b60791', + 'upload_date': '20140906', + 'timestamp': 1410032453, }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', 'info_dict': { - 'id': '126682985', + 'id': '126584684', 'ext': 'mp4', 'title': 'Rävlock, rätt läte på rätt plats', 'uploader': 'Lindroth & Norin', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847', - 'uploader_id': 'user14430847', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', + 'uploader_id': 
'lindrothnorin', + 'description': 'md5:c3c46a90529612c8279fb6af803fc0df', + 'upload_date': '20150502', + 'timestamp': 1430586422, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -790,16 +817,6 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return self.url_result( - # Some videos require Referer to be passed along with og:video:url - # similarly to generic vimeo embeds (e.g. - # https://vimeo.com/ondemand/36938/126682985). - VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), - VimeoIE.ie_key()) - class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' @@ -815,6 +832,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): }, 'playlist_mincount': 25, }] + _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' def _page_url(self, base_url, pagenum): return '%s/videos/page:%d/' % (base_url, pagenum) @@ -886,14 +904,13 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self.playlist_result(title_and_entries, list_id, list_title) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id) + channel_id = self._match_id(url) + return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r']+?class="user">([^<>]+?)' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -903,11 +920,7 @@ class VimeoUserIE(VimeoChannelIE): }, 'playlist_mincount': 66, }] - - def 
_real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/%s' class VimeoAlbumIE(VimeoChannelIE): @@ -969,25 +982,18 @@ class VimeoAlbumIE(VimeoChannelIE): r'\s*(.+?)(?:\s+on Vimeo)?', webpage, 'title', fatal=False)) -class VimeoGroupsIE(VimeoAlbumIE): +class VimeoGroupsIE(VimeoChannelIE): IE_NAME = 'vimeo:group' - _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' + _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ - 'url': 'https://vimeo.com/groups/rolexawards', + 'url': 'https://vimeo.com/groups/kattykay', 'info_dict': { - 'id': 'rolexawards', - 'title': 'Rolex Awards for Enterprise', + 'id': 'kattykay', + 'title': 'Katty Kay', }, - 'playlist_mincount': 73, + 'playlist_mincount': 27, }] - - def _extract_list_title(self, webpage): - return self._og_search_title(webpage, fatal=False) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s' class VimeoReviewIE(VimeoBaseInfoExtractor): @@ -1003,7 +1009,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'title': "DICK HARDWICK 'Comedian'", 'uploader': 'Richard Hardwick', 'uploader_id': 'user21297594', - } + 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", + }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1016,7 +1024,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'duration': 2773, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', - } + }, + 'skip': 'video gone', }, { 'note': 'Password protected', 'url': 
'https://vimeo.com/user37284429/review/138823582/c4d865efde', @@ -1036,32 +1045,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() - def _get_config_url(self, webpage_url, video_id, video_password_verified=False): - webpage = self._download_webpage(webpage_url, video_id) - config_url = self._html_search_regex( - r'data-config-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'config URL', default=None, group='url') - if not config_url: - data = self._parse_json(self._search_regex( - r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', - default=NO_DEFAULT if video_password_verified else '{}'), video_id) - config = data.get('vimeo_esi', {}).get('config', {}) - config_url = config.get('configUrl') or try_get(config, lambda x: x['clipData']['configUrl']) - if config_url is None: - self._verify_video_password(webpage_url, video_id, webpage) - config_url = self._get_config_url( - webpage_url, video_id, video_password_verified=True) - return config_url - def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - config_url = self._get_config_url(url, video_id) + clip_data = self._download_json( + page_url.replace('/review/', '/review/data/'), + video_id)['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) - source_format = self._extract_original_format(page_url, video_id) + source_format = self._extract_original_format( + page_url + '/action', video_id) if source_format: info_dict['formats'].append(source_format) self._vimeo_sort_formats(info_dict['formats']) + info_dict['description'] = clean_html(clip_data.get('description')) return info_dict From e3f00f139fc227217325c8e84e0b340e12ee9bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Nov 2019 23:09:48 +0700 Subject: [PATCH 137/154] [ChangeLog] Actualize [ci skip] --- ChangeLog | 33 +++++++++++++++++++++++++++++++++ 
1 file changed, 33 insertions(+) diff --git a/ChangeLog b/ChangeLog index daaff3eef..d724d75ce 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,36 @@ +version + +Core ++ [utils] Add generic caesar cipher and rot47 +* [utils] Handle rd-suffixed day parts in unified_strdate (#23199) + +Extractors +* [vimeo] Improve extraction + * Fix review extraction + * Fix ondemand extraction + * Make password protected player case as an expected error (#22896) + * Simplify channel based extractors code +- [openload] Remove extractor (#11999) +- [verystream] Remove extractor +- [streamango] Remove extractor (#15406) +* [dailymotion] Improve extraction + * Extract http formats included in m3u8 manifest + * Fix user extraction (#3553, #21415) + + Add suport for User Authentication (#11491) + * Fix password protected videos extraction (#23176) + * Respect age limit option and family filter cookie value (#18437) + * Handle video url playlist query param + * Report allowed countries for geo-restricted videos +* [corus] Improve extraction + + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com + and disneylachaine.ca (#20861) + + Add support for self hosted videos (#22075) + * Detect DRM protection (#14910, #9164) +* [vivo] Fix extraction (#22328, #22279) ++ [bitchute] Extract upload date (#22990, #23193) +* [soundcloud] Update client id (#23214) + + version 2019.11.22 Core From b568561eba6f4aceb87419e21aba11567c5de7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Nov 2019 23:25:25 +0700 Subject: [PATCH 138/154] release 2019.11.28 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 --- youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 17 deletions(-) diff 
--git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d3e11cdcf..3a94bd621 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.22 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 51bf4db3b..72bee12aa 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 19025ff25..ddf67e951 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a381b6979..7122e2714 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.22 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 9c945d5ec..a93882b39 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index d724d75ce..d4f809fc6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.11.28 Core + [utils] Add generic caesar cipher and rot47 diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3dcb026c5..2744dfca8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -618,7 +618,6 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** - - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories @@ -825,7 +824,6 @@ - **Steam** - **Stitcher** - **Streamable** - - **Streamango** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -976,7 +974,6 @@ - **Vbox7** - **VeeHD** - **Veoh** - - **verystream** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 361809681..1227abc0a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.11.22' +__version__ = '2019.11.28' From 348c6bf1c1a00eec323d6e21ff7b9b12699afe04 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:05:06 +0100 Subject: [PATCH 139/154] [utils] handle int values passed to str_to_int --- test/test_utils.py | 1 + youtube_dl/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e83c8ea11..fed94a906 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -499,6 +499,7 @@ class TestUtil(unittest.TestCase): def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) 
self.assertEqual(str_to_int('123.456'), 123456) + self.assertEqual(str_to_int(523), 523) def test_url_basename(self): self.assertEqual(url_basename('http://foo.de/'), '') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b14603d8a..328f037a8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3519,8 +3519,8 @@ def str_or_none(v, default=None): def str_to_int(int_str): """ A more relaxed version of int_or_none """ - if int_str is None: - return None + if not isinstance(int_str, compat_str): + return int_str int_str = re.sub(r'[,\.\+]', '', int_str) return int(int_str) From 7f641d2c7a68b70d6c1e273af108741e5779bc28 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:06:34 +0100 Subject: [PATCH 140/154] [adobetv] improve extaction - use OnDemandPagedList for list extractors - reduce show extraction requests - extract original video format and subtitles - add support for adobe tv embeds --- youtube_dl/extractor/adobetv.py | 239 ++++++++++++++++++++--------- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 166 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 008c98e51..80060f037 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -1,25 +1,119 @@ from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - parse_duration, - unified_strdate, - str_to_int, - int_or_none, float_or_none, + int_or_none, ISO639Utils, - determine_ext, + OnDemandPagedList, + parse_duration, + str_or_none, + str_to_int, + unified_strdate, ) class AdobeTVBaseIE(InfoExtractor): - _API_BASE_URL = 'http://tv.adobe.com/api/v4/' + def _call_api(self, path, video_id, query, note=None): + return self._download_json( + 'http://tv.adobe.com/api/v4/' + path, + video_id, note, query=query)['data'] + + def _parse_subtitles(self, video_data, url_key): + subtitles = {} 
+ for translation in video_data.get('translations', []): + vtt_path = translation.get(url_key) + if not vtt_path: + continue + lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + subtitles.setdefault(lang, []).append({ + 'ext': 'vtt', + 'url': vtt_path, + }) + return subtitles + + def _parse_video_data(self, video_data): + video_id = compat_str(video_data['id']) + title = video_data['title'] + + s3_extracted = False + formats = [] + for source in video_data.get('videos', []): + source_url = source.get('url') + if not source_url: + continue + f = { + 'format_id': source.get('quality_level'), + 'fps': int_or_none(source.get('frame_rate')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + 'width': int_or_none(source.get('width')), + 'url': source_url, + } + original_filename = source.get('original_filename') + if original_filename: + if not (f.get('height') and f.get('width')): + mobj = re.search(r'_(\d+)x(\d+)', original_filename) + if mobj: + f.update({ + 'height': int(mobj.group(2)), + 'width': int(mobj.group(1)), + }) + if original_filename.startswith('s3://') and not s3_extracted: + formats.append({ + 'format_id': 'original', + 'preference': 1, + 'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'), + }) + s3_extracted = True + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), + 'formats': formats, + 'subtitles': self._parse_subtitles(video_data, 'vtt'), + } + + +class AdobeTVEmbedIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:embed' + _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P\d+)' + _TEST = { + 'url': 
'https://tv.adobe.com/embed/22/4153', + 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a', + 'info_dict': { + 'id': '4153', + 'ext': 'flv', + 'title': 'Creating Graphics Optimized for BlackBerry', + 'description': 'md5:eac6e8dced38bdaae51cd94447927459', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20091109', + 'duration': 377, + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0] + return self._parse_video_data(video_data) class AdobeTVIE(AdobeTVBaseIE): + IE_NAME = 'adobetv' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?watch/(?P[^/]+)/(?P[^/]+)' _TEST = { @@ -42,45 +136,33 @@ class AdobeTVIE(AdobeTVBaseIE): if not language: language = 'en' - video_data = self._download_json( - self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), - urlname)['data'][0] - - formats = [{ - 'url': source['url'], - 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('video_data_rate')), - } for source in video_data['videos']] - self._sort_formats(formats) - - return { - 'id': compat_str(video_data['id']), - 'title': video_data['title'], - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'upload_date': unified_strdate(video_data.get('start_date')), - 'duration': parse_duration(video_data.get('duration')), - 'view_count': str_to_int(video_data.get('playcount')), - 'formats': formats, - } + video_data = self._call_api( + 'episode/get', urlname, { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + 'urlname': urlname, + })[0] + return self._parse_video_data(video_data) class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): - 
def _parse_page_data(self, page_data): - return [self.url_result(self._get_element_url(element_data)) for element_data in page_data] + _PAGE_SIZE = 25 - def _extract_playlist_entries(self, url, display_id): - page = self._download_json(url, display_id) - entries = self._parse_page_data(page['data']) - for page_num in range(2, page['paging']['pages'] + 1): - entries.extend(self._parse_page_data( - self._download_json(url + '&page=%d' % page_num, display_id)['data'])) - return entries + def _fetch_page(self, display_id, query, page): + page += 1 + query['page'] = page + for element_data in self._call_api( + self._RESOURCE, display_id, query, 'Download Page %d' % page): + yield self._process_data(element_data) + + def _extract_playlist_entries(self, display_id, query): + return OnDemandPagedList(functools.partial( + self._fetch_page, display_id, query), self._PAGE_SIZE) class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:show' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?show/(?P[^/]+)' _TEST = { @@ -92,26 +174,31 @@ class AdobeTVShowIE(AdobeTVPlaylistBaseIE): }, 'playlist_mincount': 136, } - - def _get_element_url(self, element_data): - return element_data['urls'][0] + _RESOURCE = 'episode' + _process_data = AdobeTVBaseIE._parse_video_data def _real_extract(self, url): language, show_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' - query = 'language=%s&show_urlname=%s' % (language, show_urlname) + query = { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + } - show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0] + show_data = self._call_api( + 'show/get', show_urlname, query)[0] return self.playlist_result( - self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname), - compat_str(show_data['id']), - show_data['show_name'], - show_data['show_description']) + 
self._extract_playlist_entries(show_urlname, query), + str_or_none(show_data.get('id')), + show_data.get('show_name'), + show_data.get('show_description')) class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:channel' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?channel/(?P[^/]+)(?:/(?P[^/]+))?' _TEST = { @@ -121,24 +208,30 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): }, 'playlist_mincount': 96, } + _RESOURCE = 'show' - def _get_element_url(self, element_data): - return element_data['url'] + def _process_data(self, show_data): + return self.url_result( + show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id'))) def _real_extract(self, url): language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' - query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) + query = { + 'channel_urlname': channel_urlname, + 'language': language, + } if category_urlname: - query += '&category_urlname=%s' % category_urlname + query['category_urlname'] = category_urlname return self.playlist_result( - self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname), + self._extract_playlist_entries(channel_urlname, query), channel_urlname) -class AdobeTVVideoIE(InfoExtractor): +class AdobeTVVideoIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:video' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' _TEST = { @@ -160,38 +253,36 @@ class AdobeTVVideoIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) + title = video_data['title'] - formats = [{ - 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), - 'url': source['src'], - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('bitrate')), - } for source in video_data['sources']] + formats = [] + sources = 
video_data.get('sources') or [] + for source in sources: + source_src = source.get('src') + if not source_src: + continue + formats.append({ + 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), + 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'height': int_or_none(source.get('height') or None), + 'tbr': int_or_none(source.get('bitrate') or None), + 'width': int_or_none(source.get('width') or None), + 'url': source_src, + }) self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) - for source in video_data['sources']])) - - subtitles = {} - for translation in video_data.get('translations', []): - lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) - if lang_id not in subtitles: - subtitles[lang_id] = [] - subtitles[lang_id].append({ - 'url': translation['vttPath'], - 'ext': 'vtt', - }) + for source in sources])) return { 'id': video_id, 'formats': formats, - 'title': video_data['title'], + 'title': title, 'description': video_data.get('description'), - 'thumbnail': video_data['video'].get('poster'), + 'thumbnail': video_data.get('video', {}).get('poster'), 'duration': duration, - 'subtitles': subtitles, + 'subtitles': self._parse_subtitles(video_data, 'vttPath'), } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0e349b778..0f27c9678 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -21,6 +21,7 @@ from .acast import ( from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( + AdobeTVEmbedIE, AdobeTVIE, AdobeTVShowIE, AdobeTVChannelIE, From a15adbe461584e2e631d1be97805e81c17cfd3fe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:12:55 +0100 Subject: [PATCH 141/154] [channel9] 
reduce response size and update tests --- youtube_dl/extractor/channel9.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 81108e704..09cacf6d3 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -32,7 +32,7 @@ class Channel9IE(InfoExtractor): 'upload_date': '20130828', 'session_code': 'KOS002', 'session_room': 'Arena 1A', - 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'], + 'session_speakers': 'count:5', }, }, { 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', @@ -64,15 +64,15 @@ class Channel9IE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', - 'info_dict': { - 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', - 'title': 'Channel 9', - }, - 'playlist_mincount': 100, }, { 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'info_dict': { + 'id': 'Events/DEVintersection/DEVintersection-2016', + 'title': 'DEVintersection 2016 Orlando Sessions', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', 'only_matching': True, }, { 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', @@ -112,11 +112,11 @@ class Channel9IE(InfoExtractor): episode_data), content_path) content_id = episode_data['contentId'] is_session = '/Sessions(' in episode_data['api'] - content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + 
'?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' if is_session: - content_url += '?$expand=Speakers' + content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' else: - content_url += '?$expand=Authors' + content_url += 'Authors,Body&$expand=Authors' content_data = self._download_json(content_url, content_id) title = content_data['Title'] @@ -210,7 +210,7 @@ class Channel9IE(InfoExtractor): 'id': content_id, 'title': title, 'description': clean_html(content_data.get('Description') or content_data.get('Body')), - 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), + 'thumbnail': content_data.get('VideoPlayerPreviewImage'), 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), 'timestamp': parse_iso8601(content_data.get('PublishedDate')), 'avg_rating': int_or_none(content_data.get('Rating')), From 88a7a9089a0f3ccdd5e0e6f10b529652a24cbc7e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:22:54 +0100 Subject: [PATCH 142/154] [abcotvs] relax _VALID_URL regex and improve metadata extraction(closes #18014) --- youtube_dl/extractor/abcotvs.py | 79 ++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 03b92a39c..0bc69a64f 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -4,29 +4,30 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + dict_get, int_or_none, - parse_iso8601, + try_get, ) class ABCOTVSIE(InfoExtractor): IE_NAME = 'abcotvs' IE_DESC = 'ABC Owned Television Stations' - _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _VALID_URL = 
r'https?://(?Pabc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', 'info_dict': { - 'id': '472581', + 'id': '472548', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates vintage synthesizers', + 'title': 'East Bay museum celebrates synthesized music', 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421123075, + 'timestamp': 1421118520, 'upload_date': '20150113', - 'uploader': 'Jonathan Bloom', }, 'params': { # m3u8 download @@ -37,39 +38,63 @@ class ABCOTVSIE(InfoExtractor): 'url': 'http://abc7news.com/472581', 'only_matching': True, }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] - webpage = self._download_webpage(url, display_id) + data = self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] - m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] - - formats = self._extract_m3u8_formats(m3u8, 
display_id, 'mp4') + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) self._sort_formats(formats) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'
\s*