diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 444a86ee3..40a869113 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.06 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a1c69a45b..7b10df3d4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d391b6d6b..04bbcfa68 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7422446b0..a9e231817 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.06 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 247d3594d..4a3d32d51 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 0efae7d9e..f753972c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,38 @@ +version 2020.03.24 + +Core +- [utils] Revert support for cookie files with spaces used instead of tabs + +Extractors +* [teachable] Update upskillcourses and gns3 domains +* [generic] Look for teachable embeds before wistia ++ [teachable] Extract chapter metadata (#24421) ++ [bilibili] Add support for player.bilibili.com (#24402) ++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) +* [limelight] Remove disabled API requests (#24255) +* [soundcloud] Fix download URL extraction (#24394) ++ [cbc:watch] Add support for authentication (#19160) +* [hellporno] Fix extraction (#24399) +* [xtube] Fix formats extraction (#24348) +* [ndr] Fix extraction (#24326) +* [nhk] Update m3u8 URL and use native HLS downloader (#24329) +- [nhk] Remove obsolete rtmp formats (#24329) +* [nhk] Relax URL regular expression (#24329) +- [vimeo] Revert fix showcase password protected video extraction (#24224) + + +version 2020.03.08 + +Core ++ [utils] Add support for cookie files with spaces used instead of tabs + +Extractors ++ [pornhub] Add support for pornhubpremium.com (#24288) +- [youtube] Remove outdated code and unnecessary requests +* [youtube] Improve extraction in 429 HTTP error conditions (#24283) +* [nhk] Update API version (#24270) + + version 2020.03.06 Extractors diff --git a/README.md b/README.md index 01f975958..4f54a5240 100644 --- a/README.md +++ b/README.md @@ -835,7 +835,9 @@ In February 2015, the new YouTube player contained a character sequence in a str ### HTTP Error 429: Too Many 
Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block, meaning that you can regain access after solving a CAPTCHA. Just open a browser, solve the CAPTCHA the service presents, and after that [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. Note that if your machine has multiple external IPs then you should also pass exactly the same IP you used for solving the CAPTCHA with [`--source-address`](#network-options). You may also need to pass the `User-Agent` HTTP header of your browser with [`--user-agent`](#workarounds). + +If this is not the case (the service does not present a CAPTCHA to solve) then you can contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. 
### SyntaxError: Non-ASCII character diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 02bc088ab..174b83bf3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,6 +98,7 @@ - **BiliBili** - **BilibiliAudio** - **BilibiliAudioAlbum** + - **BiliBiliPlayer** - **BioBioChileTV** - **BIQLE** - **BitChute** diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index e7831a7e2..2959038ed 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -27,7 +27,18 @@ md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P\d+)/play#)(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|bangumi)\.)? + bilibili\.(?:tv|com)/ + (?: + (?: + video/[aA][vV]| + anime/(?P\d+)/play\# + )(?P\d+)| + video/[bB][vV](?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -95,6 +106,10 @@ class BiliBiliIE(InfoExtractor): 'skip_download': True, # Test metadata only }, }] + }, { + # new BV video id format + 'url': 'https://www.bilibili.com/video/BV1JE411F741', + 'only_matching': True, }] _APP_KEY = 'iVGUTjsxvpLeuDCf' @@ -112,7 +127,7 @@ class BiliBiliIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('id') or mobj.group('id_bv') anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) @@ -488,3 +503,16 @@ class BilibiliNewBangumiIE(InfoExtractor): 'Referer': url, } } + +class BiliBiliPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P\d+)' + _TEST = { + 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 
'http://www.bilibili.tv/video/av%s/' % video_id, + ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 751a3a8f2..fd5ec6033 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,8 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import hashlib import json import re +from xml.sax.saxutils import escape from .common import InfoExtractor from ..compat import ( @@ -216,6 +218,29 @@ class CBCWatchBaseIE(InfoExtractor): 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } _GEO_COUNTRIES = ['CA'] + _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' + _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' + _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcwatch' + + def _signature(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._API_KEY} + resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + # token + query = { + 'access_token': access_token, + 'apikey': self._API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) + return resp['signature'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -239,7 +264,8 @@ class CBCWatchBaseIE(InfoExtractor): def _real_initialize(self): if self._valid_device_token(): return - device = self._downloader.cache.load('cbcwatch', 'device') or {} + device = self._downloader.cache.load( + 'cbcwatch', self._cache_device_key()) or {} self._device_id, self._device_token = device.get('id'), device.get('token') if self._valid_device_token(): return @@ -248,16 +274,30 @@ class CBCWatchBaseIE(InfoExtractor): def _valid_device_token(self): return 
self._device_id and self._device_token + def _cache_device_key(self): + email, _ = self._get_login_info() + return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' + def _register_device(self): - self._device_id = self._device_token = None result = self._download_xml( self._API_BASE_URL + 'device/register', None, 'Acquiring device token', data=b'web') self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) + email, password = self._get_login_info() + if email and password: + signature = self._signature(email, password) + data = '{0}{1}web'.format( + escape(signature), escape(self._device_id)).encode() + url = self._API_BASE_URL + 'device/login' + result = self._download_xml( + url, None, data=data, + headers={'content-type': 'application/xml'}) + self._device_token = xpath_text(result, 'token', fatal=True) + else: + self._device_token = xpath_text(result, 'deviceToken', fatal=True) self._downloader.cache.store( - 'cbcwatch', 'device', { + 'cbcwatch', self._cache_device_key(), { 'id': self._device_id, 'token': self._device_token, }) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a6a4bd5f6..149d80f9d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -106,6 +106,7 @@ from .bilibili import ( BilibiliAudioIE, BilibiliAudioAlbumIE, BilibiliNewBangumiIE, + BiliBiliPlayerIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d1ec56be9..a495ee15a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2536,6 +2536,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) + # Look for Teachable embeds, must be before Wistia + teachable_url = TeachableIE._extract_url(webpage, 
url) + if teachable_url: + return self.url_result(teachable_url) + # Look for embedded Wistia player wistia_urls = WistiaIE._extract_urls(webpage) if wistia_urls: @@ -3141,10 +3146,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 0ee8ea712..fae425103 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,12 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + merge_dicts, remove_end, - determine_ext, + unified_timestamp, ) @@ -14,15 +13,21 @@ class HellPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P[^/]+)' _TESTS = [{ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', - 'md5': '1fee339c610d2049699ef2aa699439f1', + 'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3', 'info_dict': { 'id': '149116', 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', + 'description': 'md5:9a72922749354edb1c4b6e540ad3d215', + 'categories': list, 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 240, + 'timestamp': 1398762720, + 'upload_date': '20140429', + 'view_count': int, 'age_limit': 18, - } + }, }, { 'url': 'http://hellporno.net/v/186271/', 'only_matching': True, @@ -36,40 +41,36 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_search_regex( r'([^<]+)', webpage, 'title'), ' - Hell Porno') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 
'flashvars'), - display_id, transform_source=js_to_json) + info = self._parse_html5_media_entries(url, webpage, display_id)[0] + self._sort_formats(info['formats']) - video_id = flashvars.get('video_id') - thumbnail = flashvars.get('preview_url') - ext = determine_ext(flashvars.get('postfix'), 'mp4') + video_id = self._search_regex( + (r'chs_object\s*=\s*["\'](\d+)', + r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id', + default=display_id) + description = self._search_regex( + r'class=["\']desc_video_view_v2[^>]+>([^<]+)', webpage, + 'description', fatal=False) + categories = [ + c.strip() + for c in self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + if c.strip()] + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, fatal=False)) + timestamp = unified_timestamp(self._og_search_property( + 'video:release_date', webpage, fatal=False)) + view_count = int_or_none(self._search_regex( + r'>Views\s+(\d+)', webpage, 'view count', fatal=False)) - formats = [] - for video_url_key in ['video_url', 'video_alt_url']: - video_url = flashvars.get(video_url_key) - if not video_url: - continue - video_text = flashvars.get('%s_text' % video_url_key) - fmt = { - 'url': video_url, - 'ext': ext, - 'format_id': video_text, - } - m = re.search(r'^(?P\d+)[pP]', video_text) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - self._sort_formats(formats) - - categories = self._html_search_meta( - 'keywords', webpage, 'categories', default='').split(',') - - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, 'categories': categories, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, 'age_limit': 18, - 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 729d8de50..39f74d282 100644 --- 
a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -18,7 +18,6 @@ from ..utils import ( class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' - _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' @classmethod def _extract_urls(cls, webpage, source_url): @@ -70,7 +69,8 @@ class LimelightBaseIE(InfoExtractor): try: return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) + item_id, 'Downloading PlaylistService %s JSON' % method, + fatal=fatal, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] @@ -79,22 +79,22 @@ class LimelightBaseIE(InfoExtractor): raise ExtractorError(error, expected=True) raise - def _call_api(self, organization_id, item_id, method): - return self._download_json( - self._API_URL % (organization_id, self._API_PATH, item_id, method), - item_id, 'Downloading API %s JSON' % method) - - def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None): + def _extract(self, item_id, pc_method, mobile_method, referer=None): pc = self._call_playlist_service(item_id, pc_method, referer=referer) - metadata = self._call_api(pc['orgId'], item_id, meta_method) - mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer) - return pc, mobile, metadata + mobile = self._call_playlist_service( + item_id, mobile_method, fatal=False, referer=referer) + return pc, mobile + + def _extract_info(self, pc, mobile, i, referer): + get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} + pc_item = get_item(pc, 'playlistItems') + mobile_item = get_item(mobile, 'mediaList') + video_id = 
pc_item.get('mediaId') or mobile_item['mediaId'] + title = pc_item.get('title') or mobile_item['title'] - def _extract_info(self, streams, mobile_urls, properties): - video_id = properties['media_id'] formats = [] urls = [] - for stream in streams: + for stream in pc_item.get('streams', []): stream_url = stream.get('url') if not stream_url or stream.get('drmProtected') or stream_url in urls: continue @@ -155,7 +155,7 @@ class LimelightBaseIE(InfoExtractor): }) formats.append(fmt) - for mobile_url in mobile_urls: + for mobile_url in mobile_item.get('mobileUrls', []): media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: @@ -179,54 +179,34 @@ class LimelightBaseIE(InfoExtractor): self._sort_formats(formats) - title = properties['title'] - description = properties.get('description') - timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) - duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) - filesize = int_or_none(properties.get('total_storage_in_bytes')) - categories = [properties.get('category')] - tags = properties.get('tags', []) - thumbnails = [{ - 'url': thumbnail['url'], - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] - subtitles = {} - for caption in properties.get('captions', []): - lang = caption.get('language_code') - subtitles_url = caption.get('url') - if lang and subtitles_url: - subtitles.setdefault(lang, []).append({ - 'url': subtitles_url, - }) - closed_captions_url = properties.get('closed_captions_url') - if closed_captions_url: - subtitles.setdefault('en', []).append({ - 'url': closed_captions_url, - 'ext': 'ttml', - }) + for flag in mobile_item.get('flags'): + if flag == 'ClosedCaptions': + closed_captions = self._call_playlist_service( + 
video_id, 'getClosedCaptionsDetailsByMediaId', + False, referer) or [] + for cc in closed_captions: + cc_url = cc.get('webvttFileUrl') + if not cc_url: + continue + lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + break + + get_meta = lambda x: pc_item.get(x) or mobile_item.get(x) return { 'id': video_id, 'title': title, - 'description': description, + 'description': get_meta('description'), 'formats': formats, - 'timestamp': timestamp, - 'duration': duration, - 'filesize': filesize, - 'categories': categories, - 'tags': tags, - 'thumbnails': thumbnails, + 'duration': float_or_none(get_meta('durationInMilliseconds'), 1000), + 'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'), 'subtitles': subtitles, } - def _extract_info_helper(self, pc, mobile, i, metadata): - return self._extract_info( - try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], - try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], - metadata) - class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -251,8 +231,6 @@ class LimelightMediaIE(LimelightBaseIE): 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 144.23, - 'timestamp': 1244136834, - 'upload_date': '20090604', }, 'params': { # m3u8 download @@ -268,30 +246,29 @@ class LimelightMediaIE(LimelightBaseIE): 'title': '3Play Media Overview Video', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 78.101, - 'timestamp': 1338929955, - 'upload_date': '20120605', - 'subtitles': 'mincount:9', + # TODO: extract all languages that were accessible via API + # 'subtitles': 'mincount:9', + 'subtitles': 'mincount:1', }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'media' - _API_PATH = 'media' def 
_real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) + source_url = smuggled_data.get('source_url') self._initialize_geo_bypass({ 'countries': smuggled_data.get('geo_countries'), }) - pc, mobile, metadata = self._extract( + pc, mobile = self._extract( video_id, 'getPlaylistByMediaId', - 'getMobilePlaylistByMediaId', 'properties', - smuggled_data.get('source_url')) + 'getMobilePlaylistByMediaId', source_url) - return self._extract_info_helper(pc, mobile, 0, metadata) + return self._extract_info(pc, mobile, 0, source_url) class LimelightChannelIE(LimelightBaseIE): @@ -313,6 +290,7 @@ class LimelightChannelIE(LimelightBaseIE): 'info_dict': { 'id': 'ab6a524c379342f9b23642917020c082', 'title': 'Javascript Sample Code', + 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', }, 'playlist_mincount': 3, }, { @@ -320,22 +298,23 @@ class LimelightChannelIE(LimelightBaseIE): 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'channel' - _API_PATH = 'channels' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) channel_id = self._match_id(url) + source_url = smuggled_data.get('source_url') - pc, mobile, medias = self._extract( + pc, mobile = self._extract( channel_id, 'getPlaylistByChannelId', 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', - 'media', smuggled_data.get('source_url')) + source_url) entries = [ - self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) - for i in range(len(medias['media_list']))] + self._extract_info(pc, mobile, i, source_url) + for i in range(len(pc['playlistItems']))] - return self.playlist_result(entries, channel_id, pc['title']) + return self.playlist_result( + entries, channel_id, pc.get('title'), mobile.get('description')) class LimelightChannelListIE(LimelightBaseIE): @@ -368,10 +347,12 @@ class LimelightChannelListIE(LimelightBaseIE): def _real_extract(self, url): channel_list_id = 
self._match_id(url) - channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') + channel_list = self._call_playlist_service( + channel_list_id, 'getMobileChannelListById') entries = [ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') for channel in channel_list['channelList']] - return self.playlist_result(entries, channel_list_id, channel_list['title']) + return self.playlist_result( + entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 9c8bf05af..2447c812e 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + merge_dicts, parse_iso8601, qualities, try_get, @@ -87,21 +88,25 @@ class NDRIE(NDRBaseIE): def _extract_embed(self, webpage, display_id): embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', fatal=True) + 'embedURL', webpage, 'embed URL', + default=None) or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url') description = self._search_regex( r']+itemprop="description">([^<]+)

', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', fatal=False)) - return { + webpage, 'upload date', default=None)) + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts({ '_type': 'url_transparent', 'url': embed_url, 'display_id': display_id, 'description': description, 'timestamp': timestamp, - } + }, info) class NJoyIE(NDRBaseIE): diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 6a2c6cb7b..de6a707c4 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[a-z]+-\d{8}-\d+)' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[^/]+?-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
_TESTS = [{ @@ -30,8 +30,11 @@ class NhkVodIE(InfoExtractor): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', + 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() @@ -82,15 +85,9 @@ class NhkVodIE(InfoExtractor): audio = episode['audio'] audio_path = audio['audio'] info['formats'] = self._extract_m3u8_formats( - 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', m3u8_id='hls', fatal=False) - for proto in ('rtmpt', 'rtmp'): - info['formats'].append({ - 'ext': 'flv', - 'format_id': proto, - 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), - 'vcodec': 'none', - }) + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang return info diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index dd5f17f11..80222d428 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -20,20 +20,16 @@ class PokemonIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Ol’ Raise and Switch!', 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - 'timestamp': 1511824728, - 'upload_date': '20171127', }, 'add_id': ['LimelightMedia'], }, { # no data-video-title - 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', 'info_dict': { - 'id': '99f3bae270bf4e5097274817239ce9c8', + 'id': 
'dfbaf830d7e54e179837c50c0c6cc0e1', 'ext': 'mp4', - 'title': 'Pokémon: The Rise of Darkrai', - 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', - 'timestamp': 1417778347, - 'upload_date': '20141205', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', }, 'add_id': ['LimelightMedia'], 'params': { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b8f65af7c..3567a3283 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -52,7 +52,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -149,6 +149,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, }] @staticmethod @@ -166,6 +169,13 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise ExtractorError( + 'PornHub Premium requires authentication.' 
+ ' You may want to use --cookies.', + expected=True) + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): @@ -405,7 +415,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -473,7 +483,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -588,7 +598,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a1372d389..ff6be0b54 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -27,6 +27,7 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + urlhandle_detect_ext, ) @@ -96,7 +97,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # not streamable song, preview + # geo-restricted { 'url': 
'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -108,17 +109,13 @@ class SoundcloudIE(InfoExtractor): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 30, + 'duration': 227.155, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - 'params': { - # rtmp - 'skip_download': True, - }, }, # private link { @@ -229,7 +226,6 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, - # not available via api.soundcloud.com/i1/tracks/id/streams { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', @@ -250,11 +246,9 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - 'expected_warnings': ['Unable to download JSON metadata'], } ] - _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' @@ -316,10 +310,9 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): + def _extract_info_dict(self, info, full_title=None, secret_token=None): track_id = compat_str(info['id']) title = info['title'] - track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] @@ -328,21 +321,22 @@ class SoundcloudIE(InfoExtractor): query['secret_token'] = secret_token if info.get('downloadable') and info.get('has_downloads_left'): - format_url = update_url_query( - info.get('download_url') or track_base_url + '/download', query) - format_urls.add(format_url) - if version == 2: - v1_info = self._download_json( - track_base_url, track_id, query=query, fatal=False) or {} - else: - v1_info = info - formats.append({ - 'format_id': 'download', - 
'ext': v1_info.get('original_format') or 'mp3', - 'filesize': int_or_none(v1_info.get('original_content_size')), - 'url': format_url, - 'preference': 10, - }) + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) def invalid_url(url): return not url or url in format_urls @@ -406,42 +400,11 @@ class SoundcloudIE(InfoExtractor): }, 'http' if protocol == 'progressive' else protocol, t.get('snipped') or '/preview/' in format_url) - if not formats: - # Old API, does not work for some tracks (e.g. - # https://soundcloud.com/giovannisarani/mezzo-valzer) - # and might serve preview URLs (e.g. 
- # http://www.soundcloud.com/snbrn/ele) - format_dict = self._download_json( - track_base_url + '/streams', track_id, - 'Downloading track url', query=query, fatal=False) or {} - - for key, stream_url in format_dict.items(): - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) - if mobj: - protocol, ext, abr = mobj.groups() - add_format({ - 'abr': abr, - 'ext': ext, - 'url': stream_url, - }, protocol) - - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - urlh = self._request_webpage( - HEADRequest(info.get('stream_url') or track_base_url + '/stream'), - track_id, query=query, fatal=False) - if urlh: - stream_url = urlh.geturl() - if not invalid_url(stream_url): - add_format({'url': stream_url}, 'http') - for f in formats: f['vcodec'] = 'none' + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() self._sort_formats(formats) user = info.get('user') or {} @@ -511,16 +474,10 @@ class SoundcloudIE(InfoExtractor): resolve_title += '/%s' % token info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) - if not info: - info = self._download_json( - info_json_url.replace(self._API_V2_BASE, self._API_BASE), - full_title, 'Downloading info JSON', query=query) - version = 1 + info_json_url, full_title, 'Downloading info JSON', query=query) - return self._extract_info_dict(info, full_title, token, version) + return self._extract_info_dict(info, full_title, token) class SoundcloudPlaylistBaseIE(SoundcloudIE): diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 4316a6962..a75369dbe 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -7,7 +7,9 @@ from .wistia import WistiaIE from 
..utils import ( clean_html, ExtractorError, + int_or_none, get_element_by_class, + strip_or_none, urlencode_postdata, urljoin, ) @@ -19,8 +21,8 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here - 'upskillcourses.com': 'upskill', - 'academy.gns3.com': 'gns3', + 'v1.upskillcourses.com': 'upskill', + 'gns3.teachable.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', 'market.saleshacker.com': 'saleshacker', @@ -109,27 +111,29 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', 'info_dict': { - 'id': 'uzw6zw58or', - 'ext': 'mp4', - 'title': 'Welcome to the Course!', - 'description': 'md5:65edb0affa582974de4625b9cdea1107', - 'duration': 138.763, - 'timestamp': 1479846621, - 'upload_date': '20161122', + 'id': 'untlgzk1v7', + 'ext': 'bin', + 'title': 'Overview', + 'description': 'md5:071463ff08b86c208811130ea1c2464c', + 'duration': 736.4, + 'timestamp': 1542315762, + 'upload_date': '20181115', + 'chapter': 'Welcome', + 'chapter_number': 1, }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', 'only_matching': True, }, { - 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'only_matching': True, }] @@ -173,11 +177,34 @@ class TeachableIE(TeachableBaseIE): title = self._og_search_title(webpage, default=None) + chapter = None + 
chapter_number = None + section_item = self._search_regex( + r'(?s)(?P
  • ]+\bdata-lecture-id=["\']%s[^>]+>.+?
  • )' % video_id, + webpage, 'section item', default=None, group='li') + if section_item: + chapter_number = int_or_none(self._search_regex( + r'data-ss-position=["\'](\d+)', section_item, 'section id', + default=None)) + if chapter_number is not None: + sections = [] + for s in re.findall( + r'(?s)]+\bclass=["\']section-title[^>]+>(.+?)', webpage): + section = strip_or_none(clean_html(s)) + if not section: + sections = [] + break + sections.append(section) + if chapter_number <= len(sections): + chapter = sections[chapter_number - 1] + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, + 'chapter': chapter, + 'chapter_number': chapter_number, } for wistia_url in wistia_urls] return self.playlist_result(entries, video_id, title) @@ -192,20 +219,20 @@ class TeachableCourseIE(TeachableBaseIE): /(?:courses|p)/(?:enrolled/)?(?P[^/?#&]+) ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, }, { - 'url': 'http://upskillcourses.com/courses/119763/', + 'url': 'http://v1.upskillcourses.com/courses/119763/', 'only_matching': True, }, { - 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'url': 'https://gns3.teachable.com/courses/enrolled/423415', 'only_matching': True, }, { 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index ae9f66787..c82c94b3a 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -38,8 +38,6 @@ class 
TeleQuebecIE(TeleQuebecBaseIE): 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', - 'upload_date': '20180222', - 'timestamp': 1519326631, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py index 0e2370cd8..0631cb7ab 100644 --- a/youtube_dl/extractor/tfo.py +++ b/youtube_dl/extractor/tfo.py @@ -17,14 +17,12 @@ class TFOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P\d+)' _TEST = { 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'md5': 'cafbe4f47a8dae0ca0159937878100d6', 'info_dict': { - 'id': '100463871', + 'id': '7da3d50e495c406b8fc0b997659cc075', 'ext': 'mp4', 'title': 'Video Game Hackathon', 'description': 'md5:558afeba217c6c8d96c60e5421795c07', - 'upload_date': '20160212', - 'timestamp': 1455310233, } } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cea686afc..8cd611e1e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -585,7 +585,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id elif is_player: url = 'https://player.vimeo.com/video/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf', '/album/', '/showcase/')): + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id try: diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 47caec1de..01b253dcb 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -98,9 +98,9 @@ class XTubeIE(InfoExtractor): title = config.get('title') thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) - sources = config.get('sources') + sources = config.get('sources') or config.get('format') - if isinstance(sources, dict): + if not isinstance(sources, 
dict): sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d3e18a6ad..908defecd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,7 +29,6 @@ from ..compat import ( from ..utils import ( bool_or_none, clean_html, - dict_get, error_to_compat_str, extract_attributes, ExtractorError, @@ -1708,9 +1707,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - def extract_player_response(player_response, video_id): pl_response = str_or_none(player_response) if not pl_response: @@ -1723,6 +1719,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = {} # Get video info + video_info = {} embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True @@ -1737,19 +1734,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' 
+ data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False - video_info = None - sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1766,61 +1765,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True - sts = ytplayer_config.get('sts') if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). 
- # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. 
- token = extract_token(video_info) - if not token: - video_info = get_video_info - break def extract_unavailable_message(): messages = [] @@ -1833,13 +1781,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if messages: return '\n'.join(messages) - if not video_info: + if not video_info and not player_response: unavailable_message = extract_unavailable_message() if not unavailable_message: unavailable_message = 'Unable to extract video data' raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + if not isinstance(video_info, dict): + video_info = {} + video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} @@ -2392,30 +2343,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['stretched_ratio'] = ratio if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' 
in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ccf25489..38262bee4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2729,6 +2729,11 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ _HTTPONLY_PREFIX = '#HttpOnly_' def save(self, filename=None, ignore_discard=False, ignore_expires=False): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 56330ea2e..5aedd3268 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.03.06' +__version__ = '2020.03.24'