diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 716768242..40a869113 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.02.16 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 3fd6a0bd6..7b10df3d4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d160fcce9..04bbcfa68 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index f97644f65..a9e231817 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.02.16 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dedef6d53..4a3d32d51 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index a6e2c3c19..f753972c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,70 @@ +version 2020.03.24 + +Core +- [utils] Revert support for cookie files with spaces used instead of tabs + +Extractors +* [teachable] Update upskillcourses and gns3 domains +* [generic] Look for teachable embeds before wistia ++ [teachable] Extract chapter metadata (#24421) ++ [bilibili] Add support for player.bilibili.com (#24402) ++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) +* [limelight] Remove disabled API requests (#24255) +* [soundcloud] Fix download URL extraction (#24394) ++ [cbc:watch] Add support for authentication (#19160) +* [hellporno] Fix extraction (#24399) +* [xtube] Fix formats extraction (#24348) +* [ndr] Fix extraction (#24326) +* [nhk] Update m3u8 URL and use native HLS downloader (#24329) +- [nhk] Remove obsolete rtmp formats (#24329) +* [nhk] Relax URL regular expression (#24329) +- [vimeo] Revert fix showcase password protected video extraction (#24224) + + +version 2020.03.08 + +Core ++ [utils] Add support for cookie files with spaces used instead of tabs + +Extractors ++ [pornhub] Add support for pornhubpremium.com (#24288) +- [youtube] Remove outdated code and unnecessary requests +* [youtube] Improve extraction in 429 HTTP error conditions (#24283) +* [nhk] Update API version (#24270) + + +version 2020.03.06 + +Extractors +* [youtube] Fix age-gated videos support without login (#24248) +* [vimeo] Fix showcase password protected video extraction (#24224) +* [pornhub] Improve title extraction (#24184) +* [peertube] Improve extraction (#23657) ++ [servus] Add support for new URL schema (#23475, #23583, #24142) +* [vimeo] Fix subtitles URLs (#24209) + + +version 2020.03.01 + +Core +* [YoutubeDL] Force redirect URL to unicode on python 2 +- [options] Remove duplicate short option -v for --version (#24162) + +Extractors +* [xhamster] Fix extraction (#24205) +* [franceculture] Fix extraction (#24204) ++ [telecinco] Add support for article opening videos +* [telecinco] Fix extraction (#24195) +* [xtube] Fix metadata extraction (#21073, #22455) +* [youjizz] Fix extraction (#24181) +- Remove no longer needed compat_str around geturl +* [pornhd] Fix extraction (#24128) ++ [teachable] Add support for multiple videos per lecture (#24101) ++ [wistia] Add support for multiple generic embeds (#8347, 11385) +* [imdb] Fix extraction (#23443) +* [tv2dk:bornholm:play] Fix extraction (#24076) + + version 2020.02.16 Core diff --git a/README.md b/README.md index 01f975958..4f54a5240 100644 --- a/README.md +++ b/README.md @@ -835,7 +835,9 @@ In February 2015, the new YouTube player contained a character sequence in a str ### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block meaning that you can gain access again after solving CAPTCHA. Just open a browser and solve a CAPTCHA the service suggests you and after that [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. Note that if your machine has multiple external IPs then you should also pass exactly the same IP you've used for solving CAPTCHA with [`--source-address`](#network-options). Also you may need to pass a `User-Agent` HTTP header of your browser with [`--user-agent`](#workarounds). + +If this is not the case (no CAPTCHA suggested to solve by the service) then you can contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. ### SyntaxError: Non-ASCII character diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 02bc088ab..174b83bf3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,6 +98,7 @@ - **BiliBili** - **BilibiliAudio** - **BilibiliAudioAlbum** + - **BiliBiliPlayer** - **BioBioChileTV** - **BIQLE** - **BitChute** diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7d57a628e..17aaaf20d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,7 +26,6 @@ from youtube_dl.extractor import ( ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, - FunnyOrDieIE, DemocracynowIE, ) @@ -322,18 +321,6 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') -class TestFunnyOrDieSubtitles(BaseTestSubtitles): - url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' - IE = FunnyOrDieIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') - - class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b09cb0a79..19370f62b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -92,6 +92,7 @@ from .utils import ( YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, + YoutubeDLRedirectHandler, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2343,6 +2344,7 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() data_handler = compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the @@ -2356,7 +2358,7 @@ class YoutubeDL(object): file_handler.file_open = file_open opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80bd696e2..4dc597e16 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -24,7 +24,18 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P\d+)/play#)(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|bangumi)\.)? + bilibili\.(?:tv|com)/ + (?: + (?: + video/[aA][vV]| + anime/(?P\d+)/play\# + )(?P\d+)| + video/[bB][vV](?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -92,6 +103,10 @@ class BiliBiliIE(InfoExtractor): 'skip_download': True, # Test metadata only }, }] + }, { + # new BV video id format + 'url': 'https://www.bilibili.com/video/BV1JE411F741', + 'only_matching': True, }] _APP_KEY = 'iVGUTjsxvpLeuDCf' @@ -109,7 +124,7 @@ class BiliBiliIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('id') or mobj.group('id_bv') anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) @@ -419,3 +434,17 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): entries, am_id, album_title, album_data.get('intro')) return self.playlist_result(entries, am_id) + + +class BiliBiliPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P\d+)' + _TEST = { + 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.bilibili.tv/video/av%s/' % video_id, + ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 751a3a8f2..fd5ec6033 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,8 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import hashlib import json import re +from xml.sax.saxutils import escape from .common import InfoExtractor from ..compat import ( @@ -216,6 +218,29 @@ class CBCWatchBaseIE(InfoExtractor): 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } _GEO_COUNTRIES = ['CA'] + _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' + _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' + _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcwatch' + + def _signature(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._API_KEY} + resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + # token + query = { + 'access_token': access_token, + 'apikey': self._API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) + return resp['signature'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -239,7 +264,8 @@ class CBCWatchBaseIE(InfoExtractor): def _real_initialize(self): if self._valid_device_token(): return - device = self._downloader.cache.load('cbcwatch', 'device') or {} + device = self._downloader.cache.load( + 'cbcwatch', self._cache_device_key()) or {} self._device_id, self._device_token = device.get('id'), device.get('token') if self._valid_device_token(): return @@ -248,16 +274,30 @@ class CBCWatchBaseIE(InfoExtractor): def _valid_device_token(self): return self._device_id and self._device_token + def _cache_device_key(self): + email, _ = self._get_login_info() + return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' + def _register_device(self): - self._device_id = self._device_token = None result = self._download_xml( self._API_BASE_URL + 'device/register', None, 'Acquiring device token', data=b'web') self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) + email, password = self._get_login_info() + if email and password: + signature = self._signature(email, password) + data = '{0}{1}web'.format( + escape(signature), escape(self._device_id)).encode() + url = self._API_BASE_URL + 'device/login' + result = self._download_xml( + url, None, data=data, + headers={'content-type': 'application/xml'}) + self._device_token = xpath_text(result, 'token', fatal=True) + else: + self._device_token = xpath_text(result, 'deviceToken', fatal=True) self._downloader.cache.store( - 'cbcwatch', 'device', { + 'cbcwatch', self._cache_device_key(), { 'id': self._device_id, 'token': self._device_token, }) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..c51a3a07d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2340,6 +2340,8 @@ class InfoExtractor(object): if res is False: return [] ism_doc, urlh = res + if ism_doc is None: + return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index c050bf9df..fe42821c7 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( encode_base_n, ExtractorError, @@ -55,7 +54,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(compat_str(urlh.geturl())) + video_id = self._match_id(urlh.geturl()) hash = self._search_regex( r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 61c587dbd..7f67256be 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -105,6 +105,7 @@ from .bilibili import ( BiliBiliBangumiIE, BilibiliAudioIE, BilibiliAudioAlbumIE, + BiliBiliPlayerIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( @@ -639,7 +640,10 @@ from .mixcloud import ( from .mlb import MLBIE from .mnet import MnetIE from .moevideo import MoeVideoIE -from .mofosex import MofosexIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) from .mojvideo import MojvideoIE from .morningstar import MorningstarIE from .motherless import ( diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b8fa17588..306b45fc9 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -31,7 +31,13 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_data = extract_attributes(self._search_regex( - r'(?s)]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(]+data-asset-source="[^"]+"[^>]+>)', + r'''(?sx) + (?: + | + ]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> + ).*? + (]+data-asset-source="[^"]+"[^>]+>) + ''', webpage, 'video data')) video_url = video_data['data-asset-source'] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 04c026984..ce8252f6a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,9 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE +from .mofosex import MofosexEmbedIE +from .spankwire import SpankwireIE +from .youporn import YouPornIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2287,7 +2290,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = compat_str(head_response.geturl()) + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2387,12 +2390,12 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=compat_str(full_response.geturl())), + xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, - mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], + mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2536,6 +2539,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) + # Look for Teachable embeds, must be before Wistia + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + # Look for embedded Wistia player wistia_urls = WistiaIE._extract_urls(webpage) if wistia_urls: @@ -2710,6 +2718,21 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Mofosex player + mofosex_urls = MofosexEmbedIE._extract_urls(webpage) + if mofosex_urls: + return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) + + # Look for embedded Spankwire player + spankwire_urls = SpankwireIE._extract_urls(webpage) + if spankwire_urls: + return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + + # Look for embedded YouPorn player + youporn_urls = YouPornIE._extract_urls(webpage) + if youporn_urls: + return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -3141,10 +3164,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 0ee8ea712..fae425103 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,12 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + merge_dicts, remove_end, - determine_ext, + unified_timestamp, ) @@ -14,15 +13,21 @@ class HellPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P[^/]+)' _TESTS = [{ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', - 'md5': '1fee339c610d2049699ef2aa699439f1', + 'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3', 'info_dict': { 'id': '149116', 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', + 'description': 'md5:9a72922749354edb1c4b6e540ad3d215', + 'categories': list, 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 240, + 'timestamp': 1398762720, + 'upload_date': '20140429', + 'view_count': int, 'age_limit': 18, - } + }, }, { 'url': 'http://hellporno.net/v/186271/', 'only_matching': True, @@ -36,40 +41,36 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_search_regex( r'([^<]+)', webpage, 'title'), ' - Hell Porno') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), - display_id, transform_source=js_to_json) + info = self._parse_html5_media_entries(url, webpage, display_id)[0] + self._sort_formats(info['formats']) - video_id = flashvars.get('video_id') - thumbnail = flashvars.get('preview_url') - ext = determine_ext(flashvars.get('postfix'), 'mp4') + video_id = self._search_regex( + (r'chs_object\s*=\s*["\'](\d+)', + r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id', + default=display_id) + description = self._search_regex( + r'class=["\']desc_video_view_v2[^>]+>([^<]+)', webpage, + 'description', fatal=False) + categories = [ + c.strip() + for c in self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + if c.strip()] + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, fatal=False)) + timestamp = unified_timestamp(self._og_search_property( + 'video:release_date', webpage, fatal=False)) + view_count = int_or_none(self._search_regex( + r'>Views\s+(\d+)', webpage, 'view count', fatal=False)) - formats = [] - for video_url_key in ['video_url', 'video_alt_url']: - video_url = flashvars.get(video_url_key) - if not video_url: - continue - video_text = flashvars.get('%s_text' % video_url_key) - fmt = { - 'url': video_url, - 'ext': ext, - 'format_id': video_text, - } - m = re.search(r'^(?P\d+)[pP]', video_text) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - self._sort_formats(formats) - - categories = self._html_search_meta( - 'keywords', webpage, 'categories', default='').split(',') - - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, 'categories': categories, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, 'age_limit': 18, - 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 6ed7da4ab..1b2dcef46 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, determine_ext, @@ -36,7 +35,7 @@ class LecturioBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in compat_str(url_handle.geturl()) + return self._LOGIN_URL not in url_handle.geturl() # Already logged in if is_logged(urlh): diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 729d8de50..39f74d282 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -18,7 +18,6 @@ from ..utils import ( class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' - _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' @classmethod def _extract_urls(cls, webpage, source_url): @@ -70,7 +69,8 @@ class LimelightBaseIE(InfoExtractor): try: return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) + item_id, 'Downloading PlaylistService %s JSON' % method, + fatal=fatal, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] @@ -79,22 +79,22 @@ class LimelightBaseIE(InfoExtractor): raise ExtractorError(error, expected=True) raise - def _call_api(self, organization_id, item_id, method): - return self._download_json( - self._API_URL % (organization_id, self._API_PATH, item_id, method), - item_id, 'Downloading API %s JSON' % method) - - def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None): + def _extract(self, item_id, pc_method, mobile_method, referer=None): pc = self._call_playlist_service(item_id, pc_method, referer=referer) - metadata = self._call_api(pc['orgId'], item_id, meta_method) - mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer) - return pc, mobile, metadata + mobile = self._call_playlist_service( + item_id, mobile_method, fatal=False, referer=referer) + return pc, mobile + + def _extract_info(self, pc, mobile, i, referer): + get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} + pc_item = get_item(pc, 'playlistItems') + mobile_item = get_item(mobile, 'mediaList') + video_id = pc_item.get('mediaId') or mobile_item['mediaId'] + title = pc_item.get('title') or mobile_item['title'] - def _extract_info(self, streams, mobile_urls, properties): - video_id = properties['media_id'] formats = [] urls = [] - for stream in streams: + for stream in pc_item.get('streams', []): stream_url = stream.get('url') if not stream_url or stream.get('drmProtected') or stream_url in urls: continue @@ -155,7 +155,7 @@ class LimelightBaseIE(InfoExtractor): }) formats.append(fmt) - for mobile_url in mobile_urls: + for mobile_url in mobile_item.get('mobileUrls', []): media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: @@ -179,54 +179,34 @@ class LimelightBaseIE(InfoExtractor): self._sort_formats(formats) - title = properties['title'] - description = properties.get('description') - timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) - duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) - filesize = int_or_none(properties.get('total_storage_in_bytes')) - categories = [properties.get('category')] - tags = properties.get('tags', []) - thumbnails = [{ - 'url': thumbnail['url'], - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] - subtitles = {} - for caption in properties.get('captions', []): - lang = caption.get('language_code') - subtitles_url = caption.get('url') - if lang and subtitles_url: - subtitles.setdefault(lang, []).append({ - 'url': subtitles_url, - }) - closed_captions_url = properties.get('closed_captions_url') - if closed_captions_url: - subtitles.setdefault('en', []).append({ - 'url': closed_captions_url, - 'ext': 'ttml', - }) + for flag in mobile_item.get('flags'): + if flag == 'ClosedCaptions': + closed_captions = self._call_playlist_service( + video_id, 'getClosedCaptionsDetailsByMediaId', + False, referer) or [] + for cc in closed_captions: + cc_url = cc.get('webvttFileUrl') + if not cc_url: + continue + lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + break + + get_meta = lambda x: pc_item.get(x) or mobile_item.get(x) return { 'id': video_id, 'title': title, - 'description': description, + 'description': get_meta('description'), 'formats': formats, - 'timestamp': timestamp, - 'duration': duration, - 'filesize': filesize, - 'categories': categories, - 'tags': tags, - 'thumbnails': thumbnails, + 'duration': float_or_none(get_meta('durationInMilliseconds'), 1000), + 'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'), 'subtitles': subtitles, } - def _extract_info_helper(self, pc, mobile, i, metadata): - return self._extract_info( - try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], - try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], - metadata) - class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -251,8 +231,6 @@ class LimelightMediaIE(LimelightBaseIE): 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 144.23, - 'timestamp': 1244136834, - 'upload_date': '20090604', }, 'params': { # m3u8 download @@ -268,30 +246,29 @@ class LimelightMediaIE(LimelightBaseIE): 'title': '3Play Media Overview Video', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 78.101, - 'timestamp': 1338929955, - 'upload_date': '20120605', - 'subtitles': 'mincount:9', + # TODO: extract all languages that were accessible via API + # 'subtitles': 'mincount:9', + 'subtitles': 'mincount:1', }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'media' - _API_PATH = 'media' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) + source_url = smuggled_data.get('source_url') self._initialize_geo_bypass({ 'countries': smuggled_data.get('geo_countries'), }) - pc, mobile, metadata = self._extract( + pc, mobile = self._extract( video_id, 'getPlaylistByMediaId', - 'getMobilePlaylistByMediaId', 'properties', - smuggled_data.get('source_url')) + 'getMobilePlaylistByMediaId', source_url) - return self._extract_info_helper(pc, mobile, 0, metadata) + return self._extract_info(pc, mobile, 0, source_url) class LimelightChannelIE(LimelightBaseIE): @@ -313,6 +290,7 @@ class LimelightChannelIE(LimelightBaseIE): 'info_dict': { 'id': 'ab6a524c379342f9b23642917020c082', 'title': 'Javascript Sample Code', + 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', }, 'playlist_mincount': 3, }, { @@ -320,22 +298,23 @@ class LimelightChannelIE(LimelightBaseIE): 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'channel' - _API_PATH = 'channels' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) channel_id = self._match_id(url) + source_url = smuggled_data.get('source_url') - pc, mobile, medias = self._extract( + pc, mobile = self._extract( channel_id, 'getPlaylistByChannelId', 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', - 'media', smuggled_data.get('source_url')) + source_url) entries = [ - self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) - for i in range(len(medias['media_list']))] + self._extract_info(pc, mobile, i, source_url) + for i in range(len(pc['playlistItems']))] - return self.playlist_result(entries, channel_id, pc['title']) + return self.playlist_result( + entries, channel_id, pc.get('title'), mobile.get('description')) class LimelightChannelListIE(LimelightBaseIE): @@ -368,10 +347,12 @@ class LimelightChannelListIE(LimelightBaseIE): def _real_extract(self, url): channel_list_id = self._match_id(url) - channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') + channel_list = self._call_playlist_service( + channel_list_id, 'getMobileChannelListById') entries = [ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') for channel in channel_list['channelList']] - return self.playlist_result(entries, channel_list_id, channel_list['title']) + return self.playlist_result( + entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py index a78c6556e..23ca965d9 100644 --- a/youtube_dl/extractor/linuxacademy.py +++ b/youtube_dl/extractor/linuxacademy.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, - compat_str, ) from ..utils import ( ExtractorError, @@ -99,7 +98,7 @@ class LinuxAcademyIE(InfoExtractor): 'sso': 'true', }) - login_state_url = compat_str(urlh.geturl()) + login_state_url = urlh.geturl() try: login_page = self._download_webpage( @@ -129,7 +128,7 @@ class LinuxAcademyIE(InfoExtractor): }) access_token = self._search_regex( - r'access_token=([^=&]+)', compat_str(urlh.geturl()), + r'access_token=([^=&]+)', urlh.geturl(), 'access token') self._download_webpage( diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 027a790b8..933df1495 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -6,7 +6,6 @@ import re from .theplatform import ThePlatformBaseIE from ..compat import ( compat_parse_qs, - compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -114,7 +113,7 @@ class MediasetIE(ThePlatformBaseIE): continue urlh = ie._request_webpage( embed_url, video_id, note='Following embed URL redirect') - embed_url = compat_str(urlh.geturl()) + embed_url = urlh.geturl() program_guid = _program_guid(_qs(embed_url)) if program_guid: entries.append(embed_url) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 694a264d6..d6eb15740 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -129,7 +129,7 @@ class MediasiteIE(InfoExtractor): query = mobj.group('query') webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() # XXX: might have also extracted UrlReferrer and QueryString from the html service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 1c652813a..5234cac02 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,5 +1,8 @@ from __future__ import unicode_literals +import re + +from .common import InfoExtractor from ..utils import ( int_or_none, str_to_int, @@ -54,3 +57,23 @@ class MofosexIE(KeezMoviesIE): }) return info + + +class MofosexEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), + ie=MofosexIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 43fd70f11..b1615b4d8 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -76,7 +76,8 @@ class MotherlessIE(InfoExtractor): raise ExtractorError('Video %s is for friends only' % video_id, expected=True) title = self._html_search_regex( - r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + (r'(?s)]+\bclass=["\']media-meta-title[^>]+>(.+?)', + r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), @@ -84,14 +85,15 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - r'Views\s+([^<]+)<', + (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - r'Favorited\s+([^<]+)<', + (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( - r'Uploaded\s+([^<]+)<', webpage, 'upload date') + (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'Uploaded\s+([^<]+)<'), webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 9c8bf05af..2447c812e 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + merge_dicts, parse_iso8601, qualities, try_get, @@ -87,21 +88,25 @@ class NDRIE(NDRBaseIE): def _extract_embed(self, webpage, display_id): embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', fatal=True) + 'embedURL', webpage, 'embed URL', + default=None) or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url') description = self._search_regex( r']+itemprop="description">([^<]+)

', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', fatal=False)) - return { + webpage, 'upload date', default=None)) + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts({ '_type': 'url_transparent', 'url': embed_url, 'display_id': display_id, 'description': description, 'timestamp': timestamp, - } + }, info) class NJoyIE(NDRBaseIE): diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 6a2c6cb7b..de6a707c4 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[a-z]+-\d{8}-\d+)' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[^/]+?-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -30,8 +30,11 @@ class NhkVodIE(InfoExtractor): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', + 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() @@ -82,15 +85,9 @@ class NhkVodIE(InfoExtractor): audio = episode['audio'] audio_path = audio['audio'] info['formats'] = self._extract_m3u8_formats( - 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', m3u8_id='hls', fatal=False) - for proto in ('rtmpt', 'rtmp'): - info['formats'].append({ - 'ext': 'flv', - 'format_id': proto, - 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), - 'vcodec': 'none', - }) + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang return info diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 2850af5db..47b9748f0 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + determine_ext, int_or_none, js_to_json, qualities, @@ -33,42 +34,76 @@ class NovaEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - + duration = None formats = [] - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), + + player = self._parse_json( + self._search_regex( + r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', + webpage, 'player', default='{}'), video_id, fatal=False) + if player: + for format_id, format_list in player['tracks'].items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, }) - break - f['format_id'] = f_id - formats.append(f) + duration = int_or_none(player.get('duration')) + else: + # Old path, not actual as of 08.04.2020 + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) title = self._og_search_title( @@ -81,7 +116,8 @@ class NovaEmbedIE(InfoExtractor): r'poster\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='value') duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', + default=duration)) return { 'id': video_id, diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d3a83ea2b..48fb95416 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, + str_or_none, try_get, unified_timestamp, url_or_none, @@ -415,6 +416,7 @@ class PeerTubeIE(InfoExtractor): peertube\.cpy\.re )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P[^:]+):| @@ -423,26 +425,30 @@ class PeerTubeIE(InfoExtractor): (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', - 'md5': '80f24ff364cc9d333529506a263e7feb', + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', 'info_dict': { - 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', - 'title': 'wow', - 'description': 'wow such video, so gif', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 'timestamp': 1519297480, - 'upload_date': '20180222', - 'uploader': 'Luclu7', - 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', - 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', - 'license': 'Unknown', - 'duration': 3, + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, 'view_count': int, 'like_count': int, 'dislike_count': int, - 'tags': list, - 'categories': list, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', @@ -484,13 +490,38 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return + subtitles = {} + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') - video = self._download_json( - 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + video = self._call_api( + host, video_id, '', note='Downloading video JSON') title = video['name'] @@ -513,10 +544,28 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - def account_data(field): - return try_get(video, lambda x: x['account'][field], compat_str) + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) - category = try_get(video, lambda x: x['category']['label'], compat_str) + description = None + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -528,14 +577,17 @@ class PeerTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video.get('description'), + 'description': description, 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName'), - 'uploader_id': account_data('uuid'), - 'uploder_url': account_data('url'), - 'license': try_get( - video, lambda x: x['licence']['label'], compat_str), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -544,4 +596,5 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, + 'subtitles': subtitles } diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index 602207beb..23c8256b5 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -46,7 +46,7 @@ class PlatziBaseIE(InfoExtractor): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in compat_str(urlh.geturl()): + if 'platzi.com/login' not in urlh.geturl(): return login_error = self._webpage_read_content( diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index dd5f17f11..80222d428 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -20,20 +20,16 @@ class PokemonIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Ol’ Raise and Switch!', 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - 'timestamp': 1511824728, - 'upload_date': '20171127', }, 'add_id': ['LimelightMedia'], }, { # no data-video-title - 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', 'info_dict': { - 'id': '99f3bae270bf4e5097274817239ce9c8', + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', 'ext': 'mp4', - 'title': 'Pokémon: The Rise of Darkrai', - 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', - 'timestamp': 1417778347, - 'upload_date': '20141205', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', }, 'add_id': ['LimelightMedia'], 'params': { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b3251ccd9..3567a3283 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -52,7 +52,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -149,6 +149,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, }] @staticmethod @@ -166,6 +169,13 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise ExtractorError( + 'PornHub Premium requires authentication.' + ' You may want to use --cookies.', + expected=True) + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): @@ -189,10 +199,10 @@ class PornHubIE(PornHubBaseIE): # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._search_regex( - (r']+class=["\']title["\'][^>]*>(?P[^<]+)', - r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', - r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), webpage, 'title', group='title') video_urls = [] @@ -405,7 +415,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -473,7 +483,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -588,7 +598,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 4942437c7..2cc665122 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_str, compat_urlparse, ) from ..utils import ( @@ -39,13 +38,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) + return 'learning.oreilly.com/home/' in urlh.geturl() if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index e579d42cf..9401bf2cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,9 +7,18 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P<id>[aA]{2}-\w+|\d+-\d+) + ''' _TESTS = [{ - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -18,6 +27,10 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a1372d389..422ce1626 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -27,6 +27,7 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + urlhandle_detect_ext, ) @@ -96,7 +97,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # not streamable song, preview + # geo-restricted { 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -108,17 +109,13 @@ class SoundcloudIE(InfoExtractor): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 30, + 'duration': 227.155, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - 'params': { - # rtmp - 'skip_download': True, - }, }, # private link { @@ -229,7 +226,6 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, - # not available via api.soundcloud.com/i1/tracks/id/streams { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', @@ -250,11 +246,14 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - 'expected_warnings': ['Unable to download JSON metadata'], - } + }, + { + # with AAC HQ format available via OAuth token + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, ] - _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' @@ -316,10 +315,9 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): + def _extract_info_dict(self, info, full_title=None, secret_token=None): track_id = compat_str(info['id']) title = info['title'] - track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] @@ -328,21 +326,22 @@ class SoundcloudIE(InfoExtractor): query['secret_token'] = secret_token if info.get('downloadable') and info.get('has_downloads_left'): - format_url = update_url_query( - info.get('download_url') or track_base_url + '/download', query) - format_urls.add(format_url) - if version == 2: - v1_info = self._download_json( - track_base_url, track_id, query=query, fatal=False) or {} - else: - v1_info = info - formats.append({ - 'format_id': 'download', - 'ext': v1_info.get('original_format') or 'mp3', - 'filesize': int_or_none(v1_info.get('original_content_size')), - 'url': format_url, - 'preference': 10, - }) + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) def invalid_url(url): return not url or url in format_urls @@ -356,6 +355,9 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: @@ -366,9 +368,13 @@ class SoundcloudIE(InfoExtractor): abr = f.get('abr') if abr: f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'protocol': protocol, 'preference': -10 if preview else None, }) formats.append(f) @@ -406,42 +412,11 @@ class SoundcloudIE(InfoExtractor): }, 'http' if protocol == 'progressive' else protocol, t.get('snipped') or '/preview/' in format_url) - if not formats: - # Old API, does not work for some tracks (e.g. - # https://soundcloud.com/giovannisarani/mezzo-valzer) - # and might serve preview URLs (e.g. - # http://www.soundcloud.com/snbrn/ele) - format_dict = self._download_json( - track_base_url + '/streams', track_id, - 'Downloading track url', query=query, fatal=False) or {} - - for key, stream_url in format_dict.items(): - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) - if mobj: - protocol, ext, abr = mobj.groups() - add_format({ - 'abr': abr, - 'ext': ext, - 'url': stream_url, - }, protocol) - - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - urlh = self._request_webpage( - HEADRequest(info.get('stream_url') or track_base_url + '/stream'), - track_id, query=query, fatal=False) - if urlh: - stream_url = urlh.geturl() - if not invalid_url(stream_url): - add_format({'url': stream_url}, 'http') - for f in formats: f['vcodec'] = 'none' + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() self._sort_formats(formats) user = info.get('user') or {} @@ -511,16 +486,10 @@ class SoundcloudIE(InfoExtractor): resolve_title += '/%s' % token info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) - if not info: - info = self._download_json( - info_json_url.replace(self._API_V2_BASE, self._API_BASE), - full_title, 'Downloading info JSON', query=query) - version = 1 + info_json_url, full_title, 'Downloading info JSON', query=query) - return self._extract_info_dict(info, full_title, token, version) + return self._extract_info_dict(info, full_title, token) class SoundcloudPlaylistBaseIE(SoundcloudIE): diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 44d8fa52f..35ab9ec37 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,34 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) from ..utils import ( - sanitized_Request, + float_or_none, + int_or_none, + merge_dicts, + str_or_none, str_to_int, - unified_strdate, + url_or_none, ) -from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P<id>\d+) + ''' _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070507', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, - } + 'categories': list, + 'tags': list, + }, }, { # download URL pattern: */mp4_<format_id>_<video_id>.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -45,83 +58,125 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - req = sanitized_Request('http://www.' + mobj.group('url')) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - title = self._html_search_regex( - r'<h1>([^<]+)', webpage, 'title') - description = self._html_search_regex( - r'(?s)<div\s+id="descriptionContent">(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'by:\s*<a [^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( - r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"', - webpage, 'uploader id', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r'</a> on (.+?) at \d+:\d+', - webpage, 'upload date', fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>', - webpage, 'comment count', fatal=False)) - - videos = re.findall( - r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) - heights = [int(video[0]) for video in videos] - video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find(r'flashvars\.encrypted = "true"') != -1: - password = self._search_regex( - r'flashvars\.video_title = "([^"]+)', - webpage, 'password').replace('+', ' ') - video_urls = list(map( - lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), - video_urls)) + title = video['title'] formats = [] - for height, video_url in zip(heights, video_urls): - path = compat_urllib_parse_urlparse(video_url).path - m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) - if m: - tbr = int(m.group('tbr')) - height = int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height, - 'height': height, - 'tbr': tbr, + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, }) - self._sort_formats(formats) - age_limit = self._rta_search(webpage) + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries - return { + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, - 'comment_count': comment_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, 'formats': formats, - 'age_limit': age_limit, - } + }, info) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index cca89a4a8..a75369dbe 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -4,11 +4,12 @@ import re from .common import InfoExtractor from .wistia import WistiaIE -from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, + int_or_none, get_element_by_class, + strip_or_none, urlencode_postdata, urljoin, ) @@ -20,8 +21,8 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here - 'upskillcourses.com': 'upskill', - 'academy.gns3.com': 'gns3', + 'v1.upskillcourses.com': 'upskill', + 'gns3.teachable.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', 'market.saleshacker.com': 'saleshacker', @@ -58,7 +59,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = compat_str(urlh.geturl()) + login_url = urlh.geturl() login_form = self._hidden_inputs(login_page) @@ -110,27 +111,29 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', 'info_dict': { - 'id': 'uzw6zw58or', - 'ext': 'mp4', - 'title': 'Welcome to the Course!', - 'description': 'md5:65edb0affa582974de4625b9cdea1107', - 'duration': 138.763, - 'timestamp': 1479846621, - 'upload_date': '20161122', + 'id': 'untlgzk1v7', + 'ext': 'bin', + 'title': 'Overview', + 'description': 'md5:071463ff08b86c208811130ea1c2464c', + 'duration': 736.4, + 'timestamp': 1542315762, + 'upload_date': '20181115', + 'chapter': 'Welcome', + 'chapter_number': 1, }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', 'only_matching': True, }, { - 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'only_matching': True, }] @@ -174,11 +177,34 @@ class TeachableIE(TeachableBaseIE): title = self._og_search_title(webpage, default=None) + chapter = None + chapter_number = None + section_item = self._search_regex( + r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id, + webpage, 'section item', default=None, group='li') + if section_item: + chapter_number = int_or_none(self._search_regex( + r'data-ss-position=["\'](\d+)', section_item, 'section id', + default=None)) + if chapter_number is not None: + sections = [] + for s in re.findall( + r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage): + section = strip_or_none(clean_html(s)) + if not section: + sections = [] + break + sections.append(section) + if chapter_number <= len(sections): + chapter = sections[chapter_number - 1] + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, + 'chapter': chapter, + 'chapter_number': chapter_number, } for wistia_url in wistia_urls] return self.playlist_result(entries, video_id, title) @@ -193,20 +219,20 @@ class TeachableCourseIE(TeachableBaseIE): /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, }, { - 'url': 'http://upskillcourses.com/courses/119763/', + 'url': 'http://v1.upskillcourses.com/courses/119763/', 'only_matching': True, }, { - 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'url': 'https://gns3.teachable.com/courses/enrolled/423415', 'only_matching': True, }, { 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 33a72083b..364556a1f 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,9 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .jwplatform import JWPlatformIE from .nexx import NexxIE -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + NO_DEFAULT, + try_get, +) class Tele5IE(InfoExtractor): @@ -44,14 +54,49 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - if not video_id: + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', - r'\s+id\s*=\s*["\']player_(\d{6,})', - r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + + media = self._download_json( + 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, + display_id) + nexx_id = try_get( + media, lambda x: x['playlist'][0]['nexx_id'], compat_str) + + if nexx_id: + return nexx_result(nexx_id) return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, - ie=NexxIE.ie_key(), video_id=video_id) + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=jwplatform_id) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d37e1b055..9ba3da341 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, int_or_none, str_or_none, + try_get, urljoin, ) @@ -24,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'info_dict': { 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', - 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', + 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -55,6 +56,26 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, + }, { + # video in opening's content + 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', + 'info_dict': { + 'id': '2907195140', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'description': 'md5:73f340a7320143d37ab895375b2bf13a', + }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'TpI2EttSDAReWpJ1o0NVh2', + 'ext': 'mp4', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'duration': 1015, + }, + }], + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, @@ -135,17 +156,28 @@ class TelecincoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = article.get('title') - description = clean_html(article.get('leadParagraph')) + description = clean_html(article.get('leadParagraph')) or '' if article.get('editorialType') != 'VID': entries = [] - for p in article.get('body', []): - content = p.get('content') - if p.get('type') != 'video' or not content: + body = [article.get('opening')] + body.extend(try_get(article, lambda x: x['body'], list) or []) + for p in body: + if not isinstance(p, dict): continue - entries.append(self._parse_content(content, url)) + content = p.get('content') + if not content: + continue + type_ = p.get('type') + if type_ == 'paragraph': + content_str = str_or_none(content) + if content_str: + description += content_str + continue + if type_ == 'video' and isinstance(content, dict): + entries.append(self._parse_content(content, url)) return self.playlist_result( entries, str_or_none(article.get('id')), title, description) content = article['opening']['content'] diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index ae9f66787..c82c94b3a 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -38,8 +38,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', - 'upload_date': '20180222', - 'timestamp': 1519326631, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py index 0e2370cd8..0631cb7ab 100644 --- a/youtube_dl/extractor/tfo.py +++ b/youtube_dl/extractor/tfo.py @@ -17,14 +17,12 @@ class TFOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' _TEST = { 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'md5': 'cafbe4f47a8dae0ca0159937878100d6', 'info_dict': { - 'id': '100463871', + 'id': '7da3d50e495c406b8fc0b997659cc075', 'ext': 'mp4', 'title': 'Video Game Hackathon', 'description': 'md5:558afeba217c6c8d96c60e5421795c07', - 'upload_date': '20160212', - 'timestamp': 1455310233, } } diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index 387f955ee..a3d9b4017 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -31,6 +31,10 @@ class ThisOldHouseIE(InfoExtractor): }, { 'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', 'only_matching': True, + }, { + # iframe www.thisoldhouse.com + 'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project', + 'only_matching': True, }] _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' @@ -38,6 +42,6 @@ class ThisOldHouseIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'<iframe[^>]+src=[\'"](?:https?:)?//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})', + r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', webpage, 'video id') return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index edbb0aa69..ae584ad69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -151,7 +150,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): raise ExtractorError( 'This Tumblr may contain sensitive media. ' diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index a819d048c..c498b0191 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -99,7 +99,7 @@ class TV4IE(InfoExtractor): manifest_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0db2dca41..78ee0115c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -643,7 +643,14 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| + (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + ) + (?P<id>[^/?#&]+) + ''' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -669,6 +676,12 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index e68aa6ad1..3d243c191 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -33,6 +33,7 @@ from ..utils import ( unified_timestamp, unsmuggle_url, urlencode_postdata, + urljoin, unescapeHTML, ) @@ -191,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], + 'url': urljoin('https://vimeo.com', tt['url']), }] thumbnails = [] @@ -591,7 +592,7 @@ class VimeoIE(VimeoBaseInfoExtractor): # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index a5b94d279..0f7be6a7d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -113,7 +113,7 @@ class XHamsterIE(InfoExtractor): display_id = mobj.group('display_id') or mobj.group('display_id_2') desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + webpage, urlh = self._download_webpage_handle(desktop_url, video_id) error = self._html_search_regex( r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -161,6 +161,9 @@ class XHamsterIE(InfoExtractor): 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), 'filesize': filesize, + 'http_headers': { + 'Referer': urlh.geturl(), + }, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index c6c0b3291..01b253dcb 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -47,7 +47,7 @@ class XTubeIE(InfoExtractor): 'display_id': 'A-Super-Run-Part-1-YT', 'ext': 'flv', 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', 'uploader': 'tshirtguy59', 'duration': 579, 'view_count': int, @@ -87,10 +87,24 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - sources = self._parse_json(self._search_regex( - r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id, - transform_source=js_to_json) + title, thumbnail, duration = [None] * 3 + + config = self._parse_json(self._search_regex( + r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + default='{}'), video_id, transform_source=js_to_json, fatal=False) + if config: + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') or config.get('format') + + if not isinstance(sources, dict): + sources = self._parse_json(self._search_regex( + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -102,20 +116,25 @@ class XTubeIE(InfoExtractor): self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex( - (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), - webpage, 'title', group='title') - description = self._search_regex( + if not title: + title = self._search_regex( + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, default=None) or self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) uploader = self._search_regex( (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', r'<span[^>]+class="nickname"[^>]*>([^<]+)'), webpage, 'uploader', fatal=False) - duration = parse_duration(self._search_regex( - r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', - webpage, 'duration', fatal=False)) + if not duration: + duration = parse_duration(self._search_regex( + r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( - r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>', + (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', + r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'), webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( r'>Comments? \(([\d,\.]+)\)<', @@ -126,6 +145,7 @@ class XTubeIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'uploader': uploader, 'duration': duration, 'view_count': view_count, @@ -144,7 +164,7 @@ class XTubeUserIE(InfoExtractor): 'id': 'greenshowers-4056496', 'age_limit': 18, }, - 'playlist_mincount': 155, + 'playlist_mincount': 154, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index dff69fcb7..88aabd272 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -44,7 +44,7 @@ class YouJizzIE(InfoExtractor): encodings = self._parse_json( self._search_regex( - r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', default='[]'), video_id, fatal=False) for encoding in encodings: diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d4eccb4b2..e7fca22de 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - sanitized_Request, str_to_int, unescapeHTML, unified_strdate, @@ -15,7 +14,7 @@ from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -57,16 +56,28 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'only_matching': True, + }, { + 'url': 'http://www.youporn.com/watch/505835', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', + webpage) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - request = sanitized_Request(url) - request.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(request, display_id) + webpage = self._download_webpage( + 'http://www.youporn.com/watch/%s' % video_id, display_id, + headers={'Cookie': 'age_verified=1'}) title = self._html_search_regex( r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eacaa5ecd..afaa12b1b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,7 +29,6 @@ from ..compat import ( from ..utils import ( bool_or_none, clean_html, - dict_get, error_to_compat_str, extract_attributes, ExtractorError, @@ -570,7 +569,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', + 'description': 'md5:19a2f98d9032b9311e686ed039564f63', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], @@ -685,12 +684,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', + 'description': 'md5:307195cd21ff7fa352270fe884570ef0', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -755,11 +753,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', + 'creator': 'Dada Life, deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', + 'alt_title': 'This Machine Kills Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1135,6 +1133,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, + 'skip': 'not actual anymore', }, { # Youtube Music Auto-generated description @@ -1145,8 +1144,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Voyeur Girl', 'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', + 'uploader': 'Stephen - Topic', + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'artist': 'Stephen', 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', @@ -1210,7 +1209,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '-hcAI0g-f5M', 'ext': 'mp4', 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', + 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', 'upload_date': '20180426', 'uploader': 'Matt Maeson - Topic', 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', @@ -1256,7 +1255,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', + r'.*?[-.](?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1708,9 +1707,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - def extract_player_response(player_response, video_id): pl_response = str_or_none(player_response) if not pl_response: @@ -1723,6 +1719,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = {} # Get video info + video_info = {} embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True @@ -1737,19 +1734,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False - video_info = None - sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1766,61 +1765,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True - sts = ytplayer_config.get('sts') if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break def extract_unavailable_message(): messages = [] @@ -1833,13 +1781,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if messages: return '\n'.join(messages) - if not video_info: + if not video_info and not player_response: unavailable_message = extract_unavailable_message() if not unavailable_message: unavailable_message = 'Unable to extract video data' raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + if not isinstance(video_info, dict): + video_info = {} + video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} @@ -1889,15 +1840,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get(feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + 'title': title, }) - feed_ids.append(feed_data['id'][0]) + feed_ids.append(feed_id) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) @@ -1968,7 +1930,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } for fmt in streaming_formats: - if fmt.get('drm_families'): + if fmt.get('drmFamilies') or fmt.get('drm_families'): continue url = url_or_none(fmt.get('url')) @@ -2035,7 +1997,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -2392,30 +2354,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['stretched_ratio'] = ratio if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) @@ -2495,20 +2450,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2518,7 +2476,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index bacb82eee..f6496f516 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -29,7 +29,6 @@ class ZapiksIE(InfoExtractor): 'timestamp': 1359044972, 'upload_date': '20130124', 'view_count': int, - 'comment_count': int, }, }, { diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 145c123a4..656864b2e 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -244,14 +244,14 @@ class ZDFChannelIE(ZDFBaseIE): 'id': 'das-aktuelle-sportstudio', 'title': 'das aktuelle sportstudio | ZDF', }, - 'playlist_count': 21, + 'playlist_mincount': 23, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { 'id': 'planet-e', 'title': 'planet e.', }, - 'playlist_count': 4, + 'playlist_mincount': 50, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 1ffabc62b..8826b382c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -134,7 +134,7 @@ def parseOpts(overrideArguments=None): action='help', help='Print this help text and exit') general.add_option( - '-v', '--version', + '--version', action='version', help='Print program version and exit') general.add_option( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a..38262bee4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2729,6 +2729,11 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ _HTTPONLY_PREFIX = '#HttpOnly_' def save(self, filename=None, ignore_discard=False, ignore_expires=False): @@ -2795,6 +2800,15 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): + if sys.version_info[0] < 3: + def redirect_request(self, req, fp, code, msg, headers, newurl): + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + + def extract_timezone(date_str): m = re.search( r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0163333ac..5aedd3268 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.02.16' +__version__ = '2020.03.24'