diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 73f46ec04..0721d49c3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.24 + [debug] youtube-dl version 2020.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7e3c9f669..1e67f724d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index b9bb3bd11..1290b55c4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 265ea80c1..3f006bef8 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.24 + [debug] youtube-dl version 2020.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index e71778a3d..202bb9b2f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 94aa9f327..1a676f4f2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,58 @@ +version 2020.03.01 + +Core +* [YoutubeDL] Force redirect URL to unicode on python 2 +- [options] Remove duplicate short option -v for --version (#24162) + +Extractors +* [xhamster] Fix extraction (#24205) +* [franceculture] Fix extraction (#24204) ++ [telecinco] Add support for article opening videos +* [telecinco] Fix extraction (#24195) +* [xtube] Fix metadata extraction (#21073, #22455) +* [youjizz] Fix extraction (#24181) +- Remove no longer needed compat_str around geturl +* [pornhd] Fix extraction (#24128) ++ [teachable] Add support for multiple videos per lecture (#24101) ++ [wistia] Add support for multiple generic embeds (#8347, #11385) +* [imdb] Fix extraction (#23443) +* [tv2dk:bornholm:play] Fix extraction (#24076) + + +version 2020.02.16 + +Core +* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, + #10622) +* [update] Fix updating via symlinks (#23991) ++ [compat] Introduce compat_realpath (#23991) + +Extractors ++ [npr] Add support for streams (#24042) ++ [24video] Add support for porn.24video.net (#23779, #23784) +- [jpopsuki] Remove extractor (#23858) +* [nova] Improve extraction (#23690) +* [nova:embed] Improve (#23690) +* [nova:embed] Fix extraction (#23672) ++ [abc:iview] Add support for 720p (#22907, #22921) +* [nytimes] Improve format sorting (#24010) ++ [toggle] Add support for mewatch.sg (#23895, #23930) +* [thisoldhouse] Fix extraction (#23951) ++ [popcorntimes] Add support for popcorntimes.tv (#23949) +* [sportdeutschland] Update to new API +* 
[twitch:stream] Lowercase channel id for stream request (#23917) +* [tv5mondeplus] Fix extraction (#23907, #23911) +* [tva] Relax URL regular expression (#23903) +* [vimeo] Fix album extraction (#23864) +* [viewlift] Improve extraction + * Fix extraction (#23851) + + Add support for authentication + + Add support for more domains +* [svt] Fix series extraction (#22297) +* [svt] Fix article extraction (#22897, #22919) +* [soundcloud] Improve private playlist/set tracks extraction (#3707) + + version 2020.01.24 Extractors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e9a8cc27a..02bc088ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -389,7 +389,6 @@ - **JeuxVideo** - **Joj** - **Jove** - - **jpopsuki.tv** - **JWPlatform** - **Kakao** - **Kaltura** @@ -663,6 +662,7 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** + - **Popcorntimes** - **PopcornTV** - **PornCom** - **PornerBros** @@ -1004,8 +1004,8 @@ - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - - **ViewLift** - - **ViewLiftEmbed** + - **viewlift** + - **viewlift:embed** - **Viidea** - **viki** - **viki:channel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ce9666171..1e204e551 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -816,11 +816,15 @@ class TestYoutubeDL(unittest.TestCase): 'webpage_url': 'http://example.com', } - def get_ids(params): + def get_downloaded_info_dicts(params): ydl = YDL(params) - # make a copy because the dictionary can be modified - ydl.process_ie_result(playlist.copy()) - return [int(v['id']) for v in ydl.downloaded_info_dicts] + # make a deep copy because the dictionary and nested entries + # can be modified + ydl.process_ie_result(copy.deepcopy(playlist)) + return ydl.downloaded_info_dicts + + def get_ids(params): + return [int(v['id']) for v in get_downloaded_info_dicts(params)] result = get_ids({}) self.assertEqual(result, [1, 2, 3, 4]) @@ -852,6 +856,22 @@ class 
TestYoutubeDL(unittest.TestCase): result = get_ids({'playlist_items': '2-4,3-4,3'}) self.assertEqual(result, [2, 3, 4]) + # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 + # @{ + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + self.assertEqual(result[2]['playlist_index'], 4) + + result = get_downloaded_info_dicts({'playlist_items': '4,2'}) + self.assertEqual(result[0]['playlist_index'], 4) + self.assertEqual(result[1]['playlist_index'], 2) + # @} + def test_urlopen_no_file_protocol(self): # see https://github.com/ytdl-org/youtube-dl/issues/8227 ydl = YDL() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7d57a628e..17aaaf20d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,7 +26,6 @@ from youtube_dl.extractor import ( ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, - FunnyOrDieIE, DemocracynowIE, ) @@ -322,18 +321,6 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') -class TestFunnyOrDieSubtitles(BaseTestSubtitles): - url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' - IE = FunnyOrDieIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') - - class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f5cb46308..19370f62b 100755 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -92,6 +92,7 @@ from .utils import ( YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, + YoutubeDLRedirectHandler, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -990,7 +991,7 @@ class YoutubeDL(object): 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': i + playliststart, + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), @@ -2343,6 +2344,7 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() data_handler = compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the @@ -2356,7 +2358,7 @@ class YoutubeDL(object): file_handler.file_open = file_open opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index c75ab131b..d1b86bd13 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2754,6 +2754,17 @@ else: compat_expanduser = os.path.expanduser +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) 
+ def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return path +else: + compat_realpath = os.path.realpath + + if sys.version_info < (3, 0): def compat_print(s): from .utils import preferredencoding @@ -2998,6 +3009,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 4ac323bf6..6637f4f35 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -110,17 +110,17 @@ class ABCIViewIE(InfoExtractor): # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', - 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', + 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', + 'md5': '67715ce3c78426b11ba167d875ac6abf', 'info_dict': { - 'id': 'ZX9371A050S00', + 'id': 'LE1927H001S00', 'ext': 'mp4', - 'title': "Gaston's Birthday", - 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:f9de914d02f226968f598ac76f105bcf', - 'upload_date': '20180604', - 'uploader_id': 'abc4kids', - 'timestamp': 1528140219, + 'title': "Series 11 Ep 1", + 'series': "Gruen", + 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', + 'upload_date': '20190925', + 'uploader_id': 'abc1', + 'timestamp': 1569445289, }, 'params': { 'skip_download': True, @@ -148,7 +148,7 @@ class ABCIViewIE(InfoExtractor): 'hdnea': token, }) - for sd in ('sd', 'sd-low'): + for sd in ('720', 'sd', 'sd-low'): sd_url = try_get( stream, lambda x: x['streams']['hls'][sd], compat_str) if not sd_url: diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index c050bf9df..fe42821c7 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re 
from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( encode_base_n, ExtractorError, @@ -55,7 +54,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(compat_str(urlh.geturl())) + video_id = self._match_id(urlh.geturl()) hash = self._search_regex( r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1cab440f4..64d1fa251 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -497,7 +497,6 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE -from .jpopsukitv import JpopsukiIE from .kakao import KakaoIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -850,6 +849,7 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) +from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b8fa17588..306b45fc9 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -31,7 +31,13 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_data = extract_attributes(self._search_regex( - r'(?s)]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(]+data-asset-source="[^"]+"[^>]+>)', + r'''(?sx) + (?: + | + ]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> + ).*? 
+ (]+data-asset-source="[^"]+"[^>]+>) + ''', webpage, 'video data')) video_url = video_data['data-asset-source'] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3c002472f..d1ec56be9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2287,7 +2287,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = compat_str(head_response.geturl()) + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2387,12 +2387,12 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=compat_str(full_response.geturl())), + xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, - mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], + mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2537,14 +2537,15 @@ class GenericIE(InfoExtractor): dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) # Look for embedded Wistia player - wistia_url = WistiaIE._extract_url(webpage) - if wistia_url: - return { - '_type': 'url_transparent', - 'url': self._proto_relative_url(wistia_url), - 'ie_key': WistiaIE.ie_key(), - 'uploader': video_uploader, - } + wistia_urls = WistiaIE._extract_urls(webpage) + if wistia_urls: + playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) + for entry in playlist['entries']: + entry.update({ + '_type': 'url_transparent', + 'uploader': video_uploader, + }) + return playlist # Look for SVT player svt_url = SVTIE._extract_url(webpage) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 436759da5..a31301985 100644 --- a/youtube_dl/extractor/imdb.py +++ 
b/youtube_dl/extractor/imdb.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import base64 +import json import re from .common import InfoExtractor @@ -8,6 +10,7 @@ from ..utils import ( mimetype2ext, parse_duration, qualities, + try_get, url_or_none, ) @@ -15,15 +18,16 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'No. 2 from Ice Age: Continental Drift (2012)', + 'title': 'No. 2', 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', + 'duration': 152, } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -47,21 +51,23 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, - 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] + + data = self._download_json( + 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, + query={ + 'key': base64.b64encode(json.dumps({ + 'type': 'VIDEO_PLAYER', + 'subType': 'FORCE_LEGACY', + 'id': 'vi%s' % video_id, + }).encode()).decode(), + })[0] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for encoding in video_metadata.get('encodings', []): + for encoding in data['videoLegacyEncodings']: if not encoding or not isinstance(encoding, dict): continue - video_url = 
url_or_none(encoding.get('videoUrl')) + video_url = url_or_none(encoding.get('url')) if not video_url: continue ext = mimetype2ext(encoding.get( @@ -69,7 +75,7 @@ class ImdbIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + preference=1, m3u8_id='hls', fatal=False)) continue format_id = encoding.get('definition') formats.append({ @@ -80,13 +86,33 @@ class ImdbIE(InfoExtractor): }) self._sort_formats(formats) + webpage = self._download_webpage( + 'https://www.imdb.com/video/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, + 'video metadata'), video_id) + + video_info = video_metadata.get('VIDEO_INFO') + if video_info and isinstance(video_info, dict): + info = try_get( + video_info, lambda x: x[list(video_info.keys())[0]][0], dict) + else: + info = {} + + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'(.+?)', webpage, 'title', + default=None) or info['videoTitle'] + return { 'id': video_id, 'title': title, + 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': video_metadata.get('description'), - 'thumbnail': video_metadata.get('slate', {}).get('url'), - 'duration': parse_duration(video_metadata.get('duration')), + 'description': info.get('videoDescription'), + 'thumbnail': url_or_none(try_get( + video_metadata, lambda x: x['videoSlate']['source'])), + 'duration': parse_duration(info.get('videoRuntime')), } diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py deleted file mode 100644 index 4b5f346d1..000000000 --- a/youtube_dl/extractor/jpopsukitv.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class 
JpopsukiIE(InfoExtractor): - IE_NAME = 'jpopsuki.tv' - _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P\S+)' - - _TEST = { - 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', - 'md5': '88018c0c1a9b1387940e90ec9e7e198e', - 'info_dict': { - 'id': '00be659d23b0b40508169cdee4545771', - 'ext': 'mp4', - 'title': 'ayumi hamasaki - evolution', - 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', - 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', - 'uploader': 'plama_chan', - 'uploader_id': '404', - 'upload_date': '20121101' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = 'http://www.jpopsuki.tv' + self._html_search_regex( - r'from: uploaded: (.*?)', webpage, 'video upload_date', - fatal=False)) - view_count_str = self._html_search_regex( - r'
  • Hits: ([0-9]+?)
  • ', webpage, 'video view_count', - fatal=False) - comment_count_str = self._html_search_regex( - r'

    ([0-9]+?) comments

    ', webpage, 'video comment_count', - fatal=False) - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'view_count': int_or_none(view_count_str), - 'comment_count': int_or_none(comment_count_str), - } diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 6ed7da4ab..1b2dcef46 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, determine_ext, @@ -36,7 +35,7 @@ class LecturioBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in compat_str(url_handle.geturl()) + return self._LOGIN_URL not in url_handle.geturl() # Already logged in if is_logged(urlh): diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py index a78c6556e..23ca965d9 100644 --- a/youtube_dl/extractor/linuxacademy.py +++ b/youtube_dl/extractor/linuxacademy.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, - compat_str, ) from ..utils import ( ExtractorError, @@ -99,7 +98,7 @@ class LinuxAcademyIE(InfoExtractor): 'sso': 'true', }) - login_state_url = compat_str(urlh.geturl()) + login_state_url = urlh.geturl() try: login_page = self._download_webpage( @@ -129,7 +128,7 @@ class LinuxAcademyIE(InfoExtractor): }) access_token = self._search_regex( - r'access_token=([^=&]+)', compat_str(urlh.geturl()), + r'access_token=([^=&]+)', urlh.geturl(), 'access token') self._download_webpage( diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 027a790b8..933df1495 100644 --- a/youtube_dl/extractor/mediaset.py +++ 
b/youtube_dl/extractor/mediaset.py @@ -6,7 +6,6 @@ import re from .theplatform import ThePlatformBaseIE from ..compat import ( compat_parse_qs, - compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -114,7 +113,7 @@ class MediasetIE(ThePlatformBaseIE): continue urlh = ie._request_webpage( embed_url, video_id, note='Following embed URL redirect') - embed_url = compat_str(urlh.geturl()) + embed_url = urlh.geturl() program_guid = _program_guid(_qs(embed_url)) if program_guid: entries.append(embed_url) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 694a264d6..d6eb15740 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -129,7 +129,7 @@ class MediasiteIE(InfoExtractor): query = mobj.group('query') webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() # XXX: might have also extracted UrlReferrer and QueryString from the html service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 901f44b54..2850af5db 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -18,7 +18,7 @@ class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P[^/?#&]+)' _TEST = { 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'b3834f6de5401baabf31ed57456463f7', + 'md5': 'ee009bafcc794541570edd44b71cbea3', 'info_dict': { 'id': '8o0n0r', 'ext': 'mp4', @@ -44,11 +44,17 @@ class NovaEmbedIE(InfoExtractor): formats = [] for format_id, format_list in bitrates.items(): if not isinstance(format_list, list): - continue + format_list = [format_list] for format_url in format_list: format_url = url_or_none(format_url) if not format_url: continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 
ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue f = { 'url': format_url, } @@ -91,7 +97,7 @@ class NovaIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?(?Ptv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', - 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', + 'md5': '249baab7d0104e186e78b0899c7d5f28', 'info_dict': { 'id': '1757139', 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', @@ -113,7 +119,8 @@ class NovaIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'gone', }, { # media.cms.nova.cz embed 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', @@ -128,6 +135,7 @@ class NovaIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [NovaEmbedIE.ie_key()], + 'skip': 'CHYBA 404: STRÁNKA NENALEZENA', }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -152,14 +160,29 @@ class NovaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + description = clean_html(self._og_search_description(webpage, default=None)) + if site == 'novaplus': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) + else: + upload_date = None + # novaplus embed_id = self._search_regex( r']+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', webpage, 'embed url', default=None) if embed_id: - return self.url_result( - 'https://media.cms.nova.cz/embed/%s' % embed_id, - ie=NovaEmbedIE.ie_key(), video_id=embed_id) + return { + '_type': 'url_transparent', + 'url': 
'https://media.cms.nova.cz/embed/%s' % embed_id, + 'ie_key': NovaEmbedIE.ie_key(), + 'id': embed_id, + 'description': description, + 'upload_date': upload_date + } video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", @@ -233,18 +256,8 @@ class NovaIE(InfoExtractor): self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) - description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') - if site == 'novaplus': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) - elif site == 'fanda': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) - else: - upload_date = None - return { 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index a5e8baa7e..53acc6e57 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, + url_or_none, ) @@ -48,6 +49,10 @@ class NprIE(InfoExtractor): }, }], 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # multimedia, no formats, stream + 'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', + 'only_matching': True, }] def _real_extract(self, url): @@ -95,6 +100,17 @@ class NprIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id), }) + for stream_id, stream_entry in media.get('stream', {}).items(): + if not isinstance(stream_entry, dict): + continue + if stream_id != 'hlsUrl': + continue + stream_url = url_or_none(stream_entry.get('$text')) + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + stream_url, stream_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) entries.append({ diff --git 
a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 2bb77ab24..fc78ca56c 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -69,10 +69,10 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), - 'tbr': int_or_none(video.get('bitrate'), 1000), + 'tbr': int_or_none(video.get('bitrate'), 1000) or None, 'ext': ext, }) - self._sort_formats(formats) + self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id')) thumbnails = [] for image in video_data.get('images', []): diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d3a83ea2b..48fb95416 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, + str_or_none, try_get, unified_timestamp, url_or_none, @@ -415,6 +416,7 @@ class PeerTubeIE(InfoExtractor): peertube\.cpy\.re )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P[^:]+):| @@ -423,26 +425,30 @@ class PeerTubeIE(InfoExtractor): (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', - 'md5': '80f24ff364cc9d333529506a263e7feb', + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', 'info_dict': { - 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', - 'title': 'wow', - 'description': 'wow such video, so gif', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 
'timestamp': 1519297480, - 'upload_date': '20180222', - 'uploader': 'Luclu7', - 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', - 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', - 'license': 'Unknown', - 'duration': 3, + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, 'view_count': int, 'like_count': int, 'dislike_count': int, - 'tags': list, - 'categories': list, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', @@ -484,13 +490,38 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return + subtitles = {} + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') - video = self._download_json( - 
'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + video = self._call_api( + host, video_id, '', note='Downloading video JSON') title = video['name'] @@ -513,10 +544,28 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - def account_data(field): - return try_get(video, lambda x: x['account'][field], compat_str) + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) - category = try_get(video, lambda x: x['category']['label'], compat_str) + description = None + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -528,14 +577,17 @@ class PeerTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video.get('description'), + 'description': description, 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName'), - 'uploader_id': account_data('uuid'), - 'uploder_url': account_data('url'), - 'license': try_get( - video, lambda x: x['licence']['label'], compat_str), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 
'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -544,4 +596,5 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, + 'subtitles': subtitles } diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index 602207beb..23c8256b5 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -46,7 +46,7 @@ class PlatziBaseIE(InfoExtractor): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in compat_str(urlh.geturl()): + if 'platzi.com/login' not in urlh.geturl(): return login_error = self._webpage_read_content( diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py new file mode 100644 index 000000000..7bf7f9858 --- /dev/null +++ b/youtube_dl/extractor/popcorntimes.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_chr, +) +from ..utils import int_or_none + + +class PopcorntimesIE(InfoExtractor): + _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', + 'md5': '93f210991ad94ba8c3485950a2453257', + 'info_dict': { + 'id': 'A1XCFvz', + 'display_id': 'haensel-und-gretel-opera-fantasy', + 'ext': 'mp4', + 'title': 'Hänsel und Gretel', + 'description': 'md5:1b8146791726342e7b22ce8125cf6945', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'John Paul', + 'release_date': '19541009', + 'duration': 4260, + 'tbr': 5380, + 'width': 720, + 'height': 540, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = 
mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'

    ([^<]+)', webpage, 'title', + default=None) or self._html_search_meta( + 'ya:ovs:original_name', webpage, 'title', fatal=True) + + loc = self._search_regex( + r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', + group='value') + + loc_b64 = '' + for c in loc: + c_ord = ord(c) + if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): + upper = ord('Z') if c_ord <= ord('Z') else ord('z') + c_ord += 13 + if upper < c_ord: + c_ord -= 26 + loc_b64 += compat_chr(c_ord) + + video_url = compat_b64decode(loc_b64).decode('utf-8') + + description = self._html_search_regex( + r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 27d65d4b9..c6052ac9f 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, int_or_none, js_to_json, + merge_dicts, urljoin, ) @@ -27,23 +28,22 @@ class PornHdIE(InfoExtractor): 'view_count': int, 
'like_count': int, 'age_limit': 18, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { - # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'md5': '956b8ca569f7f4d8ec563e2c41598441', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', - 'title': 'Sierra loves doing laundry', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 'age_limit': 18, }, - 'skip': 'Not available anymore', }] def _real_extract(self, url): @@ -61,7 +61,13 @@ class PornHdIE(InfoExtractor): r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) + info = {} if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: message = self._html_search_regex( r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class="description"[^>]*>(?P[^<]+)]+class=["\']video-description[^>]+>(?P.+?)', + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, - 'thumbnail', fatal=False, group='url') + 'thumbnail', default=None, group='url') like_count = int_or_none(self._search_regex( - (r'(\d+)\s*]+>(?: |\s)*\blikes', + (r'(\d+)\s*likes', + r'(\d+)\s*]+>(?: |\s)*\blikes', r'class=["\']save-count["\'][^>]*>\s*(\d+)'), webpage, 'like count', fatal=False)) - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -106,4 +118,4 @@ class PornHdIE(InfoExtractor): 'like_count': like_count, 'formats': formats, 'age_limit': 18, - } + }) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b3251ccd9..b8f65af7c 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -189,10 +189,10 @@ class 
PornHubIE(PornHubBaseIE): # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._search_regex( - (r']+class=["\']title["\'][^>]*>(?P[^<]+)', - r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', - r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), webpage, 'title', group='title') video_urls = [] diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 4942437c7..2cc665122 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_str, compat_urlparse, ) from ..utils import ( @@ -39,13 +38,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) + return 'learning.oreilly.com/home/' in urlh.geturl() if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index e579d42cf..9401bf2cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,9 +7,18 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P<id>[aA]{2}-\w+|\d+-\d+) + ''' _TESTS = [{ - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -18,6 +27,10 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index a3c35a899..378fc7568 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -13,36 +13,18 @@ from ..utils import ( class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])' _TESTS = [{ - 'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', 'ext': 'mp4', - 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', - 'categories': ['Badminton'], + 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', + 'categories': ['Badminton-Deutschland'], 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'timestamp': 
int, - 'upload_date': 're:^201408[23][0-9]$', + 'upload_date': '20200201', + 'description': 're:.*', # meaningless description for THIS video }, - 'params': { - 'skip_download': 'Live stream', - }, - }, { - 'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'info_dict': { - 'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'ext': 'mp4', - 'upload_date': '20140825', - 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', - 'timestamp': 1408976060, - 'duration': 2732, - 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', - 'thumbnail': r're:^https?://.*\.jpg$', - 'view_count': int, - 'categories': ['Li-Ning Badminton WM 2014'], - - } }] def _real_extract(self, url): @@ -50,7 +32,7 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( + api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) req = sanitized_Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 6b7f13b43..4316a6962 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from .wistia import WistiaIE -from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, @@ -58,7 +57,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = compat_str(urlh.geturl()) + login_url = urlh.geturl() login_form = self._hidden_inputs(login_page) @@ -160,8 +159,8 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_url 
= WistiaIE._extract_url(webpage) - if not wistia_url: + wistia_urls = WistiaIE._extract_urls(webpage) + if not wistia_urls: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', r'>\s*Lecture contents locked', @@ -174,12 +173,14 @@ class TeachableIE(TeachableBaseIE): title = self._og_search_title(webpage, default=None) - return { + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, - } + } for wistia_url in wistia_urls] + + return self.playlist_result(entries, video_id, title) class TeachableCourseIE(TeachableBaseIE): diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d37e1b055..9ba3da341 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, int_or_none, str_or_none, + try_get, urljoin, ) @@ -24,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'info_dict': { 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', - 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', + 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -55,6 +56,26 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, + }, { + # video in opening's content + 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', + 'info_dict': { + 'id': '2907195140', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'description': 'md5:73f340a7320143d37ab895375b2bf13a', + }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'TpI2EttSDAReWpJ1o0NVh2', + 'ext': 'mp4', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'duration': 1015, + }, + }], + 'params': { + 
'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, @@ -135,17 +156,28 @@ class TelecincoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = article.get('title') - description = clean_html(article.get('leadParagraph')) + description = clean_html(article.get('leadParagraph')) or '' if article.get('editorialType') != 'VID': entries = [] - for p in article.get('body', []): - content = p.get('content') - if p.get('type') != 'video' or not content: + body = [article.get('opening')] + body.extend(try_get(article, lambda x: x['body'], list) or []) + for p in body: + if not isinstance(p, dict): continue - entries.append(self._parse_content(content, url)) + content = p.get('content') + if not content: + continue + type_ = p.get('type') + if type_ == 'paragraph': + content_str = str_or_none(content) + if content_str: + description += content_str + continue + if type_ == 'video' and isinstance(content, dict): + entries.append(self._parse_content(content, url)) return self.playlist_result( entries, str_or_none(article.get('id')), title, description) content = article['opening']['content'] diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index 6ab147ad7..387f955ee 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -2,43 +2,42 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import try_get class ThisOldHouseIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', - 'md5': '568acf9ca25a639f0c4ff905826b662f', 'info_dict': { - 'id': '2REGtUDQ', + 'id': '5dcdddf673c3f956ef5db202', 'ext': 'mp4', 'title': 'How to Build a Storage Bench', 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', 'timestamp': 1442548800, 'upload_date': '20150918', - } + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', 'only_matching': True, }, { 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', 'only_matching': True, + }, { + 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', + 'only_matching': True, + }, { + 'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', + 'only_matching': True, }] + _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', - r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'), - webpage, 'video id', default=None, group='id') - if not video_id: - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = try_get( - drupal_settings, lambda x: x['jwplatform']['video_id'], - compat_str) or list(drupal_settings['comScore'])[0] - return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) + 
r'<iframe[^>]+src=[\'"](?:https?:)?//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})', + webpage, 'video id') + return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 5e5efda0f..ca2e36efe 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,9 +17,9 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { 'id': '343115', 'ext': 'mp4', @@ -33,7 +33,7 @@ class ToggleIE(InfoExtractor): } }, { 'note': 'DRM-protected video', - 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', + 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', 'info_dict': { 'id': '341413', 'ext': 'wvm', @@ -48,7 +48,7 @@ class ToggleIE(InfoExtractor): }, { # this also tests correct video id extraction 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', - 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', 'info_dict': { 'id': '332861', 'ext': 'mp4', @@ -65,19 +65,22 @@ class ToggleIE(InfoExtractor): 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 
'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', + 'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', 'only_matching': True, }, { - 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'url': 'http://www.mewatch.sg/en/movies/seven-days/321936', 'only_matching': True, }, { - 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585', 'only_matching': True, }] diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index edbb0aa69..ae584ad69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -151,7 +150,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): raise ExtractorError( 'This Tumblr may contain sensitive media. 
' diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 611fdc0c6..8bda9348d 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -106,7 +106,7 @@ class TV2DKBornholmPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, + 'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, data=json.dumps({ 'playlist_id': video_id, 'serienavn': '', diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py index 88b6baa31..b7fe082b9 100644 --- a/youtube_dl/extractor/tv5mondeplus.py +++ b/youtube_dl/extractor/tv5mondeplus.py @@ -3,31 +3,51 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, extract_attributes, - get_element_by_class, int_or_none, parse_duration, - parse_iso8601, ) class TV5MondePlusIE(InfoExtractor): IE_DESC = 'TV5MONDE+' - _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', - 'md5': '12130fc199f020673138a83466542ec6', + _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TESTS = [{ + # movie + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit', + 'md5': '8cbde5ea7b296cf635073e27895e227f', 'info_dict': { - 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'id': '822a4756-0712-7329-1859-a13ac7fd1407', + 'display_id': 'rendez-vous-a-atlit', 'ext': 'mp4', - 'title': 'Tdah, mon amour - Enfants', - 'description': 'md5:230e3aca23115afcf8006d1bece6df74', - 'upload_date': '20170401', - 'timestamp': 1491022860, - } - } + 'title': 'Rendez-vous à Atlit', + 'description': 
'md5:2893a4c5e1dbac3eedff2d87956e4efb', + 'upload_date': '20200130', + }, + }, { + # series episode + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree', + 'info_dict': { + 'id': '0df7007c-4900-3936-c601-87a13a93a068', + 'display_id': 'c-est-la-vie-ennemie-juree', + 'ext': 'mp4', + 'title': "C'est la vie - Ennemie jurée", + 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e', + 'upload_date': '20200130', + 'series': "C'est la vie", + 'episode': 'Ennemie jurée', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', + 'only_matching': True, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', + 'only_matching': True, + }] _GEO_BYPASS = False def _real_extract(self, url): @@ -37,11 +57,7 @@ class TV5MondePlusIE(InfoExtractor): if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: self.raise_geo_restricted(countries=['FR']) - series = get_element_by_class('video-detail__title', webpage) - title = episode = get_element_by_class( - 'video-detail__subtitle', webpage) or series - if series and series != title: - title = '%s - %s' % (series, title) + title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') vpl_data = extract_attributes(self._search_regex( r'(<[^>]+class="video_player_loader"[^>]+>)', webpage, 'video player loader')) @@ -65,15 +81,37 @@ class TV5MondePlusIE(InfoExtractor): }) self._sort_formats(formats) + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage, + 'description', fatal=False) + + series = self._html_search_regex( + r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage, + 'series', default=None) + + if series and series != title: + title = '%s - %s' % (series, title) + + upload_date = 
self._search_regex( + r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', + webpage, 'upload date', default=None) + if upload_date: + upload_date = upload_date.replace('_', '') + + video_id = self._search_regex( + (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', + default=display_id) + return { - 'id': display_id, + 'id': video_id, 'display_id': display_id, 'title': title, - 'description': clean_html(get_element_by_class('video-detail__description', webpage)), + 'description': description, 'thumbnail': vpl_data.get('data-image'), 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), + 'upload_date': upload_date, 'formats': formats, - 'episode': episode, 'series': series, + 'episode': episode, } diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 0b863df2f..443f46e8a 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -9,8 +9,8 @@ from ..utils import ( class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)' + _TESTS = [{ 'url': 'https://videos.tva.ca/details/_5596811470001', 'info_dict': { 'id': '5596811470001', @@ -24,7 +24,10 @@ class TVAIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://video.tva.ca/details/_5596811470001', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 2830c212e..74d14049b 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -17,7 +17,7 
@@ class TwentyFourVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?P<host> - (?:(?:www|porno)\.)?24video\. + (?:(?:www|porno?)\.)?24video\. (?:net|me|xxx|sexy?|tube|adult|site|vip) )/ (?: @@ -62,6 +62,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://www.24video.vip/video/view/1044982', 'only_matching': True, + }, { + 'url': 'https://porn.24video.net/video/2640421-vsya-takay', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a8c2502af..0db2dca41 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -575,8 +575,8 @@ class TwitchStreamIE(TwitchBaseIE): channel_id = self._match_id(url) stream = self._call_api( - 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, - 'Downloading stream JSON').get('stream') + 'kraken/streams/%s?stream_type=all' % channel_id.lower(), + channel_id, 'Downloading stream JSON').get('stream') if not stream: raise ExtractorError('%s is offline' % channel_id, expected=True) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f378aa283..cea686afc 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -33,6 +33,7 @@ from ..utils import ( unified_timestamp, unsmuggle_url, urlencode_postdata, + urljoin, unescapeHTML, ) @@ -191,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], + 'url': urljoin('https://vimeo.com', tt['url']), }] thumbnails = [] @@ -584,14 +585,14 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id elif is_player: url = 'https://player.vimeo.com/video/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf', '/album/', '/showcase/')): url = 'https://vimeo.com/' + video_id try: # Retrieve video 
webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 085514d47..168e5e901 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -45,22 +45,23 @@ class WistiaIE(InfoExtractor): # https://wistia.com/support/embed-and-share/video-on-your-website @staticmethod def _extract_url(webpage): - match = re.search( - r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage) - if match: - return unescapeHTML(match.group('url')) + urls = WistiaIE._extract_urls(webpage) + return urls[0] if urls else None - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2 - ''', webpage) - if match: - return 'wistia:%s' % match.group('id') - - match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage) - if match: - return 'wistia:%s' % match.group('id') + @staticmethod + def _extract_urls(webpage): + urls = [] + for match in re.finditer( + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): + urls.append(unescapeHTML(match.group('url'))) + for match in re.finditer( + r'''(?sx) + <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2 + ''', webpage): + urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): + urls.append('wistia:%s' % match.group('id')) + return urls def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index a5b94d279..0f7be6a7d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -113,7 +113,7 @@ class XHamsterIE(InfoExtractor): display_id = mobj.group('display_id') or mobj.group('display_id_2') desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + webpage, urlh = self._download_webpage_handle(desktop_url, video_id) error = self._html_search_regex( r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -161,6 +161,9 @@ class XHamsterIE(InfoExtractor): 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), 'filesize': filesize, + 'http_headers': { + 'Referer': urlh.geturl(), + }, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index c6c0b3291..47caec1de 100644 --- a/youtube_dl/extractor/xtube.py +++ 
b/youtube_dl/extractor/xtube.py @@ -47,7 +47,7 @@ class XTubeIE(InfoExtractor): 'display_id': 'A-Super-Run-Part-1-YT', 'ext': 'flv', 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', 'uploader': 'tshirtguy59', 'duration': 579, 'view_count': int, @@ -87,10 +87,24 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - sources = self._parse_json(self._search_regex( - r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id, - transform_source=js_to_json) + title, thumbnail, duration = [None] * 3 + + config = self._parse_json(self._search_regex( + r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + default='{}'), video_id, transform_source=js_to_json, fatal=False) + if config: + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') + + if isinstance(sources, dict): + sources = self._parse_json(self._search_regex( + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -102,20 +116,25 @@ class XTubeIE(InfoExtractor): self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex( - (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), - webpage, 'title', group='title') - description = self._search_regex( + if not title: + title = self._search_regex( + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, default=None) or self._search_regex( 
r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) uploader = self._search_regex( (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', r'<span[^>]+class="nickname"[^>]*>([^<]+)'), webpage, 'uploader', fatal=False) - duration = parse_duration(self._search_regex( - r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', - webpage, 'duration', fatal=False)) + if not duration: + duration = parse_duration(self._search_regex( + r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( - r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>', + (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', + r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'), webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( r'>Comments? \(([\d,\.]+)\)<', @@ -126,6 +145,7 @@ class XTubeIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'uploader': uploader, 'duration': duration, 'view_count': view_count, @@ -144,7 +164,7 @@ class XTubeUserIE(InfoExtractor): 'id': 'greenshowers-4056496', 'age_limit': 18, }, - 'playlist_mincount': 155, + 'playlist_mincount': 154, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index dff69fcb7..88aabd272 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -44,7 +44,7 @@ class YouJizzIE(InfoExtractor): encodings = self._parse_json( self._search_regex( - r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', default='[]'), video_id, fatal=False) for encoding in encodings: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eacaa5ecd..e06290427 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2495,20 +2495,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = _VIDEO_RE_TPL % 
r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2518,7 +2521,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index bacb82eee..f6496f516 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -29,7 +29,6 @@ class ZapiksIE(InfoExtractor): 'timestamp': 1359044972, 'upload_date': '20130124', 'view_count': int, - 'comment_count': int, }, }, { diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 145c123a4..656864b2e 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -244,14 +244,14 @@ class ZDFChannelIE(ZDFBaseIE): 'id': 'das-aktuelle-sportstudio', 'title': 'das aktuelle sportstudio | ZDF', }, - 'playlist_count': 21, + 'playlist_mincount': 23, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { 'id': 'planet-e', 'title': 'planet e.', }, - 'playlist_count': 4, + 'playlist_mincount': 50, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 1ffabc62b..8826b382c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -134,7 +134,7 @@ def parseOpts(overrideArguments=None): action='help', help='Print this help text and exit') general.add_option( - '-v', '--version', + '--version', action='version', help='Print program version and exit') general.add_option( diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 002ea7f33..84c964617 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -9,6 +9,7 @@ import subprocess import sys from zipimport import zipimporter +from .compat import compat_realpath from .utils import 
encode_compat_str from .version import __version__ @@ -84,7 +85,9 @@ def update_self(to_screen, verbose, opener): print_notes(to_screen, versions_info['versions']) # sys.executable is set to the full pathname of the exe-file for py2exe - filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0] + # though symlinks are not followed so that we need to do this manually + # with help of realpath + filename = compat_realpath(sys.executable if hasattr(sys, 'frozen') else sys.argv[0]) if not os.access(filename, os.W_OK): to_screen('ERROR: no write permissions on %s' % filename) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a..8ccf25489 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2795,6 +2795,15 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): + if sys.version_info[0] < 3: + def redirect_request(self, req, fp, code, msg, headers, newurl): + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + + def extract_timezone(date_str): m = re.search( r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fa6f7289a..fabc1e543 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.01.24' +__version__ = '2020.03.01'