From 9b9b3501c5bee18d608dd2961a80936667f8ece2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 16 Nov 2018 22:55:35 +0700 Subject: [PATCH 001/226] [tnaflixnetwork:embed] Fix extraction (closes #18205) --- youtube_dl/extractor/tnaflix.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 0c2f8f119..6798ef4c3 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -18,8 +18,9 @@ from ..utils import ( class TNAFlixNetworkBaseIE(InfoExtractor): # May be overridden in descendants if necessary _CONFIG_REGEX = [ - r'flashvars\.config\s*=\s*escape\("([^"]+)"', - r']+name="config\d?" value="([^"]+)"', + r'flashvars\.config\s*=\s*escape\("(?P[^"]+)"', + r']+name="config\d?" value="(?P[^"]+)"', + r'config\s*=\s*(["\'])(?P(?:https?:)?//(?:(?!\1).)+)\1', ] _HOST = 'tna' _VKEY_SUFFIX = '' @@ -85,7 +86,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:') + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, + group='url'), 'http:') if not cfg_url: inputs = self._hidden_inputs(webpage) From 2599956c9ff0162e7afbddebb00d73eea6b0403c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Nov 2018 00:07:59 +0700 Subject: [PATCH 002/226] [rte] Add support for new API endpoint (closes #18206) --- youtube_dl/extractor/rte.py | 129 +++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index a6fac6c35..1fbc72915 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -8,7 +8,10 @@ from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, + str_or_none, + try_get, unescapeHTML, + url_or_none, ExtractorError, ) @@ -17,65 +20,87 @@ class RteBaseIE(InfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - try: - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) - if error_info: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error_info['message']), - expected=True) - raise - - # NB the string values in the JSON are stored using XML escaping(!) - show = json_string['shows'][0] - title = unescapeHTML(show['title']) - description = unescapeHTML(show.get('description')) - thumbnail = show.get('thumbnail') - duration = float_or_none(show.get('duration'), 1000) - timestamp = parse_iso8601(show.get('published')) - - mg = show['media:group'][0] - + info_dict = {} formats = [] - if mg.get('url'): - m = re.match(r'(?Prtmpe?://[^/]+)/(?P.+)/(?Pmp4:.*)', mg['url']) - if m: - m = m.groupdict() - formats.append({ - 'url': m['url'] + '/' + m['app'], - 'app': m['app'], - 'play_path': m['playpath'], - 'player_url': url, - 'ext': 'flv', - 'format_id': 'rtmp', - }) + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) - if mg.get('hls_server') and mg.get('hls_url'): - formats.extend(self._extract_m3u8_formats( - mg['hls_server'] + mg['hls_url'], item_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise - if mg.get('hds_server') and mg.get('hds_url'): - formats.extend(self._extract_f4m_formats( - mg['hds_server'] + mg['hds_url'], item_id, - f4m_id='hds', fatal=False)) + # NB the string values in the JSON are stored using XML escaping(!) + show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?Prtmpe?://[^/]+)/(?P.+)/(?Pmp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) - return { - 'id': item_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } + info_dict['formats'] = formats + return info_dict class RteIE(RteBaseIE): From 0919cd4d011d0545a6afadfab0b71de8c0a4fe02 Mon Sep 17 00:00:00 2001 From: NeroBurner Date: Fri, 16 Nov 2018 18:18:50 +0100 Subject: [PATCH 003/226] [atvat] Fix extraction (closes #18041) --- youtube_dl/extractor/atvat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py index 1584d53fc..95e572d70 100644 --- a/youtube_dl/extractor/atvat.py +++ b/youtube_dl/extractor/atvat.py @@ -28,8 +28,10 @@ class ATVAtIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_data = self._parse_json(unescapeHTML(self._search_regex( - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', - webpage, 'player data')), display_id)['config']['initial_video'] + [r'flashPlayerOptions\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P[^"]+)"'], + webpage, 'player data', group='json')), + display_id)['config']['initial_video'] video_id = video_data['id'] video_title = video_data['title'] From d0058c76d5e14ffe89e8265fa3d984e28e922d78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Nov 2018 16:59:20 +0700 Subject: [PATCH 004/226] [openload] Use original host during extraction (closes #18211) --- youtube_dl/extractor/openload.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 2473536fd..cf51e4770 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,18 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'''(?x) + https?:// + (?P + (?:www\.)? + (?: + openload\.(?:co|io|link)| + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun) + ) + )/ + (?:f|embed)/ + (?P[a-zA-Z0-9-_]+) + ''' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -334,8 +345,11 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + + url_pattern = 'https://%s/%%s/%s/' % (host, video_id) headers = { 'User-Agent': self._USER_AGENT, } @@ -368,7 +382,7 @@ class OpenloadIE(InfoExtractor): r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, 'stream URL')) - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, @@ -379,7 +393,7 @@ class OpenloadIE(InfoExtractor): entry = entries[0] if entries else {} subtitles = entry.get('subtitles') - info_dict = { + return { 'id': video_id, 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), @@ -388,4 +402,3 @@ class OpenloadIE(InfoExtractor): 'subtitles': subtitles, 'http_headers': headers, } - return info_dict From a640c4d226e7b790fe8db43f1c5bdf2358caf839 Mon Sep 17 00:00:00 2001 From: aviperes Date: Sat, 17 Nov 2018 15:59:13 +0200 Subject: [PATCH 005/226] [vk] Detect geo restriction --- youtube_dl/extractor/vk.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index ef8b9bcb7..b52d15ac6 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -293,8 +293,12 @@ class VKIE(VKBaseIE): # This video is no longer available, because its author has been blocked. 'url': 'https://vk.com/video-10639516_456240611', 'only_matching': True, - } - ] + }, + { + # The video is not available in your region. + 'url': 'https://vk.com/video-51812607_171445436', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -354,6 +358,9 @@ class VKIE(VKBaseIE): r'This video is no longer available, because it has been deleted.': 'Video %s is no longer available, because it has been deleted.', + + r'The video .+? is not available in your region.': + 'Video %s is not available in your region.', } for error_re, error_msg in ERRORS.items(): From 11d19ff50393cd195af884c6865aff6d89ed66ac Mon Sep 17 00:00:00 2001 From: mttronc Date: Thu, 6 Sep 2018 15:41:07 +0200 Subject: [PATCH 006/226] [wwe] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wwe.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/wwe.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2b00c86f..87c7d8b0c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1386,6 +1386,7 @@ from .wsj import ( WSJIE, WSJArticleIE, ) +from .wwe import WWEIE from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py new file mode 100644 index 000000000..c471a79f5 --- /dev/null +++ b/youtube_dl/extractor/wwe.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import urljoin + + +class WWEIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?wwe.com/(?:.*/)?videos/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', + 'md5': '30cbc824b51f4010ea885bfcaec76972', + 'info_dict': { + 'id': '40048199', + 'ext': 'mp4', + 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018', + 'description': 'Still fuming after he and his wife Brie Bella were attacked by The Miz and Maryse last week, Daniel Bryan takes care of some unfinished business with Andrade "Cien" Almas.', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + drupal_settings = self._parse_json( + self._html_search_regex( + r'(?s)Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), + display_id) + + player = drupal_settings['WWEVideoLanding']['initialVideo'] + metadata = player['playlist'][0] + + id = compat_str(metadata['nid']) + title = metadata.get('title') or self._og_search_title(webpage) + video_url = 'https:' + metadata['file'] + thumbnail = None + if metadata.get('image') is not None: + thumbnail = urljoin(url, metadata.get('image')) + description = metadata.get('description') + + formats = self._extract_m3u8_formats(video_url, id, 'mp4') + + return { + 'id': id, + 'title': title, + 'formats': formats, + 'url': video_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'description': description, + } From 006374e3aebc3be3f20b7e812c987cdf15b3ae35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Nov 2018 23:59:20 +0700 Subject: [PATCH 007/226] [wwe] Fix issues, extract subtitles and add support for playlists (closes #14781, closes #17450) --- youtube_dl/extractor/wwe.py | 138 +++++++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py index c471a79f5..bebc77bb5 100644 --- a/youtube_dl/extractor/wwe.py +++ b/youtube_dl/extractor/wwe.py @@ -1,20 +1,75 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str -from ..utils import urljoin +from ..utils import ( + try_get, + unescapeHTML, + url_or_none, + urljoin, +) -class WWEIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?wwe.com/(?:.*/)?videos/(?P[\w-]+)' +class WWEBaseIE(InfoExtractor): + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + } + + def _extract_entry(self, data, url, video_id=None): + video_id = compat_str(video_id or data['nid']) + title = data['title'] + + formats = self._extract_m3u8_formats( + data['file'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + description = data.get('description') + thumbnail = urljoin(url, data.get('image')) + series = data.get('show_name') + episode = data.get('episode_name') + + subtitles = {} + tracks = data.get('tracks') + if isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + track_file = url_or_none(track.get('file')) + if not track_file: + continue + label = track.get('label') + lang = self._SUBTITLE_LANGS.get(label, label) or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_file, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'formats': formats, + 'subtitles': subtitles, + } + + +class WWEIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018', - 'md5': '30cbc824b51f4010ea885bfcaec76972', + 'md5': '92811c6a14bfc206f7a6a9c5d9140184', 'info_dict': { 'id': '40048199', 'ext': 'mp4', 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018', - 'description': 'Still fuming after he and his wife Brie Bella were attacked by The Miz and Maryse last week, Daniel Bryan takes care of some unfinished business with Andrade "Cien" Almas.', + 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67', 'thumbnail': r're:^https?://.*\.jpg$', } }, { @@ -26,31 +81,60 @@ class WWEIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json( + landing = self._parse_json( self._html_search_regex( - r'(?s)Drupal\.settings\s*,\s*({.+?})\);', + r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;', webpage, 'drupal settings'), - display_id) + display_id)['WWEVideoLanding'] - player = drupal_settings['WWEVideoLanding']['initialVideo'] - metadata = player['playlist'][0] + data = landing['initialVideo']['playlist'][0] + video_id = landing.get('initialVideoId') - id = compat_str(metadata['nid']) - title = metadata.get('title') or self._og_search_title(webpage) - video_url = 'https:' + metadata['file'] - thumbnail = None - if metadata.get('image') is not None: - thumbnail = urljoin(url, metadata.get('image')) - description = metadata.get('description') + info = self._extract_entry(data, url, video_id) + info['display_id'] = display_id + return info - formats = self._extract_m3u8_formats(video_url, id, 'mp4') - return { - 'id': id, - 'title': title, - 'formats': formats, - 'url': video_url, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - } +class WWEPlaylistIE(WWEBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.wwe.com/shows/raw/2018-11-12', + 'info_dict': { + 'id': '2018-11-12', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition', + 'only_matching': True, + }, { + 'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'data-video\s*=\s*(["\'])(?P{.+?})\1', webpage): + video = self._parse_json( + mobj.group('data'), display_id, transform_source=unescapeHTML, + fatal=False) + if not video: + continue + data = try_get(video, lambda x: x['playlist'][0], dict) + if not data: + continue + try: + entry = self._extract_entry(data, url) + except Exception: + continue + entry['extractor_key'] = WWEIE.ie_key() + entries.append(entry) + + return self.playlist_result(entries, display_id) From 02df855e1339942c00f365dc59a0e418abb4b26f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 00:07:40 +0700 Subject: [PATCH 008/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index fa5de8b04..15daa1bec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlistst (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + version 2018.11.07 Extractors From 5bb04792696c41f66f2114b0cc05b01135fa1f68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 00:11:54 +0700 Subject: [PATCH 009/226] release 2018.11.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7607e0e03..905576364 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.07** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.07 +[debug] youtube-dl version 2018.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 15daa1bec..0083c4631 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.11.18 Extractors + [wwe] Extract subtitles diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 24c3254c3..9009f7e9e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1080,6 +1080,7 @@ - **wrzuta.pl:playlist** - **WSJ**: Wall Street Journal - **WSJArticle** + - **WWE** - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me @@ -1139,3 +1140,4 @@ - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn + - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7f32ad36c..7f5ad7bf4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.07' +__version__ = '2018.11.18' From 4167148fa45e43a93ed202e5923223b7797340ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 01:07:54 +0700 Subject: [PATCH 010/226] [nova:embed] Fix extraction (closes #18222) --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 80186ec50..901f44b54 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -35,7 +35,7 @@ class NovaEmbedIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'), + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), video_id, transform_source=js_to_json) QUALITIES = ('lq', 'mq', 'hq', 'hd') From 1febf99da1924c46a491790639371f4ee9069193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 06:26:08 +0700 Subject: [PATCH 011/226] [pornhub] Add pornhub.net alias --- youtube_dl/extractor/pornhub.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 19eaf389f..7ee64dbf6 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -27,7 +27,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -340,7 +340,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/playlist/(?P\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -355,7 +355,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -393,6 +393,9 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] def _real_extract(self, url): From f97c099131f625104f64a99a02a8c9894620171a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 11:14:46 +0700 Subject: [PATCH 012/226] [pornhub] Move test to correct place --- youtube_dl/extractor/pornhub.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7ee64dbf6..c9c884095 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -121,6 +121,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, }] @staticmethod @@ -393,9 +396,6 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'only_matching': True, - }, { - 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', - 'only_matching': True, }] def _real_extract(self, url): From 964b989dc88c37b027481fb01de835b1e796ba5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Nov 2018 20:44:51 +0700 Subject: [PATCH 013/226] [americastestkitchen] Add support for zype embeds (closes #18225) --- youtube_dl/extractor/americastestkitchen.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 01736872d..8b32aa886 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -43,10 +43,6 @@ class AmericasTestKitchenIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id') - video_data = self._parse_json( self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', @@ -58,7 +54,18 @@ class AmericasTestKitchenIE(InfoExtractor): (lambda x: x['episodeDetail']['content']['data'], lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - external_id = ep_data.get('external_id') or ep_meta['external_id'] + + zype_id = ep_meta.get('zype_id') + if zype_id: + embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id + ie_key = 'Zype' + else: + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + external_id = ep_data.get('external_id') or ep_meta['external_id'] + embed_url = 'kaltura:%s:%s' % (partner_id, external_id) + ie_key = 'Kaltura' title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -72,8 +79,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, external_id), - 'ie_key': 'Kaltura', + 'url': embed_url, + 'ie_key': ie_key, 'title': title, 'description': description, 'thumbnail': thumbnail, From 9b27a78a881bceb9d62f3364399d7572e8e2be24 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 18 Nov 2018 16:13:46 +0100 Subject: [PATCH 014/226] [kaltura] limit requested MediaEntry fields --- youtube_dl/extractor/kaltura.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 04f68fce4..fdf7f5bbc 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -192,6 +192,8 @@ class KalturaIE(InfoExtractor): 'entryId': video_id, 'service': 'baseentry', 'ks': '{1:result:ks}', + 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + 'responseProfile:type': 1, }, { 'action': 'getbyentryid', From 8578ea4dcb17834ee3843e0e337c15af706f9803 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 18 Nov 2018 16:15:10 +0100 Subject: [PATCH 015/226] [bitchute] use _html_search_regex for title extraction --- youtube_dl/extractor/bitchute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 446a1ab19..43b4732aa 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -37,7 +37,7 @@ class BitChuteIE(InfoExtractor): 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', }) - title = self._search_regex( + title = self._html_search_regex( (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'([^<]+)'), webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', From 2e1280ed432257244ea52a47efe6a7f0e226b897 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 19 Nov 2018 18:15:51 +0100 Subject: [PATCH 016/226] [sixplay] fix format extraction --- youtube_dl/extractor/sixplay.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 207ab4477..0c4f865ef 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -64,7 +64,7 @@ class SixPlayIE(InfoExtractor): for asset in clip_data['assets']: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') - if not asset_url or protocol == 'primetime' or asset_url in urls: + if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls: continue urls.append(asset_url) container = asset.get('video_container') @@ -81,19 +81,17 @@ class SixPlayIE(InfoExtractor): if not urlh: continue asset_url = urlh.geturl() - asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - asset_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - asset_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - re.sub(r'/[^/]+\.m3u8', '/Manifest', asset_url), - video_id, ism_id='mss', fatal=False)) + for i in range(3, 0, -1): + asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break else: formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', From 15ed5a27840e748d9f786c50b78a4c6326e9f186 Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Tue, 20 Nov 2018 20:50:40 +0100 Subject: [PATCH 017/226] [nzz] Relax kaltura regex --- youtube_dl/extractor/nzz.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nzz.py b/youtube_dl/extractor/nzz.py index 2d352f53f..61ee77adb 100644 --- a/youtube_dl/extractor/nzz.py +++ b/youtube_dl/extractor/nzz.py @@ -11,20 +11,27 @@ from ..utils import ( class NZZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153', 'info_dict': { 'id': '9153', }, 'playlist_mincount': 6, - } + }, { + 'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112', + 'info_dict': { + 'id': '1368112', + }, + 'playlist_count': 1, + }] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) entries = [] - for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage): + for player_element in re.findall( + r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): player_params = extract_attributes(player_element) if player_params.get('data-type') not in ('kaltura_singleArticle',): self.report_warning('Unsupported player type') From 05bd5e9c77e0e8acb95f47396be4c970fc9f39c4 Mon Sep 17 00:00:00 2001 From: Austin de Coup-Crank <austindcc@gmail.com> Date: Fri, 26 Oct 2018 19:15:44 -0700 Subject: [PATCH 018/226] [ciscolive] Add extractor --- youtube_dl/extractor/ciscolive.py | 136 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/ciscolive.py diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py new file mode 100644 index 000000000..2db7aad2c --- /dev/null +++ b/youtube_dl/extractor/ciscolive.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs +) +from ..utils import ( + clean_html, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveIE(InfoExtractor): + IE_NAME = 'ciscolive' + _VALID_URL = r'(?:https?://)?ciscolive\.cisco\.com/on-demand-library/\??(?P<query>[^#]+)#/(?:session/(?P<id>.+))?$' + _TESTS = [ + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'uploader_id': '5647924234001', + 'upload_date': '20180629', + 'location': '16B Mezz.', + }, + }, + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'md5': '993d4cf051f6174059328b1dce8e94bd', + 'info_dict': { + 'upload_date': '20180629', + 'title': 'DevNet Panel-Applying Design Thinking to Building Products in Cisco', + 'timestamp': 1530316421, + 'uploader_id': '5647924234001', + 'id': '5803751616001', + 'description': 'md5:5f144575cd6848117fe2f756855b038b', + 'location': 'WoS, DevNet Theater', + 'ext': 'mp4', + }, + }, + { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'md5': '80e0c3b87e373fe3a3316b934b8915bf', + 'info_dict': { + 'upload_date': '20180629', + 'title': 'Beating the CCIE Routing & Switching', + 'timestamp': 1530311842, + 'uploader_id': '5647924234001', + 'id': '5803735679001', + 'description': 'md5:e71970799e92d7f5ff57ae23f64b0929', + 'location': 'Tulúm 02', + 'ext': 'mp4', + }, + } + ] + + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_APIPROFILEID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGETID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + def _parse_rf_item(self, rf_item): + ''' Parses metadata and passes to Brightcove extractor ''' + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = int_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'creator': presenter_name, + 'description': description, + 'duration': duration, + 'ie_key': 'BrightcoveNew', + 'location': location, + 'series': event_name, + 'title': title, + 'url': bc_url, + } + + def _check_bc_id_exists(self, rf_item): + ''' Checks for the existence of a Brightcove URL in an API result ''' + bc_id = try_get(rf_item, lambda x: x['videos'][0]['url']) + if bc_id: + if bc_id.strip().isdigit(): + return rf_item + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': self.RAINFOCUS_APIPROFILEID, + 'rfWidgetId': self.RAINFOCUS_WIDGETID, + 'Referer': url, + } + # Single session URL (single video) + if mobj.group('id'): + rf_id = mobj.group('id') + request = self.RAINFOCUS_API_URL % 'session' + data = urlencode_postdata({'id': rf_id}) + rf_result = self._download_json(request, rf_id, data=data, headers=HEADERS) + rf_item = self._check_bc_id_exists(rf_result['items'][0]) + return self._parse_rf_item(rf_item) + else: + # Filter query URL (multiple videos) + rf_query = compat_parse_qs((compat_urllib_parse_urlparse(url).query)) + rf_query['type'] = 'session' + rf_query['size'] = 1000 + data = urlencode_postdata(rf_query) + request = self.RAINFOCUS_API_URL % 'search' + rf_results = self._download_json(request, 'Filter query', data=data, headers=HEADERS) + entries = [ + self._parse_rf_item(rf_item) + for rf_item + in rf_results['sectionList'][0]['items'] + if self._check_bc_id_exists(rf_item) + ] + return self.playlist_result(entries, 'Filter query') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 87c7d8b0c..2c5988a14 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -194,6 +194,7 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE +from .ciscolive import CiscoLiveIE from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE From 6a6d7f064178427d28986884524bd3434f0ca957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 21 Nov 2018 05:25:43 +0700 Subject: [PATCH 019/226] [ciscolive] Fix issues and improve extraction (closes #17984) --- youtube_dl/extractor/ciscolive.py | 176 ++++++++++++++--------------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 87 insertions(+), 94 deletions(-) diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py index 2db7aad2c..32f645713 100644 --- a/youtube_dl/extractor/ciscolive.py +++ b/youtube_dl/extractor/ciscolive.py @@ -1,84 +1,49 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_parse_urlparse, - compat_parse_qs ) from ..utils import ( clean_html, + float_or_none, int_or_none, try_get, urlencode_postdata, ) -class CiscoLiveIE(InfoExtractor): - IE_NAME = 'ciscolive' - _VALID_URL = r'(?:https?://)?ciscolive\.cisco\.com/on-demand-library/\??(?P<query>[^#]+)#/(?:session/(?P<id>.+))?$' - _TESTS = [ - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', - 'md5': 'c98acf395ed9c9f766941c70f5352e22', - 'info_dict': { - 'id': '5803694304001', - 'ext': 'mp4', - 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', - 'description': 'md5:ec4a436019e09a918dec17714803f7cc', - 'timestamp': 1530305395, - 'uploader_id': '5647924234001', - 'upload_date': '20180629', - 'location': '16B Mezz.', - }, - }, - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', - 'md5': '993d4cf051f6174059328b1dce8e94bd', - 'info_dict': { - 'upload_date': '20180629', - 'title': 'DevNet Panel-Applying Design Thinking to Building Products in Cisco', - 'timestamp': 1530316421, - 'uploader_id': '5647924234001', - 'id': '5803751616001', - 'description': 'md5:5f144575cd6848117fe2f756855b038b', - 'location': 'WoS, DevNet Theater', - 'ext': 'mp4', - }, - }, - { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', - 'md5': '80e0c3b87e373fe3a3316b934b8915bf', - 'info_dict': { - 'upload_date': '20180629', - 'title': 'Beating the CCIE Routing & Switching', - 'timestamp': 1530311842, - 'uploader_id': '5647924234001', - 'id': '5803735679001', - 'description': 'md5:e71970799e92d7f5ff57ae23f64b0929', - 'location': 'Tulúm 02', - 'ext': 'mp4', - }, - } - ] - +class CiscoLiveBaseIE(InfoExtractor): # These appear to be constant across all Cisco Live presentations # and are not tied to any user session or event RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' - RAINFOCUS_APIPROFILEID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' - RAINFOCUS_WIDGETID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, data=urlencode_postdata(query), + headers=headers) + def _parse_rf_item(self, rf_item): - ''' Parses metadata and passes to Brightcove extractor ''' event_name = rf_item.get('eventName') title = rf_item['title'] description = clean_html(rf_item.get('abstract')) presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) bc_id = rf_item['videos'][0]['url'] bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id - duration = int_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) location = try_get(rf_item, lambda x: x['times'][0]['room']) if duration: @@ -86,51 +51,76 @@ class CiscoLiveIE(InfoExtractor): return { '_type': 'url_transparent', - 'creator': presenter_name, + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, 'description': description, 'duration': duration, - 'ie_key': 'BrightcoveNew', + 'creator': presenter_name, 'location': location, 'series': event_name, - 'title': title, - 'url': bc_url, } - def _check_bc_id_exists(self, rf_item): - ''' Checks for the existence of a Brightcove URL in an API result ''' - bc_id = try_get(rf_item, lambda x: x['videos'][0]['url']) - if bc_id: - if bc_id.strip().isdigit(): - return rf_item + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/\??[^#]*#/session/(?P<id>[^/?&]+)' + _TEST = { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + 'params': { + 'proxy': '127.0.0.1:8118', + } + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - HEADERS = { - 'Origin': 'https://ciscolive.cisco.com', - 'rfApiProfileId': self.RAINFOCUS_APIPROFILEID, - 'rfWidgetId': self.RAINFOCUS_WIDGETID, - 'Referer': url, + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Filter query', + }, + 'playlist_count': 5, + 'params': { + 'proxy': '127.0.0.1:8118', } - # Single session URL (single video) - if mobj.group('id'): - rf_id = mobj.group('id') - request = self.RAINFOCUS_API_URL % 'session' - data = urlencode_postdata({'id': rf_id}) - rf_result = self._download_json(request, rf_id, data=data, headers=HEADERS) - rf_item = self._check_bc_id_exists(rf_result['items'][0]) - return self._parse_rf_item(rf_item) - else: - # Filter query URL (multiple videos) - rf_query = compat_parse_qs((compat_urllib_parse_urlparse(url).query)) - rf_query['type'] = 'session' - rf_query['size'] = 1000 - data = urlencode_postdata(rf_query) - request = self.RAINFOCUS_API_URL % 'search' - rf_results = self._download_json(request, 'Filter query', data=data, headers=HEADERS) - entries = [ - self._parse_rf_item(rf_item) - for rf_item - in rf_results['sectionList'][0]['items'] - if self._check_bc_id_exists(rf_item) - ] - return self.playlist_result(entries, 'Filter query') + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _real_extract(self, url): + rf_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + rf_query['type'] = 'session' + rf_query['size'] = 1000 + rf_results = self._call_api('search', None, rf_query, url) + entries = [ + self._parse_rf_item(rf_item) + for rf_item + in rf_results['sectionList'][0]['items'] + if self._check_bc_id_exists(rf_item) + ] + return self.playlist_result(entries, playlist_title='Filter query') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2c5988a14..60e6175b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -194,7 +194,10 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .ciscolive import CiscoLiveIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE From 183417a50fd68c0c63b1d0621c6a0b44fbf2ac52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 21 Nov 2018 06:04:34 +0700 Subject: [PATCH 020/226] [ciscolive:search] Add support for pagination --- youtube_dl/extractor/ciscolive.py | 58 ++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py index 32f645713..c99b6ee58 100644 --- a/youtube_dl/extractor/ciscolive.py +++ b/youtube_dl/extractor/ciscolive.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools + from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -29,12 +31,12 @@ class CiscoLiveBaseIE(InfoExtractor): 'rfWidgetId': RAINFOCUS_WIDGET_ID, } - def _call_api(self, ep, rf_id, query, referrer): + def _call_api(self, ep, rf_id, query, referrer, note=None): headers = self.HEADERS.copy() headers['Referer'] = referrer return self._download_json( - self.RAINFOCUS_API_URL % ep, rf_id, data=urlencode_postdata(query), - headers=headers) + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) def _parse_rf_item(self, rf_item): event_name = rf_item.get('eventName') @@ -77,9 +79,6 @@ class CiscoLiveSessionIE(CiscoLiveBaseIE): 'uploader_id': '5647924234001', 'location': '16B Mezz.', }, - 'params': { - 'proxy': '127.0.0.1:8118', - } } def _real_extract(self, url): @@ -93,12 +92,9 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE): _TESTS = [{ 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', 'info_dict': { - 'title': 'Filter query', + 'title': 'Search query', }, 'playlist_count': 5, - 'params': { - 'proxy': '127.0.0.1:8118', - } }, { 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', 'only_matching': True, @@ -112,15 +108,35 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE): def _check_bc_id_exists(rf_item): return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + def _real_extract(self, url): - rf_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - rf_query['type'] = 'session' - rf_query['size'] = 1000 - rf_results = self._call_api('search', None, rf_query, url) - entries = [ - self._parse_rf_item(rf_item) - for rf_item - in rf_results['sectionList'][0]['items'] - if self._check_bc_id_exists(rf_item) - ] - return self.playlist_result(entries, playlist_title='Filter query') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') From 6c882aa8991383e1c39a6457cbde5dcab260bff5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 09:44:10 +0100 Subject: [PATCH 021/226] [loc] relax _VALID_URL regex and improve formats extraction --- youtube_dl/extractor/libraryofcongress.py | 37 +++++++++++++++-------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 40295a30b..1e5c82c66 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -16,7 +16,7 @@ from ..utils import ( class LibraryOfCongressIE(InfoExtractor): IE_NAME = 'loc' IE_DESC = 'Library of Congress' - _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)' _TESTS = [{ # embedded via <div class="media-player" 'url': 'http://loc.gov/item/90716351/', @@ -57,6 +57,12 @@ class LibraryOfCongressIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.loc.gov/item/ihas.200197114/', + 'only_matching': True, + }, { + 'url': 'https://www.loc.gov/item/afc1981005_afs20503/', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,12 +73,13 @@ class LibraryOfCongressIE(InfoExtractor): (r'id=(["\'])media-player-(?P<id>.+?)\1', r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', - r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), + r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1', + r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'), webpage, 'media id', group='id') data = self._download_json( 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, - video_id)['mediaObject'] + media_id)['mediaObject'] derivative = data['derivatives'][0] media_url = derivative['derivativeUrl'] @@ -89,25 +96,29 @@ class LibraryOfCongressIE(InfoExtractor): if ext not in ('mp4', 'mp3'): media_url += '.mp4' if is_video else '.mp3' - if 'vod/mp4:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', + formats = [] + if '/vod/mp4:' in media_url: + formats.append({ + 'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8', 'format_id': 'hls', 'ext': 'mp4', 'protocol': 'm3u8_native', 'quality': 1, - }] - elif 'vod/mp3:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp3:', ''), - 'vcodec': 'none', - }] + }) + http_format = { + 'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url), + 'format_id': 'http', + 'quality': 1, + } + if not is_video: + http_format['vcodec'] = 'none' + formats.append(http_format) download_urls = set() for m in re.finditer( r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): format_id = m.group('id').lower() - if format_id == 'gif': + if format_id in ('gif', 'jpeg'): continue download_url = m.group('url') if download_url in download_urls: From 35328915b5fe5c8915b924cfbc54bbdd6d6d1430 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 09:46:13 +0100 Subject: [PATCH 022/226] [foxsports] fix extraction(closes #17543) --- youtube_dl/extractor/foxsports.py | 17 +++-------------- youtube_dl/extractor/theplatform.py | 5 +++-- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index 985542727..596fded20 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -8,7 +8,7 @@ from ..utils import ( class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)' _TEST = { 'url': 'http://www.foxsports.com/tennessee/video/432609859715', @@ -28,16 +28,5 @@ class FoxSportsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._html_search_regex( - r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", - webpage, 'data player config'), - video_id) - - return self.url_result(smuggle_url(update_url_query( - config['releaseURL'], { - 'mbr': 'true', - 'switch': 'http', - }), {'force_smil_url': True})) + return self.url_result( + 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 181620615..90b351cbb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -343,7 +343,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) entry = self._download_json(real_url, video_id)['entries'][0] - main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None + main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') formats = [] subtitles = {} @@ -356,7 +356,8 @@ class ThePlatformFeedIE(ThePlatformBaseIE): if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - for asset_type in item['plfile$assetTypes']: + file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes'] + for asset_type in file_asset_types: if asset_type in asset_types: continue asset_types.append(asset_type) From 4e33e0792a3e134b494bd71f257a674294cca8d9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:00:50 +0100 Subject: [PATCH 023/226] [loc] update test --- youtube_dl/extractor/libraryofcongress.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 1e5c82c66..03f205144 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -20,12 +20,11 @@ class LibraryOfCongressIE(InfoExtractor): _TESTS = [{ # embedded via <div class="media-player" 'url': 'http://loc.gov/item/90716351/', - 'md5': '353917ff7f0255aa6d4b80a034833de8', + 'md5': '6ec0ae8f07f86731b1b2ff70f046210a', 'info_dict': { 'id': '90716351', 'ext': 'mp4', 'title': "Pa's trip to Mars", - 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 0, 'view_count': int, }, From 6866f2449437eeb0ad93b80e5bf39cf758af7a26 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:08:46 +0100 Subject: [PATCH 024/226] [foxsports] update test --- youtube_dl/extractor/foxsports.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index 596fded20..2b2cb6c6f 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,10 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - smuggle_url, - update_url_query, -) class FoxSportsIE(InfoExtractor): @@ -14,14 +10,19 @@ class FoxSportsIE(InfoExtractor): 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'bwduI3X_TgUB', + 'id': '432609859715', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', - 'upload_date': '20150423', - 'timestamp': 1429761109, + # TODO: fix timestamp + 'upload_date': '19700101', # '20150423', + # 'timestamp': 1429761109, 'uploader': 'NEWA-FNG-FOXSPORTS', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['ThePlatform'], } From a843464a7e0608b679651f913cbd9447a7b928c0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 12:10:06 +0100 Subject: [PATCH 025/226] [nbc] fix NBCNews article extraction(closes #16194) --- youtube_dl/extractor/nbc.py | 91 ++++++++----------------------------- 1 file changed, 19 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 765c46fd2..3282f84ee 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,8 @@ from .theplatform import ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( - find_xpath_attr, smuggle_url, try_get, - unescapeHTML, update_url_query, int_or_none, ) @@ -269,27 +267,14 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ - (?:video/.+?/(?P<id>\d+)| - ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+)) - ''' + _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' _TESTS = [ - { - 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', - 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', - 'info_dict': { - 'id': '52753292', - 'ext': 'flv', - 'title': 'Crew emerges after four-month Mars food study', - 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', - }, - }, { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'p_tweet_snow_140529', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -313,7 +298,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'nn_netcast_150204', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -326,7 +311,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': 'x_lon_vwhorn_150922', + 'id': '529953347624', 'ext': 'mp4', 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -339,7 +324,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': 'tdy_al_space_160420', + 'id': '669831235788', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -352,7 +337,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', + 'id': '314487875924', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', @@ -374,60 +359,22 @@ class NBCNewsIE(ThePlatformIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if video_id is not None: - all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = all_info.find('video') - - return { - 'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': info.find('caption').text, - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } - else: - # "feature" and "nightly-news" pages use theplatform.com - video_id = mobj.group('mpx_id') + video_id = self._match_id(url) + if not video_id.isdigit(): webpage = self._download_webpage(url, video_id) - filter_param = 'byId' - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], - webpage, 'bootstrap json', default=None) - if bootstrap_json: - bootstrap = self._parse_json( - bootstrap_json, video_id, transform_source=unescapeHTML) + data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id) + video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] - info = None - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - elif 'msnbcVideoInfo' in bootstrap: - info = bootstrap['msnbcVideoInfo']['meta'] - elif 'msnbcThePlatform' in bootstrap: - info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] - else: - info = bootstrap - - if 'guid' in info: - video_id = info['guid'] - filter_param = 'byGuid' - elif 'mpxId' in info: - video_id = info['mpxId'] - - return { - '_type': 'url_transparent', - 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), - 'ie_key': 'ThePlatformFeed', - } + return { + '_type': 'url_transparent', + 'id': video_id, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), + 'ie_key': 'ThePlatformFeed', + } class NBCOlympicsIE(InfoExtractor): From af60e81e3c557ace943aab35c1364d3d03d5a3bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:00:38 +0700 Subject: [PATCH 026/226] [setup.py] Add more relevant classifiers --- setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/setup.py b/setup.py index 7dbb5805f..a1a08f1e2 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,8 @@ setup( 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'License :: Public Domain', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', @@ -132,6 +134,12 @@ setup( 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: IronPython', + 'Programming Language :: Python :: Implementation :: Jython', + 'Programming Language :: Python :: Implementation :: PyPy', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, From bd2d553c7b1529f793c2b7343c514a558543fc0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:01:22 +0700 Subject: [PATCH 027/226] [travis] Add python 3.7 build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 92f326860..1ea640071 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,12 @@ env: - YTDL_TEST_SET=download matrix: include: + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true From 157eef3e635230cbba0dd0c74f7115029867533e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:08:41 +0700 Subject: [PATCH 028/226] [setup.py] Add python 3.8 classifier --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a1a08f1e2..dfb669ad2 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ setup( 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: IronPython', From 305ce767d586e8796d873270abf771e69ff5586c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 22 Nov 2018 02:34:35 +0700 Subject: [PATCH 029/226] [travis] Add python 3.8-dev build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 1ea640071..79287ccf6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,12 @@ matrix: - python: 3.7 dist: xenial env: YTDL_TEST_SET=download + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true From 560020da3049bec19e5714e9e24fc90fadd06582 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 23:19:46 +0100 Subject: [PATCH 030/226] [mixcloud] fallback to hardcoded decryption key(closes #18016) --- youtube_dl/extractor/mixcloud.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index b7bccb504..a2d19d3ef 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -161,11 +161,17 @@ class MixcloudIE(InfoExtractor): stream_info = info_json['streamInfo'] formats = [] + def decrypt_url(f_url): + for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): + decrypted_url = self._decrypt_xor_cipher(k, compat_b64decode(f_url)) + if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): + return decrypted_url + for url_key in ('url', 'hlsUrl', 'dashUrl'): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url)) + decrypted = decrypt_url(format_url) if not decrypted: continue if url_key == 'hlsUrl': From 6f2883a2df45ca89d272bc8a0975f09758af5eb3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 21 Nov 2018 23:25:38 +0100 Subject: [PATCH 031/226] [mixcloud] base64 decode before decryption --- youtube_dl/extractor/mixcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a2d19d3ef..bcac13ec5 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -163,7 +163,7 @@ class MixcloudIE(InfoExtractor): def decrypt_url(f_url): for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): - decrypted_url = self._decrypt_xor_cipher(k, compat_b64decode(f_url)) + decrypted_url = self._decrypt_xor_cipher(k, f_url) if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): return decrypted_url @@ -171,7 +171,7 @@ class MixcloudIE(InfoExtractor): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = decrypt_url(format_url) + decrypted = decrypt_url(compat_b64decode(format_url)) if not decrypted: continue if url_key == 'hlsUrl': From 66173211c4177d36612486acfd99fc4634b8004e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:14:43 +0700 Subject: [PATCH 032/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0083c4631..beb002041 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version <unreleased> + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and improve formats extraction ++ [ciscolive] Add support for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + version 2018.11.18 Extractors From d861a9d5814408973e0715bb9160fb7db34fbcd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:16:45 +0700 Subject: [PATCH 033/226] release 2018.11.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 905576364..35cc8d6d0 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.18 +[debug] youtube-dl version 2018.11.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index beb002041..f82c7ea35 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.11.23 Core + [setup.py] Add more relevant classifiers diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9009f7e9e..7d72ad82d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -163,6 +163,8 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** + - **CiscoLiveSearch** + - **CiscoLiveSession** - **CJSW** - **cliphunter** - **Clippit** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7f5ad7bf4..4956365d0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.18' +__version__ = '2018.11.23' From 6864855eb111dbf6e0efe9ed086f48efa1d9f209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 23 Nov 2018 00:43:42 +0700 Subject: [PATCH 034/226] [tests] Fix invalid escape sequences --- test/test_compat.py | 2 +- test/test_postprocessors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index d6c54e135..51fe6aa0b 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -39,7 +39,7 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') - test_str = 'C:\Documents and Settings\тест\Application Data' + test_str = r'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) compat_setenv('HOME', old_home or '') diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index addb69d6f..4209d1d9a 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -14,4 +14,4 @@ from youtube_dl.postprocessor import MetadataFromTitlePP class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') From 641e86e3cf751f1050ca331b9d13152bd0e18558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 24 Nov 2018 21:47:41 +0700 Subject: [PATCH 035/226] [wistia] Add support for fast.wistia.com (closes #18287) --- youtube_dl/extractor/wistia.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 2182d6fd4..01a51275e 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/iframe/)(?P<id>[a-z0-9]+)' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -35,12 +35,15 @@ class WistiaIE(InfoExtractor): # with hls video 'url': 'wistia:807fafadvk', 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) if match: return unescapeHTML(match.group('url')) From d19600df07128c73ef7242af7e1cd8c819951aba Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Sat, 24 Nov 2018 16:14:27 +0100 Subject: [PATCH 036/226] [joj] Fix extraction (closes #18280) --- youtube_dl/extractor/joj.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index d9f8dbfd2..62b28e980 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -61,7 +61,7 @@ class JojIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates', + r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates', default='{}'), video_id, transform_source=js_to_json, fatal=False) From ca01d178844129bd4b6ed74740fbd30e7f84c1c2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 28 Nov 2018 19:53:22 +0100 Subject: [PATCH 037/226] [vimeo] Add support for VHX(Vimeo OTT)(#14835) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vimeo.py | 85 +++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 60e6175b1..cd91c0fcb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1303,6 +1303,7 @@ from .vimeo import ( VimeoReviewIE, VimeoUserIE, VimeoWatchLaterIE, + VHXEmbedIE, ) from .vimple import VimpleIE from .vine import ( diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 88f4d9979..6353c6831 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -14,10 +14,13 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + js_to_json, InAdvancePagedList, int_or_none, merge_dicts, NO_DEFAULT, + parse_filesize, + qualities, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -27,7 +30,6 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, - parse_filesize, ) @@ -1063,3 +1065,84 @@ class VimeoLikesIE(InfoExtractor): 'description': description, 'entries': pl, } + + +class VHXEmbedIE(InfoExtractor): + IE_NAME = 'vhx:embed' + _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + + def _call_api(self, video_id, access_token, path='', query=None): + return self._download_json( + 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ + 'Authorization': 'Bearer ' + access_token, + }, query=query) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + credentials = self._parse_json(self._search_regex( + r'(?s)credentials\s*:\s*({.+?}),', webpage, + 'config'), video_id, js_to_json) + access_token = credentials['access_token'] + + query = {} + for k, v in credentials.items(): + if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': + if k == 'authUserToken': + query['auth_user_token'] = v + else: + query[k] = v + files = self._call_api(video_id, access_token, '/files', query) + + formats = [] + for f in files: + href = try_get(f, lambda x: x['_links']['source']['href']) + if not href: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif method == 'dash': + formats.extend(self._extract_mpd_formats( + href, video_id, mpd_id='dash', fatal=False)) + else: + fmt = { + 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), + 'format_id': 'http', + 'preference': 1, + 'url': href, + 'vcodec': f.get('codec'), + } + quality = f.get('quality') + if quality: + fmt.update({ + 'format_id': 'http-' + quality, + 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), + }) + formats.append(fmt) + self._sort_formats(formats) + + video_data = self._call_api(video_id, access_token) + title = video_data.get('title') or video_data['name'] + + q = qualities(['small', 'medium', 'large', 'source']) + thumbnails = [] + for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': q(thumbnail_id), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), + 'formats': formats, + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(video_data.get('created_at')), + 'view_count': int_or_none(video_data.get('plays_count')), + } From d9df8f120b325766181fb474a8c534e51df78f17 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 28 Nov 2018 20:13:36 +0100 Subject: [PATCH 038/226] [vimeo] extract VHX subtitles --- youtube_dl/extractor/vimeo.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6353c6831..5e15f060b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1127,6 +1127,17 @@ class VHXEmbedIE(InfoExtractor): video_data = self._call_api(video_id, access_token) title = video_data.get('title') or video_data['name'] + subtitles = {} + for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: + lang = subtitle.get('srclang') or subtitle.get('label') + for _link in subtitle.get('_links', {}).values(): + href = _link.get('href') + if not href: + continue + subtitles.setdefault(lang, []).append({ + 'url': href, + }) + q = qualities(['small', 'medium', 'large', 'source']) thumbnails = [] for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): @@ -1142,6 +1153,7 @@ class VHXEmbedIE(InfoExtractor): 'description': video_data.get('description'), 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'timestamp': unified_timestamp(video_data.get('created_at')), 'view_count': int_or_none(video_data.get('plays_count')), From 053e5b12b2e38b7d343aafbb7dc13fb8e4933015 Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Thu, 29 Nov 2018 18:12:18 +0100 Subject: [PATCH 039/226] [azmedien] Fix extraction (closes #18334) --- youtube_dl/extractor/azmedien.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index a57a5f114..fcbdc71b9 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -36,7 +36,6 @@ class AZMedienIE(InfoExtractor): 'id': '1_anruz3wy', 'ext': 'mp4', 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', - 'description': 'md5:dd9f96751ec9c35e409a698a328402f3', 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, @@ -53,15 +52,12 @@ class AZMedienIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') video_id = mobj.group('id') entry_id = mobj.group('kaltura_id') if not entry_id: - webpage = self._download_webpage(url, video_id) - api_path = self._search_regex( - r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']', - webpage, 'api path') - api_url = 'https://www.%s%s' % (mobj.group('host'), api_path) + api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) payload = { 'query': '''query VideoContext($articleId: ID!) { article: node(id: $articleId) { From adbbdefc8126a933d9ff0a6e603fb312e4b4cbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 30 Nov 2018 00:48:15 +0700 Subject: [PATCH 040/226] [hotstar] Add support for alternative app state layout (closes #18320) --- youtube_dl/extractor/hotstar.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index bf5717f1b..45aa5e7ea 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -43,6 +43,7 @@ class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ + # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { 'id': '1000076273', @@ -57,6 +58,10 @@ class HotStarIE(HotStarBaseIE): # m3u8 download 'skip_download': True, } + }, { + # contentDetail + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, }, { 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', 'only_matching': True, @@ -74,8 +79,12 @@ class HotStarIE(HotStarBaseIE): r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', webpage, 'app state'), video_id) video_data = {} + getters = ( + lambda x, k=k: x['initialState']['content%s' % k]['content'] + for k in ('Data', 'Detail') + ) for v in app_state.values(): - content = try_get(v, lambda x: x['initialState']['contentData']['content'], dict) + content = try_get(v, getters, dict) if content and content.get('contentId') == video_id: video_data = content From 16597c2f9492651c55e01f441ee9cb5c276209cb Mon Sep 17 00:00:00 2001 From: Jimm Stout <jamesstout1@gmail.com> Date: Thu, 29 Nov 2018 13:07:07 -0500 Subject: [PATCH 041/226] [gfycat] Update API endpoint (closes #18333) --- youtube_dl/extractor/gfycat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index a0670b645..c1b36a59b 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -53,7 +53,7 @@ class GfycatIE(InfoExtractor): video_id = self._match_id(url) gfy = self._download_json( - 'http://gfycat.com/cajax/get/%s' % video_id, + 'https://api.gfycat.com/v1/gfycats/%s' % video_id, video_id, 'Downloading video info') if 'error' in gfy: raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) From f012823082c893c0fc1f96afb8a91f5b1c1ae07a Mon Sep 17 00:00:00 2001 From: Hakim Boyles <hak@volkanite.net> Date: Thu, 29 Nov 2018 14:20:27 -0400 Subject: [PATCH 042/226] [lynda] Fix authentication (closes #18158) --- youtube_dl/extractor/lynda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 4ba61cd8a..3084c6dff 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -15,7 +15,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): - _SIGNIN_URL = 'https://www.lynda.com/signin' + _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' _PASSWORD_URL = 'https://www.lynda.com/signin/password' _USER_URL = 'https://www.lynda.com/signin/user' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' From 3430ff9b07d4dc9dd39617af54aeffb381d88737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Dec 2018 16:45:51 +0700 Subject: [PATCH 043/226] [pornhub] Use actual URL host for requests (closes #18359) --- youtube_dl/extractor/pornhub.py | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index c9c884095..e377de196 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -27,7 +27,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -129,7 +129,7 @@ class PornHubIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -137,14 +137,16 @@ class PornHubIE(InfoExtractor): pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') - self._set_cookie('pornhub.com', 'age_verified', '1') + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): - self._set_cookie('pornhub.com', 'platform', platform) + self._set_cookie(host, 'platform', platform) return self._download_webpage( - 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, + 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id), video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -306,7 +308,7 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): - def _extract_entries(self, webpage): + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/rg3/youtube-dl/issues/11594). @@ -316,7 +318,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): return [ self.url_result( - 'http://www.pornhub.com/%s' % video_url, + 'http://www.%s/%s' % (host, video_url), PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', @@ -324,11 +326,13 @@ class PornHubPlaylistBaseIE(InfoExtractor): ] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) + entries = self._extract_entries(webpage, host) playlist = self._parse_json( self._search_regex( @@ -343,7 +347,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/playlist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -358,7 +362,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -399,7 +403,9 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }] def _real_extract(self, url): - user_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + user_id = mobj.group('id') entries = [] for page_num in itertools.count(1): @@ -411,7 +417,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: break raise - page_entries = self._extract_entries(webpage) + page_entries = self._extract_entries(webpage, host) if not page_entries: break entries.extend(page_entries) From aa374bc78e5e4dbb8453bd367ae0e3c0db702a7a Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Sat, 1 Dec 2018 18:05:15 +0100 Subject: [PATCH 044/226] [utils] Fix random_birthday to generate existing dates only --- youtube_dl/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e84d35d4d..0b1c7cd6c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3948,8 +3948,12 @@ def write_xattr(path, key, value): def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) return { - year_field: str(random.randint(1950, 1995)), - month_field: str(random.randint(1, 12)), - day_field: str(random.randint(1, 31)), + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), } From 1ead840d2c18d4add340117f676fd6694f0650d3 Mon Sep 17 00:00:00 2001 From: Ken Swenson <flat@esoteric.moe> Date: Fri, 9 Nov 2018 16:49:20 -0500 Subject: [PATCH 045/226] [tiktok] Add extractor (closes #18108) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tiktok.py | 79 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/tiktok.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cd91c0fcb..547331078 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1124,6 +1124,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py new file mode 100644 index 000000000..d71b09c66 --- /dev/null +++ b/youtube_dl/extractor/tiktok.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_str, + int_or_none, + str_or_none, + try_get, + url_or_none, +) + + +class TikTokIE(InfoExtractor): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://m.tiktok.com/v/6606727368545406213.html', + 'md5': 'd584b572e92fcd48888051f238022420', + 'info_dict': { + 'id': '6606727368545406213', + 'ext': 'mp4', + 'title': 'Zureeal on TikTok', + 'thumbnail': r're:^https?://.*~noop.image', + 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'uploader': 'Zureeal', + 'width': 540, + 'height': 960, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'var\s+data\s*=\s*({.+?});', webpage, 'data' + ), video_id) + + title = self._og_search_title(webpage) + + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: x['video']['width'])) + height = int_or_none(try_get(data, lambda x: x['video']['height'])) + + formats = [] + + for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): + for format in try_get(data, lambda x: x['video'][key]['url_list']): + format_url = url_or_none(format) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + 'format_note': label, + 'quality': count + }) + + self._sort_formats(formats) + + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + + thumbnail = url_or_none( + try_get( + data, lambda x: x['video']['cover']['url_list'][0], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': thumbnail, + 'width': width, + 'height': height, + } From ce18a19be9af1e227e7162636cbf6b277adc1b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Dec 2018 02:39:22 +0700 Subject: [PATCH 046/226] [tiktok] Improve extraction and add support for user pages (closes #18135) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tiktok.py | 136 ++++++++++++++++++----------- 2 files changed, 91 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 547331078..0baed6b27 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1124,7 +1124,10 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE -from .tiktok import TikTokIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, +) from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index d71b09c66..083e9f36d 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( compat_str, + ExtractorError, int_or_none, str_or_none, try_get, @@ -11,69 +12,106 @@ from ..utils import ( ) -class TikTokIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' +class TikTokBaseIE(InfoExtractor): + def _extract_aweme(self, data): + video = data['video'] + description = str_or_none(try_get(data, lambda x: x['desc'])) + width = int_or_none(try_get(data, lambda x: video['width'])) + height = int_or_none(try_get(data, lambda x: video['height'])) + + format_urls = set() + formats = [] + for format_id in ( + 'play_addr_lowbr', 'play_addr', 'play_addr_h264', + 'download_addr'): + for format in try_get( + video, lambda x: x[format_id]['url_list'], list) or []: + format_url = url_or_none(format) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + }) + self._sort_formats(formats) + + thumbnail = url_or_none(try_get( + video, lambda x: x['cover']['url_list'][0], compat_str)) + uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) + timestamp = int_or_none(data.get('create_time')) + comment_count = int_or_none(data.get('comment_count')) or int_or_none( + try_get(data, lambda x: x['statistics']['comment_count'])) + repost_count = int_or_none(try_get( + data, lambda x: x['statistics']['share_count'])) + + aweme_id = data['aweme_id'] + + return { + 'id': aweme_id, + 'title': uploader or aweme_id, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'repost_count': repost_count, + 'formats': formats, + } + + +class TikTokIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)' _TEST = { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', 'md5': 'd584b572e92fcd48888051f238022420', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', - 'title': 'Zureeal on TikTok', - 'thumbnail': r're:^https?://.*~noop.image', + 'title': 'Zureeal', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', + 'thumbnail': r're:^https?://.*~noop.image', 'uploader': 'Zureeal', - 'width': 540, - 'height': 960, + 'timestamp': 1538248586, + 'upload_date': '20180929', + 'comment_count': int, + 'repost_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) + return self._extract_aweme(data) - data = self._parse_json( - self._search_regex( - r'var\s+data\s*=\s*({.+?});', webpage, 'data' - ), video_id) - title = self._og_search_title(webpage) +class TikTokUserIE(TikTokBaseIE): + _VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)' + _TEST = { + 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'info_dict': { + 'id': '188294915489964032', + }, + 'playlist_mincount': 24, + } - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: x['video']['width'])) - height = int_or_none(try_get(data, lambda x: x['video']['height'])) - - formats = [] - - for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): - for format in try_get(data, lambda x: x['video'][key]['url_list']): - format_url = url_or_none(format) - if not format_url: - continue - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - 'format_note': label, - 'quality': count - }) - - self._sort_formats(formats) - - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - - thumbnail = url_or_none( - try_get( - data, lambda x: x['video']['cover']['url_list'][0], compat_str)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'uploader': uploader, - 'formats': formats, - 'thumbnail': thumbnail, - 'width': width, - 'height': height, - } + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._download_json( + 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, + query={'_signature': '_'}) + entries = [] + for aweme in data['aweme_list']: + try: + entry = self._extract_aweme(aweme) + except ExtractorError: + continue + entry['extractor_key'] = TikTokIE.ie_key() + entries.append(entry) + return self.playlist_result(entries, user_id) From 1fa59a928e48ac81e2afb5dfe134fa6c40f64e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Dec 2018 00:06:54 +0700 Subject: [PATCH 047/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index f82c7ea35..ddd4bec14 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + version 2018.11.23 Core From ab896fa894d2395b886e5bb8168ca0e0e6e9517d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Dec 2018 00:10:20 +0700 Subject: [PATCH 048/226] release 2018.12.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 35cc8d6d0..4b35244a8 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.11.23 +[debug] youtube-dl version 2018.12.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ddd4bec14..689d07826 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.12.03 Core * [utils] Fix random_birthday to generate existing dates only (#18284) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7d72ad82d..837b0199b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -885,6 +885,8 @@ - **ThisAmericanLife** - **ThisAV** - **ThisOldHouse** + - **TikTok** + - **TikTokUser** - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -979,6 +981,7 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **vhx:embed** - **Viafree** - **vice** - **vice:article** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4956365d0..8e1203892 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.11.23' +__version__ = '2018.12.03' From 5547014ad972a4364c4d4a613db6d3a18f25950e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 2 Dec 2018 20:01:36 +0100 Subject: [PATCH 049/226] [gamespot] add support reviews URLs --- youtube_dl/extractor/gamespot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index ab647dd41..4236a5ed8 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -41,6 +41,9 @@ class GameSpotIE(OnceIE): }, { 'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/', 'only_matching': True, + }, { + 'url': 'https://www.gamespot.com/reviews/gears-of-war-review/1900-6161188/', + 'only_matching': True, }] def _real_extract(self, url): From 8bb0c9cc16b841ab89e15e620e00ee954ab316ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 5 Dec 2018 07:03:00 +0100 Subject: [PATCH 050/226] [tbs] fix info extraction(fixes #18403) --- youtube_dl/extractor/tbs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index 784f8ed66..e8a7c65e0 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -16,7 +16,7 @@ from ..utils import ( class TBSIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' _TESTS = [{ 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', 'info_dict': { @@ -40,12 +40,12 @@ class TBSIE(TurnerBaseIE): }] def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() + site, path, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) drupal_settings = self._parse_json(self._search_regex( r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', webpage, 'drupal setting'), display_id) - video_data = drupal_settings['turner_playlist'][0] + video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path) media_id = video_data['mediaID'] title = video_data['title'] From ae9d77dab54630ac480bd4082468d543a15c341e Mon Sep 17 00:00:00 2001 From: v-delta <45652398+v-delta@users.noreply.github.com> Date: Thu, 6 Dec 2018 17:24:35 +0100 Subject: [PATCH 051/226] [yourporn] Fix extraction (closes #18424) --- youtube_dl/extractor/yourporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 6602f7c03..a9951f3b8 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -26,7 +26,7 @@ class YourPornIE(InfoExtractor): self._search_regex( r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', group='data'), - video_id)[video_id]) + video_id)[video_id]).replace('/cdn/', '/cdn2/') title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', From 33cc1ea586480ccc60fd25f2d42cfb44eec605c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:00:06 +0700 Subject: [PATCH 052/226] [nrktv] Relax _VALID_URL (closes #18304, closes #18387) --- youtube_dl/extractor/nrk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index a231735fb..c5001ef48 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -248,7 +248,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie/[^/]+|program)/ + (?:serie(?:/[^/]+){1,2}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? @@ -362,6 +362,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, }] From 15699ec8b0a9cb8d519b721a5cb8199afe62fc3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:49:24 +0700 Subject: [PATCH 053/226] [nrktv:season,series] Fix extraction and update tests (closes #17159, closes #17258) --- youtube_dl/extractor/nrk.py | 68 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index c5001ef48..48bc6fd7a 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -211,13 +211,13 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'md5': '706f34cdf1322577589e369e522b50ef', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, + 'duration': 262, } }, { # audio @@ -256,14 +256,14 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, - 'series': '20 spørsmål - TV', + 'series': '20 spørsmål', 'episode': '23.05.2014', }, }, { @@ -301,7 +301,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515AH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 772, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -314,7 +314,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515BH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 6175, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -326,7 +326,7 @@ class NRKTVIE(NRKBaseIE): 'info_dict': { 'id': 'MSPO40010515', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', }, 'expected_warnings': ['Video is geo restricted'], }, { @@ -406,21 +406,35 @@ class NRKTVSerieBaseIE(InfoExtractor): def _extract_series(self, webpage, display_id, fatal=True): config = self._parse_json( self._search_regex( - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config', - default='{}' if not fatal else NO_DEFAULT), + (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False) if not config: return - return try_get(config, lambda x: x['series'], dict) + return try_get( + config, + (lambda x: x['initialState']['series'], lambda x: x['series']), + dict) + + def _extract_seasons(self, seasons): + if not isinstance(seasons, list): + return [] + entries = [] + for season in seasons: + entries.extend(self._extract_episodes(season)) + return entries def _extract_episodes(self, season): - entries = [] if not isinstance(season, dict): - return entries - episodes = season.get('episodes') - if not isinstance(episodes, list): - return entries - for episode in episodes: + return [] + return self._extract_entries(season.get('episodes')) + + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: nrk_id = episode.get('prfId') if not nrk_id or not isinstance(nrk_id, compat_str): continue @@ -465,7 +479,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ - # new layout + # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { 'id': 'backstage', @@ -474,20 +488,21 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, 'playlist_mincount': 60, }, { - # old layout + # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { 'id': 'groenn-glede', 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 9, + 'playlist_mincount': 10, }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', + 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', }, 'playlist_mincount': 3, }, { @@ -520,11 +535,11 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) entries = [] - for season in series['seasons']: - entries.extend(self._extract_episodes(season)) + entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_entries(series.get('instalments'))) return self.playlist_result(entries, series_id, title, description) - # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) entries = [ self.url_result( 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( @@ -536,6 +551,9 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'seriestitle', webpage, 'title', default=None) or self._og_search_title( webpage, fatal=False) + if title: + title = self._search_regex( + r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) description = self._html_search_meta( 'series_description', webpage, @@ -596,7 +614,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE): 'title': 'Rivertonprisen til Karin Fossum', 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', }, - 'playlist_count': 5, + 'playlist_count': 2, }] def _extract_title(self, webpage): From c976873c5b2912c06ff53e5193640ee8627edee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 00:54:58 +0700 Subject: [PATCH 054/226] [nrktv:series] Add support for extra materials --- youtube_dl/extractor/nrk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 48bc6fd7a..072f920a9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -537,6 +537,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): entries = [] entries.extend(self._extract_seasons(series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) + entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) From dfe0a3a9d2e07ac1e5ad221912d03b999ebb4d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Dec 2018 03:27:11 +0700 Subject: [PATCH 055/226] [lecturio] Add extractor (closes #18405) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/lecturio.py | 186 +++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 youtube_dl/extractor/lecturio.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0baed6b27..e5f18a75d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -553,6 +553,10 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, +) from .leeco import ( LeIE, LePlaylistIE, diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py new file mode 100644 index 000000000..62ff28e02 --- /dev/null +++ b/youtube_dl/extractor/lecturio.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in compat_str(url_handle.geturl()) + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms – Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + } + + _CC_LANGS = { + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'French': 'fr', + 'Polish': 'pl', + 'Russian': 'ru', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, + display_id) + + lecture_id = self._search_regex( + r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id') + + api_url = self._search_regex( + r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'api url', group='url') + + video = self._download_json(api_url, display_id) + + title = video['title'].strip() + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + formats.append({ + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + }) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + cc = self._parse_json( + self._search_regex( + r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles', + default='{}'), display_id, fatal=False) + for cc_label, cc_url in cc.items(): + cc_url = url_or_none(cc_url) + if not cc_url: + continue + sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles + lang = self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', default=cc_label.split()[0]) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>', + webpage): + params = extract_attributes(mobj.group(0)) + lecture_url = urljoin(url, params.get('data-url')) + lecture_id = params.get('data-id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage, + 'title', default=None) + + return self.playlist_result(entries, display_id, title) From ebb0449049c198f04103502c95a13171b854d1c7 Mon Sep 17 00:00:00 2001 From: ealgase <mostdigitsofpi@gmail.com> Date: Thu, 6 Dec 2018 15:36:08 -0500 Subject: [PATCH 056/226] [xvideos] Switch to HTTPS (closes #18422) --- youtube_dl/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index efee95651..ec2d913fc 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -45,7 +45,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: From 8c5879715f4d979b83c49d44a9094307247097ba Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Thu, 6 Dec 2018 21:41:02 +0100 Subject: [PATCH 057/226] [ard:mediathek] Fix title and description extraction (closes #18349) --- youtube_dl/extractor/ard.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61eb..84e96f769 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -173,13 +173,18 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms\.title" content="(.*?)"/>', - r'<h4 class="headline">(.*?)</h4>'], + r'<h4 class="headline">(.*?)</h4>', + r'<title[^>]*>(.*?)'], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( - 'description', webpage, 'meta description') + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

', + webpage, 'teaser text', default=None) # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL From c3c098dcf2826ab4d668a92c9137cca2c0c42a4f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 7 Dec 2018 18:52:01 +0100 Subject: [PATCH 058/226] [hotstar] fix video data extraction(closes #18386) --- youtube_dl/extractor/hotstar.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 45aa5e7ea..8de9c4faf 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -79,7 +79,7 @@ class HotStarIE(HotStarBaseIE): r'', webpage, 'app state'), video_id) video_data = {} - getters = ( + getters = list( lambda x, k=k: x['initialState']['content%s' % k]['content'] for k in ('Data', 'Detail') ) @@ -87,6 +87,7 @@ class HotStarIE(HotStarBaseIE): content = try_get(v, getters, dict) if content and content.get('contentId') == video_id: video_data = content + break title = video_data['title'] From 9235b5091cedcc21c8dc32d4b292340edeee4ed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 8 Dec 2018 23:57:40 +0700 Subject: [PATCH 059/226] [iprima] Relax _VALID_URL (closes #18453) --- youtube_dl/extractor/iprima.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 1d58d6e85..11a6629d2 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P[^?#]+)' + _VALID_URL = r'https?://(?:play|prima|www)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -41,6 +41,9 @@ class IPrimaIE(InfoExtractor): # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, + }, { + 'url': 'http://www.iprima.cz/filmy/desne-rande', + 'only_matching': True, }] def _real_extract(self, url): From 1d88b3e6e6e59e4b52305faf6c1bf1fd69c555ee Mon Sep 17 00:00:00 2001 From: aegamesi Date: Sat, 29 Apr 2017 22:56:33 -0700 Subject: [PATCH 060/226] [YoutubeDL] Recognize expires=0 as session cookies and send session cookies with requests --- youtube_dl/YoutubeDL.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 38ba43a97..2433f74f4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2300,7 +2300,13 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() + self.cookiejar.load(ignore_discard=True, ignore_expires=True) + # Force CookieJar to treat 'expires=0' cookies as session/discard cookies + # Fixes https://bugs.python.org/issue17164 + for cookie in self.cookiejar: + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: From 1bab3437046646da4ebe2b8e0c7fdc25aa1072ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 06:00:32 +0700 Subject: [PATCH 061/226] [YoutubeDL] Introduce YoutubeDLCookieJar and clarify the rationale behind session cookies (closes #12929) --- youtube_dl/YoutubeDL.py | 12 +++--------- youtube_dl/utils.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2433f74f4..4493fd0e1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -88,6 +88,7 @@ from .utils import ( version_tuple, write_json_file, write_string, + YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, ) @@ -558,7 +559,7 @@ class YoutubeDL(object): self.restore_console_title() if self.params.get('cookiefile') is not None: - self.cookiejar.save() + self.cookiejar.save(ignore_discard=True, ignore_expires=True) def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -2297,16 +2298,9 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.CookieJar() else: opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load(ignore_discard=True, ignore_expires=True) - # Force CookieJar to treat 'expires=0' cookies as session/discard cookies - # Fixes https://bugs.python.org/issue17164 - for cookie in self.cookiejar: - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0b1c7cd6c..62e769fd5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_HTMLParser, compat_basestring, compat_chr, + compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, @@ -1139,6 +1140,33 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def __init__(self, cookiejar=None): compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) From 5f47a60c5d66d65b505131a672a3bad67ddaa00f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 9 Dec 2018 09:35:17 +0100 Subject: [PATCH 062/226] [imgur] improve gallery and album detection and extraction(closes #9133)(closes #16577)(closes #17223)(closes #18404) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/imgur.py | 86 +++++++++++++++--------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5f18a75d..e28db6968 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -483,6 +483,7 @@ from .imdb import ( from .imgur import ( ImgurIE, ImgurAlbumIE, + ImgurGalleryIE, ) from .ina import InaIE from .inc import IncIE diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index ecc958a17..0eb54db3f 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -20,28 +20,9 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', - 'info_dict': { - 'id': 'A61SaA1', - 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', - }, - }, { - 'url': 'https://imgur.com/gallery/YcAQlkx', - 'info_dict': { - 'id': 'YcAQlkx', - 'ext': 'mp4', - 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } - }, { - 'url': 'http://imgur.com/topic/Funny/N8rOudd', - 'only_matching': True, - }, { - 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }, { 'url': 'https://i.imgur.com/crGpqCV.mp4', @@ -50,8 +31,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) - webpage = self._download_webpage(gifv_url, video_id) + webpage = self._download_webpage( + 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -72,7 +53,6 @@ class ImgurIE(InfoExtractor): 'format_id': m.group('type').partition('/')[2], 'url': self._proto_relative_url(m.group('src')), 'ext': mimetype2ext(m.group('type')), - 'acodec': 'none', 'width': width, 'height': height, 'http_headers': { @@ -107,44 +87,64 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } -class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{5})(?:[/?#&]+)?$' +class ImgurGalleryIE(InfoExtractor): + IE_NAME = 'imgur:gallery' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', + 'title': 'Adding faces make every GIF better', }, 'playlist_count': 25, }, { - 'url': 'http://imgur.com/a/j6Orj', + 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, }, { - 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }] def _real_extract(self, url): - album_id = self._match_id(url) + gallery_id = self._match_id(url) - album_images = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id, fatal=False) + data = self._download_json( + 'https://imgur.com/gallery/%s.json' % gallery_id, + gallery_id)['data']['image'] - if album_images: - data = album_images.get('data') - if data and isinstance(data, dict): - images = data.get('images') - if images and isinstance(images, list): - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in images if image.get('hash')] - return self.playlist_result(entries, album_id) + if data.get('is_album'): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) + for image in data['album_images']['images'] if image.get('hash')] + return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - # Fallback to single video - return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) + return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) + + +class ImgurAlbumIE(ImgurGalleryIE): + IE_NAME = 'imgur:album' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/a/j6Orj', + 'info_dict': { + 'id': 'j6Orj', + 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + }, + 'playlist_count': 12, + }] From 3ad6dabd33307c0125fd462c4988083e360c40ad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 9 Dec 2018 10:04:00 +0100 Subject: [PATCH 063/226] [aenetworks] add support for History Vault(closes #18460) --- youtube_dl/extractor/aenetworks.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 398e56ea3..85ec6392d 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -22,18 +22,19 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' - IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' _VALID_URL = r'''(?x) https?:// (?:www\.)? (?P - (?:history|aetv|mylifetime|lifetimemovieclub)\.com| + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/ (?: shows/(?P[^/]+(?:/[^/]+){0,2})| movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/full-special + specials/(?P[^/]+)/full-special| + collections/[^/]+/(?P[^/]+) ) ''' _TESTS = [{ @@ -80,6 +81,9 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', @@ -90,9 +94,9 @@ class AENetworksIE(AENetworksBaseIE): } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id - webpage = self._download_webpage(url, display_id) + domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id or collection_display_id + webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) if show_path: url_parts = show_path.split('/') url_parts_len = len(url_parts) From 5ee7ae5c7577c9c137a7b38edd5ad01ae3a40376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 22:28:24 +0700 Subject: [PATCH 064/226] [teachable] Add support for teachable based platform sites (closes #5451, closes #18150, closes #18272) --- youtube_dl/extractor/extractors.py | 8 +- youtube_dl/extractor/generic.py | 5 + .../extractor/{upskill.py => teachable.py} | 129 ++++++++++++++---- 3 files changed, 115 insertions(+), 27 deletions(-) rename youtube_dl/extractor/{upskill.py => teachable.py} (52%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e28db6968..6a5d12ab1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1091,6 +1091,10 @@ from .tass import TassIE from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -1240,10 +1244,6 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) -from .upskill import ( - UpskillIE, - UpskillCourseIE, -) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 59cf03faf..65b482333 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,7 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .teachable import TeachableIE from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE @@ -3112,6 +3113,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/teachable.py similarity index 52% rename from youtube_dl/extractor/upskill.py rename to youtube_dl/extractor/teachable.py index 30297b4dd..a0c46b2e6 100644 --- a/youtube_dl/extractor/upskill.py +++ b/youtube_dl/extractor/teachable.py @@ -14,20 +14,38 @@ from ..utils import ( ) -class UpskillBaseIE(InfoExtractor): - _LOGIN_URL = 'http://upskillcourses.com/sign_in' - _NETRC_MACHINE = 'upskill' +class TeachableBaseIE(InfoExtractor): + _NETRC_MACHINE = 'teachable' + _URL_PREFIX = 'teachable:' + + _SITES = { + # Only notable ones here + 'upskillcourses.com': 'upskill', + 'academy.gns3.com': 'gns3', + 'academyhacker.com': 'academyhacker', + 'stackskills.com': 'stackskills', + 'market.saleshacker.com': 'saleshacker', + 'learnability.org': 'learnability', + 'edurila.com': 'edurila', + } + + _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) def _real_initialize(self): - self._login() + self._logged_in = False - def _login(self): - username, password = self._get_login_info() + def _login(self, site): + if self._logged_in: + return + + username, password = self._get_login_info( + netrc_machine=self._SITES.get(site, site)) if username is None: return login_page, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login page') + 'https://%s/sign_in' % site, None, + 'Downloading %s login page' % site) login_url = compat_str(urlh.geturl()) @@ -46,18 +64,24 @@ class UpskillBaseIE(InfoExtractor): post_url = urljoin(login_url, post_url) response = self._download_webpage( - post_url, None, 'Logging in', + post_url, None, 'Logging in to %s' % site, data=urlencode_postdata(login_form), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': login_url, }) + if '>I accept the new Privacy Policy<' in response: + raise ExtractorError( + 'Unable to login: %s asks you to accept new Privacy Policy. ' + 'Go to https://%s/ and accept.' % (site, site), expected=True) + # Successful login if any(re.search(p, response) for p in ( r'class=["\']user-signout', r']+\bhref=["\']/sign_out', r'>\s*Log out\s*<')): + self._logged_in = True return message = get_element_by_class('alert', response) @@ -68,8 +92,14 @@ class UpskillBaseIE(InfoExtractor): raise ExtractorError('Unable to log in') -class UpskillIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P\d+)' +class TeachableIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P[^/]+)| + https?://(?:www\.)?(?P%s) + ) + /courses/[^/]+/lectures/(?P\d+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', @@ -77,7 +107,7 @@ class UpskillIE(UpskillBaseIE): 'id': 'uzw6zw58or', 'ext': 'mp4', 'title': 'Welcome to the Course!', - 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'description': 'md5:65edb0affa582974de4625b9cdea1107', 'duration': 138.763, 'timestamp': 1479846621, 'upload_date': '20161122', @@ -88,10 +118,38 @@ class UpskillIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'only_matching': True, + }, { + 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'only_matching': True, }] + @staticmethod + def _is_teachable(webpage): + return 'teachableTracker.linker:autoLink' in webpage and re.search( + r']+href=["\']https?://process\.fs\.teachablecdn\.com', + webpage) + + @staticmethod + def _extract_url(webpage, source_url): + if not TeachableIE._is_teachable(webpage): + print('NOT TEACHABLE') + return + if re.match(r'https?://[^/]+/(?:courses|p)', source_url): + return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + video_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + url = url[len(self._URL_PREFIX):] webpage = self._download_webpage(url, video_id) @@ -113,12 +171,18 @@ class UpskillIE(UpskillBaseIE): } -class UpskillCourseIE(UpskillBaseIE): - _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P[^/?#&]+)' +class TeachableCourseIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P[^/]+)| + https?://(?:www\.)?(?P%s) + ) + /(?:courses|p)/(?:enrolled/)?(?P[^/?#&]+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { - 'id': '119763', + 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, @@ -128,21 +192,37 @@ class UpskillCourseIE(UpskillBaseIE): }, { 'url': 'http://upskillcourses.com/courses/enrolled/119763', 'only_matching': True, + }, { + 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'only_matching': True, + }, { + 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', + 'only_matching': True, + }, { + 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if UpskillIE.suitable(url) else super( - UpskillCourseIE, cls).suitable(url) + return False if TeachableIE.suitable(url) else super( + TeachableCourseIE, cls).suitable(url) def _real_extract(self, url): - course_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') or mobj.group('site_t') + course_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + prefix = self._URL_PREFIX + url = url[len(prefix):] webpage = self._download_webpage(url, course_id) - course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id', - default=course_id) + url_base = 'https://%s/' % site entries = [] @@ -162,10 +242,13 @@ class UpskillCourseIE(UpskillBaseIE): title = self._html_search_regex( r']+class=["\']lecture-name[^>]+>([^<]+)', li, 'title', default=None) + entry_url = urljoin(url_base, lecture_url) + if prefixed: + entry_url = self._URL_PREFIX + entry_url entries.append( self.url_result( - urljoin('http://upskillcourses.com/', lecture_url), - ie=UpskillIE.ie_key(), video_id=lecture_id, + entry_url, + ie=TeachableIE.ie_key(), video_id=lecture_id, video_title=clean_html(title))) course_title = self._html_search_regex( From 9e02c2c704d15d56b92ebe2fc8d481f99f5d106d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 22:56:33 +0700 Subject: [PATCH 065/226] [YoutubeDLCookieJar] Add test for keeping session cookies --- test/test_YoutubeDLCookieJar.py | 34 +++++++++++++++++++++++ test/testdata/cookies/session_cookies.txt | 6 ++++ 2 files changed, 40 insertions(+) create mode 100644 test/test_YoutubeDLCookieJar.py create mode 100644 test/testdata/cookies/session_cookies.txt diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py new file mode 100644 index 000000000..6a8243590 --- /dev/null +++ b/test/test_YoutubeDLCookieJar.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys +import tempfile +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import YoutubeDLCookieJar + + +class TestYoutubeDLCookieJar(unittest.TestCase): + def test_keep_session_cookies(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + tf = tempfile.NamedTemporaryFile(delete=False) + try: + cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True) + temp = tf.read().decode('utf-8') + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp)) + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpires0\s+YoutubeDLExpires0Value', temp)) + finally: + tf.close() + os.remove(tf.name) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt new file mode 100644 index 000000000..91e5c9231 --- /dev/null +++ b/test/testdata/cookies/session_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue From 24cc64254c03c56229654b13d833c600d7048bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 23:08:16 +0700 Subject: [PATCH 066/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 689d07826..6afee3a28 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com (#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + version 2018.12.03 Core From cefe42c412168e95b78dcefc5cdba608dba7dd02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Dec 2018 23:11:32 +0700 Subject: [PATCH 067/226] release 2018.12.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 11 +++++++---- youtube_dl/version.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4b35244a8..1ccf410a7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.03 +[debug] youtube-dl version 2018.12.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 6afee3a28..9aef67b52 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.12.09 Core * [YoutubeDL] Keep session cookies in cookie file between runs diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 837b0199b..31d20f255 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -33,7 +33,7 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network + - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -376,7 +376,8 @@ - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** - - **ImgurAlbum** + - **imgur:album** + - **imgur:gallery** - **Ina** - **Inc** - **IndavideoEmbed** @@ -435,6 +436,8 @@ - **Le**: 乐视网 - **Learnr** - **Lecture2Go** + - **Lecturio** + - **LecturioCourse** - **LEGO** - **Lemonde** - **Lenta** @@ -853,6 +856,8 @@ - **TastyTrade** - **TBS** - **TDSLifeway** + - **Teachable** + - **TeachableCourse** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** @@ -961,8 +966,6 @@ - **uol.com.br** - **uplynk** - **uplynk:preplay** - - **Upskill** - - **UpskillCourse** - **Urort**: NRK P3 Urørt - **URPlay** - **USANetwork** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8e1203892..d15b9583f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.03' +__version__ = '2018.12.09' From 59c3940165efaee6705ff06ef4fc4ac2c701107d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 01:37:10 +0700 Subject: [PATCH 068/226] [ard:mediathek] Add support for classic.ardmediathek.de (closes #18473) --- youtube_dl/extractor/ard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 84e96f769..2e1536a1b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -21,7 +21,7 @@ from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ # available till 26.07.2022 @@ -51,6 +51,9 @@ class ARDMediathekIE(InfoExtractor): # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): From 6e29458f2479c4535f8605032b9492f52bb37f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 04:30:00 +0700 Subject: [PATCH 069/226] [test/testdata/cookies/session_cookies.txt] Fix empty expires test data --- test/testdata/cookies/session_cookies.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt index 91e5c9231..f6996f031 100644 --- a/test/testdata/cookies/session_cookies.txt +++ b/test/testdata/cookies/session_cookies.txt @@ -2,5 +2,5 @@ # http://curl.haxx.se/rfc/cookie_spec.html # This is a generated file! Do not edit. +www.foobar.foobar FALSE / TRUE YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value -www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue From 102a4e54c5c0819233773cc15398bc901a218f4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Dec 2018 10:10:28 +0700 Subject: [PATCH 070/226] [teachable] Remove debug output --- youtube_dl/extractor/teachable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index a0c46b2e6..47ac95ee8 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -135,7 +135,6 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _extract_url(webpage, source_url): if not TeachableIE._is_teachable(webpage): - print('NOT TEACHABLE') return if re.match(r'https?://[^/]+/(?:courses|p)', source_url): return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) From 13e17cd28e7f9e3bd8be4fa7b073a5cb96f5959f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 10 Dec 2018 14:59:57 +0100 Subject: [PATCH 071/226] [uol] fix format url extraction(closes 18480) --- youtube_dl/extractor/uol.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index e67083004..08f0c072e 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -61,7 +61,7 @@ class UOLIE(InfoExtractor): 'height': 360, }, '5': { - 'width': 1080, + 'width': 1280, 'height': 720, }, '6': { @@ -80,6 +80,10 @@ class UOLIE(InfoExtractor): 'width': 568, 'height': 320, }, + '11': { + 'width': 640, + 'height': 360, + } } def _real_extract(self, url): @@ -111,19 +115,31 @@ class UOLIE(InfoExtractor): 'ver': video_data.get('numRevision', 2), 'r': 'http://mais.uol.com.br', } + for k in ('token', 'sign'): + v = video_data.get(k) + if v: + query[k] = v + formats = [] for f in video_data.get('formats', []): f_url = f.get('url') or f.get('secureUrl') if not f_url: continue + f_url = update_url_query(f_url, query) format_id = str_or_none(f.get('id')) + if format_id == '10': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + continue fmt = { 'format_id': format_id, - 'url': update_url_query(f_url, query), + 'url': f_url, + 'source_preference': 1, } fmt.update(self._FORMATS.get(format_id, {})) formats.append(fmt) - self._sort_formats(formats) + self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext')) tags = [] for tag in video_data.get('tags', []): From 0a05cfabb6338be750474a7286ce0d72a4d7c142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Dec 2018 23:45:02 +0700 Subject: [PATCH 072/226] [lecturio] Improve subtitles extraction (closes #18488) --- youtube_dl/extractor/lecturio.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 62ff28e02..0f1265cdf 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -136,9 +136,15 @@ class LecturioIE(LecturioBaseIE): cc_url = url_or_none(cc_url) if not cc_url: continue - sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles lang = self._search_regex( - r'/([a-z]{2})_', cc_url, 'lang', default=cc_label.split()[0]) + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 'auto-translated' in cc_label or original_lang + else subtitles) sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ 'url': cc_url, }) From 8fe104947d6b772ccacd0b8af51d1adb16865e99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Dec 2018 22:25:12 +0700 Subject: [PATCH 073/226] [youtube] Fix multifeed extraction (closes #18531) --- youtube_dl/extractor/youtube.py | 52 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3f49f3889..c582ab2ff 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1712,30 +1712,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): - entries = [] - feed_ids = [] - multifeed_metadata_list = video_info['multifeed_metadata_list'][0] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + multifeed_metadata_list = try_get( + player_response, + lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], + compat_str) or try_get( + video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if view_count is None: view_count = extract_view_count(video_info) From 7f41a598b3fba1bcab2817de64a08941200aa3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Dec 2018 23:08:14 +0700 Subject: [PATCH 074/226] [safari] Add support for learning.oreilly.com (closes #18510) --- youtube_dl/extractor/safari.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 30e2a38b4..c0d32a1b9 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -15,10 +15,10 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' _NETRC_MACHINE = 'safari' - _API_BASE = 'https://www.safaribooksonline.com/api/v1' + _API_BASE = 'https://learning.oreilly.com/api/v1' _API_FORMAT = 'json' LOGGED_IN = False @@ -76,7 +76,7 @@ class SafariIE(SafariBaseIE): IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x) https?:// - (?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) @@ -104,6 +104,9 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, }] _PARTNER_ID = '1926081' @@ -160,7 +163,7 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', @@ -185,7 +188,7 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/ + (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ (?: library/view/[^/]+| api/v1/book| @@ -213,6 +216,9 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, }] @classmethod From c984196cf1c9eb34725091e07a4f094b7eea1d4f Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sat, 15 Dec 2018 23:59:17 +0700 Subject: [PATCH 075/226] [README.md] Bind info dict URLs to a fixed blob (closes #18492) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 35c3de512..a1d2904c0 100644 --- a/README.md +++ b/README.md @@ -1024,7 +1024,7 @@ After you have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: @@ -1045,7 +1045,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -1053,7 +1053,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example From 21c340b83fb41094ef59b87d52c4eb1d90d1df04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 Dec 2018 19:35:48 +0700 Subject: [PATCH 076/226] [youtube] Fix mark watched (closes #18546) --- youtube_dl/extractor/youtube.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c582ab2ff..44c25c11c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -48,6 +48,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + url_or_none, urlencode_postdata, ) @@ -1386,8 +1387,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _mark_watched(self, video_id, video_info): - playback_url = video_info.get('videostats_playback_base_url', [None])[0] + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -2122,7 +2126,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) - self.mark_watched(video_id, video_info) + self.mark_watched(video_id, video_info, player_response) return { 'id': video_id, From c8b37510868b20b0829583557899d70db2ea6243 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 16 Dec 2018 14:28:40 +0100 Subject: [PATCH 077/226] [vrv] fix initial state extraction --- youtube_dl/extractor/vrv.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index ac0819c7c..483a3be3a 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -120,8 +120,10 @@ class VRVIE(VRVBaseIE): url, video_id, headers=self.geo_verification_headers()) media_resource = self._parse_json(self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})', - webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} + [ + r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:|;)', + r'window\.__INITIAL_STATE__\s*=\s*({.+})' + ], webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} video_data = media_resource.get('json') if not video_data: From 90046d77616f8f24e3716c64443118a9dcbec996 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Sun, 16 Dec 2018 17:10:36 -0400 Subject: [PATCH 078/226] [iprima] Relax _VALID_URL (closes #18515) --- youtube_dl/extractor/iprima.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 11a6629d2..11bbeb592 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:play|prima|www)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -44,6 +44,21 @@ class IPrimaIE(InfoExtractor): }, { 'url': 'http://www.iprima.cz/filmy/desne-rande', 'only_matching': True, + }, { + 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', + 'only_matching': True, + }, { + 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', + 'only_matching': True, + }, { + 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', + 'only_matching': True, + }, { + 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', + 'only_matching': True, + }, { + 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', + 'only_matching': True, }] def _real_extract(self, url): From 252e172dea96b90191682afac837535de4d33107 Mon Sep 17 00:00:00 2001 From: Tim Landscheidt Date: Sun, 16 Dec 2018 21:29:12 +0000 Subject: [PATCH 079/226] [acast] Add support for embed.acast.com --- youtube_dl/extractor/acast.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6d846ea7a..1fbff705d 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -17,25 +17,8 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' + _VALID_URL = r'https?://(?:(?:embed|www)\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' _TESTS = [{ - # test with one bling - 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', - 'md5': 'ada3de5a1e3a2a381327d749854788bb', - 'info_dict': { - 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', - 'ext': 'mp3', - 'title': '"Where Are You?": Taipei 101, Taiwan', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', - 'timestamp': 1196172000, - 'upload_date': '20071127', - 'duration': 211, - 'creator': 'Concierge', - 'series': 'Condé Nast Traveler Podcast', - 'episode': '"Where Are You?": Taipei 101, Taiwan', - } - }, { - # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { @@ -50,6 +33,9 @@ class ACastIE(InfoExtractor): 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, }] def _real_extract(self, url): From 50a498a68e2f6754f5b26e5ad1f6eabfd9aeeb14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 04:32:59 +0700 Subject: [PATCH 080/226] [acast] Extend _VALID_URL --- youtube_dl/extractor/acast.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 1fbff705d..b32f74a37 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -17,7 +17,14 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:(?:embed|www)\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P[^/]+)/(?P[^/#?]+) + ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', @@ -36,6 +43,9 @@ class ACastIE(InfoExtractor): }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'only_matching': True, }] def _real_extract(self, url): From 1c82122741783adf6653df25fa81ef0f95a22279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 04:51:57 +0700 Subject: [PATCH 081/226] [ard:beta] Relax _VALID_URL (closes #18441) --- youtube_dl/extractor/ard.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2e1536a1b..a5df7f0f9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -56,6 +56,10 @@ class ARDMediathekIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') @@ -296,7 +300,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P[a-zA-Z0-9]+)/(?P[^/?#]+)' + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', @@ -310,12 +314,18 @@ class ARDBetaMediathekIE(InfoExtractor): 'upload_date': '20180826', 'ext': 'mp4', }, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') From 752582183a1942b12880139039137b3a60962611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:29:59 +0700 Subject: [PATCH 082/226] [ard:beta] Improve extraction robustness, fix subtitles extraction, improve geo restricted videos extraction --- youtube_dl/extractor/ard.py | 58 +++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index a5df7f0f9..8adae4644 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,13 +8,16 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - qualities, int_or_none, parse_duration, + qualities, + str_or_none, + try_get, unified_strdate, - xpath_text, + unified_timestamp, update_url_query, url_or_none, + xpath_text, ) from ..compat import compat_etree_fromstring @@ -336,43 +339,62 @@ class ARDBetaMediathekIE(InfoExtractor): 'display_id': display_id, } formats = [] + subtitles = {} + geoblocked = False for widget in data.values(): - if widget.get('_geoblocked'): - raise ExtractorError('This video is not available due to geoblocking', expected=True) - + if widget.get('_geoblocked') is True: + geoblocked = True if '_duration' in widget: - res['duration'] = widget['_duration'] + res['duration'] = int_or_none(widget['_duration']) if 'clipTitle' in widget: res['title'] = widget['clipTitle'] if '_previewImage' in widget: res['thumbnail'] = widget['_previewImage'] if 'broadcastedOn' in widget: - res['upload_date'] = unified_strdate(widget['broadcastedOn']) + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) if 'synopsis' in widget: res['description'] = widget['synopsis'] - if '_subtitleUrl' in widget: - res['subtitles'] = {'de': [{ + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ 'ext': 'ttml', - 'url': widget['_subtitleUrl'], - }]} + 'url': subtitle_url, + }) if '_quality' in widget: - format_url = widget['_stream']['json'][0] - - if format_url.endswith('.f4m'): + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.11.0', video_id, f4m_id='hds', fatal=False)) - elif format_url.endswith('m3u8'): + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) formats.append({ - 'format_id': 'http-' + widget['_quality'], + 'format_id': ('http-' + quality) if quality else 'http', 'url': format_url, 'preference': 10, # Plain HTTP, that's nice }) + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + self._sort_formats(formats) - res['formats'] = formats + res.update({ + 'subtitles': subtitles, + 'formats': formats, + }) return res From cbb3e4b14f59fe2dba5983f7e55308eba2ea025b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:34:55 +0700 Subject: [PATCH 083/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9aef67b52..291ae42bb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* [lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + version 2018.12.09 Core From 4cee62ade0d991eedb2feae927c44370be3c389e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 05:37:50 +0700 Subject: [PATCH 084/226] release 2018.12.17 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1ccf410a7..b84559932 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.17** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.09 +[debug] youtube-dl version 2018.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bbcb78808..cba87190d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -152,7 +152,7 @@ After you have ensured this site is distributing its content legally, you can fo ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: @@ -173,7 +173,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -181,7 +181,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example diff --git a/ChangeLog b/ChangeLog index 291ae42bb..510013e89 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.12.17 Extractors * [ard:beta] Improve geo restricted videos extraction diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d15b9583f..0461aac0c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.09' +__version__ = '2018.12.17' From 7216e9bff71b4c537bb8d56386d789bb83f921f9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 17 Dec 2018 16:34:51 +0100 Subject: [PATCH 085/226] [discovery] Add support for Scripps Networks watch domains(closes #17947) --- youtube_dl/extractor/discovery.py | 35 +++++++++++++++++-------- youtube_dl/extractor/scrippsnetworks.py | 29 +++++--------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 3589bd428..44fbc41bb 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -17,16 +17,29 @@ from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?P - discovery| - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc| - velocity + _VALID_URL = r'''(?x)https?:// + (?P + (?:www\.)? + (?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocity + )| + watch\. + (?: + hgtv| + foodnetwork| + travelchannel| + diynetwork| + cookingchanneltv| + motortrend + ) )\.com(?P/tv-shows/[^/]+/(?:video|full-episode)s/(?P[^./?#]+))''' _TESTS = [{ 'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley', @@ -71,7 +84,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): if not access_token: access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ + 'https://%s.com/anonymous' % site, display_id, query={ 'authRel': 'authorization', 'client_id': try_get( react_data, lambda x: x['application']['apiClientId'], diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 4023aeef8..8b3275735 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -19,7 +19,7 @@ class ScrippsNetworksWatchIE(AWSIE): _VALID_URL = r'''(?x) https?:// watch\. - (?Phgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv|geniuskitchen)\.com/ + (?Pgeniuskitchen)\.com/ (?: player\.[A-Z0-9]+\.html\#| show/(?:[^/]+/){2}| @@ -28,38 +28,23 @@ class ScrippsNetworksWatchIE(AWSIE): (?P\d+) ''' _TESTS = [{ - 'url': 'http://watch.hgtv.com/show/HGTVE/Best-Ever-Treehouses/2241515/Best-Ever-Treehouses/', - 'md5': '26545fd676d939954c6808274bdb905a', + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', 'info_dict': { - 'id': '4173834', + 'id': '4194875', 'ext': 'mp4', - 'title': 'Best Ever Treehouses', - 'description': "We're searching for the most over the top treehouses.", + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', 'uploader': 'ANV', - 'upload_date': '20170922', - 'timestamp': 1506056400, + 'upload_date': '20171011', + 'timestamp': 1507698000, }, 'params': { 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], - }, { - 'url': 'http://watch.diynetwork.com/show/DSAL/Salvage-Dawgs/2656646/Covington-Church/', - 'only_matching': True, - }, { - 'url': 'http://watch.diynetwork.com/player.HNT.html#2656646', - 'only_matching': True, - }, { - 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', - 'only_matching': True, }] _SNI_TABLE = { - 'hgtv': 'hgtv', - 'diynetwork': 'diy', - 'foodnetwork': 'food', - 'cookingchanneltv': 'cook', - 'travelchannel': 'trav', 'geniuskitchen': 'genius', } From 386d1fea79e861b11c77a7353c07eb162fbb4dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Dec 2018 23:43:12 +0700 Subject: [PATCH 086/226] [lecturio] Add support for lecturio.de (closes #18562) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/lecturio.py | 45 +++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a5d12ab1..d72f52e36 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioIE, LecturioCourseIE, + LecturioDeCourseIE, ) from .leeco import ( LeIE, diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 0f1265cdf..24f78d928 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -64,8 +64,14 @@ class LecturioBaseIE(InfoExtractor): class LecturioIE(LecturioBaseIE): - _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P[^/?#&]+)\.lecture' - _TEST = { + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/[^/]+/(?P[^/?#&]+)\.lecture| + (?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', 'info_dict': { @@ -74,7 +80,10 @@ class LecturioIE(LecturioBaseIE): 'title': 'Important Concepts and Terms – Introduction to Microbiology', }, 'skip': 'Requires lecturio account credentials', - } + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }] _CC_LANGS = { 'German': 'de', @@ -86,7 +95,8 @@ class LecturioIE(LecturioBaseIE): } def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('id_de') webpage = self._download_webpage( 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, @@ -190,3 +200,30 @@ class LecturioCourseIE(LecturioBaseIE): 'title', default=None) return self.playlist_result(entries, display_id, title) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)]+\bdata-lecture-id=["\'](?P\d+).+?\bhref=(["\'])(?P(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r']*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title) From cfd13c4c45f63c408a3e53ac31561c286c9a3acc Mon Sep 17 00:00:00 2001 From: Daan van Vugt Date: Mon, 17 Dec 2018 18:03:00 +0100 Subject: [PATCH 087/226] [mediasite] Relax _VALID_URL --- youtube_dl/extractor/mediasite.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 84876b883..de37df0c6 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -21,7 +21,7 @@ from ..utils import ( class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/livebroadcast/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -84,7 +84,11 @@ class MediasiteIE(InfoExtractor): 'timestamp': 1333983600, 'duration': 7794, } - } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) From 4ee184545455d1ebf4a9f97f86ade2c4a3b3e03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 18 Dec 2018 01:55:13 +0700 Subject: [PATCH 088/226] [mediasite] Extend _VALID_URL even more --- youtube_dl/extractor/mediasite.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index de37df0c6..ef9628e65 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -21,7 +21,7 @@ from ..utils import ( class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/livebroadcast/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -89,6 +89,10 @@ class MediasiteIE(InfoExtractor): 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', 'only_matching': True, }, + { + 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) From 65e29cdac3f3ee14e07c200293438426b1135719 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 18 Dec 2018 22:46:19 +0100 Subject: [PATCH 089/226] [twitter] pass referer with card request(closes #18579) --- youtube_dl/extractor/twitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index de41065d6..41d0b6be8 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -171,7 +171,8 @@ class TwitterCardIE(TwitterBaseIE): urls.append('https://twitter.com/i/videos/' + video_id) for u in urls: - webpage = self._download_webpage(u, video_id) + webpage = self._download_webpage( + u, video_id, headers={'Referer': 'https://twitter.com/'}) iframe_url = self._html_search_regex( r']+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', From 904bb599be0fcf5fa861f17eee757c3c9494208b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Wed, 19 Dec 2018 22:22:10 +0200 Subject: [PATCH 090/226] [README.md] Add flake8 instructions --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a1d2904c0..b3c39bf66 100644 --- a/README.md +++ b/README.md @@ -1025,15 +1025,19 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! From 835e45abab88e2a1661b86be255e06323e88ec7a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 19 Dec 2018 22:07:37 +0100 Subject: [PATCH 091/226] [crackle] extract ism and http formats --- youtube_dl/extractor/crackle.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 8dd9d6687..f73ef6b63 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -48,6 +48,21 @@ class CrackleIE(InfoExtractor): 'only_matching': True, }] + _MEDIA_FILE_SLOTS = { + '360p.mp4': { + 'width': 640, + 'height': 360, + }, + '480p.mp4': { + 'width': 768, + 'height': 432, + }, + '480p_1mbps.mp4': { + 'width': 852, + 'height': 480, + }, + } + def _real_extract(self, url): video_id = self._match_id(url) @@ -95,6 +110,20 @@ class CrackleIE(InfoExtractor): elif ext == 'mpd': formats.extend(self._extract_mpd_formats( format_url, video_id, mpd_id='dash', fatal=False)) + elif format_url.endswith('.ism/Manifest'): + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + mfs_path = e.get('Type') + mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path) + if not mfs_info: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-' + mfs_path.split('.')[0], + 'width': mfs_info['width'], + 'height': mfs_info['height'], + }) self._sort_formats(formats) description = media.get('Description') From e1a0628797e2dc7e5e032f853ab4ddc6a7a08020 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 20 Dec 2018 23:22:51 +0100 Subject: [PATCH 092/226] [liveleak] add support for another embed type and restore original format extraction --- youtube_dl/extractor/liveleak.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 26671753c..22a067e40 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -87,7 +87,7 @@ class LiveLeakIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', + r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', webpage) def _real_extract(self, url): @@ -120,13 +120,27 @@ class LiveLeakIE(InfoExtractor): } for idx, info_dict in enumerate(entries): + formats = [] for a_format in info_dict['formats']: if not a_format.get('height'): a_format['height'] = int_or_none(self._search_regex( r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)) + formats.append(a_format) - self._sort_formats(info_dict['formats']) + # Removing '.*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/rg3/youtube-dl/pull/4768) + orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) + if a_format['url'] != orig_url: + format_id = a_format.get('format_id') + formats.append({ + 'format_id': 'original' + ('-' + format_id if format_id else ''), + 'url': orig_url, + 'preference': 1, + }) + self._sort_formats(formats) + info_dict['formats'] = formats # Don't append entry ID for one-video pages to keep backward compatibility if len(entries) > 1: @@ -146,7 +160,7 @@ class LiveLeakIE(InfoExtractor): class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[if])=(?P[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[ift])=(?P[\w_]+)' # See generic.py for actual test cases _TESTS = [{ @@ -158,15 +172,14 @@ class LiveLeakEmbedIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - kind, video_id = mobj.group('kind', 'id') + kind, video_id = re.match(self._VALID_URL, url).groups() if kind == 'f': webpage = self._download_webpage(url, video_id) liveleak_url = self._search_regex( - r'logourl\s*:\s*(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, + r'(?:logourl\s*:\s*|window\.open\()(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, webpage, 'LiveLeak URL', group='url') - elif kind == 'i': - liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + else: + liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) From 4273caf5c7e4a79783106938802c1f4e1aa9c950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Dec 2018 16:38:16 +0700 Subject: [PATCH 093/226] [youtube] Extend html5 player regex (closes #17516) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44c25c11c..906774875 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1834,7 +1834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 63529e935cf5f87e6080607ef9d9196fe435e092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Dec 2018 16:57:10 +0700 Subject: [PATCH 094/226] [youtube] Relax html5 player regexes (closes #18465, closes #18466) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 906774875..954853e00 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1105,7 +1105,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1834,7 +1834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 825cd268a3a02e80a9c36dada664d25074243d26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 26 Dec 2018 09:30:48 +0100 Subject: [PATCH 095/226] [youtube] detect DRM protected videos(#1774) --- youtube_dl/extractor/youtube.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 954853e00..4b06cb597 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1077,6 +1077,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1673,6 +1678,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + if video_info.get('license_info'): + raise ExtractorError('This video is DRM protected.', expected=True) + video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} From 140a13f5de1ff8e6bf7ba0d27b38063a2b20fa33 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 26 Dec 2018 10:55:28 +0100 Subject: [PATCH 096/226] [youtube] extract more format metadata --- youtube_dl/extractor/youtube.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4b06cb597..ac3b4fc17 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1794,6 +1794,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'height': int_or_none(width_height[1]), } q = qualities(['small', 'medium', 'hd720']) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) + if streaming_formats: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) @@ -1876,7 +1895,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] more_fields = { 'filesize': filesize, @@ -1884,7 +1903,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': quality, + 'format_note': url_data.get('quality_label', [None])[0] or quality, 'quality': q(quality), } for key, value in more_fields.items(): From c2dd2dc086653f46defe0bc6bee9b5e339e67dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 19:57:01 +0700 Subject: [PATCH 097/226] [youtube] Unescape HTML for series (closes #18641) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ac3b4fc17..539ff6ff2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2043,7 +2043,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', video_webpage) if m_episode: - series = m_episode.group('series') + series = unescapeHTML(m_episode.group('series')) season_number = int(m_episode.group('season')) episode_number = int(m_episode.group('episode')) else: From e4d51e751e361faaedaa50baf37501106459ab73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 20:59:53 +0700 Subject: [PATCH 098/226] [beeg] Fix extraction (closes #18610, closes #18626) --- youtube_dl/extractor/beeg.py | 75 ++++++------------------------------ 1 file changed, 11 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index bf22a41b7..1086d7632 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,15 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_ord, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( int_or_none, - parse_iso8601, - urljoin, + unified_timestamp, ) @@ -36,29 +31,9 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - cpl_url = self._search_regex( - r']+src=(["\'])(?P(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', - webpage, 'cpl', default=None, group='url') - - cpl_url = urljoin(url, cpl_url) - - beeg_version, beeg_salt = [None] * 2 - - if cpl_url: - cpl = self._download_webpage( - self._proto_relative_url(cpl_url), video_id, - 'Downloading cpl JS', fatal=False) - if cpl: - beeg_version = int_or_none(self._search_regex( - r'beeg_version\s*=\s*([^\b]+)', cpl, - 'beeg version', default=None)) or self._search_regex( - r'/(\d+)\.js', cpl_url, 'beeg version', default=None) - beeg_salt = self._search_regex( - r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg salt', - default=None, group='beeg_salt') - - beeg_version = beeg_version or '2185' - beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' + beeg_version = self._search_regex( + r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', + default='1546225636701') for api_path in ('', 'api.'): video = self._download_json( @@ -68,37 +43,6 @@ class BeegIE(InfoExtractor): if video: break - def split(o, e): - def cut(s, x): - n.append(s[:x]) - return s[x:] - n = [] - r = len(o) % e - if r > 0: - o = cut(o, r) - while len(o) > e: - o = cut(o, e) - n.append(o) - return n - - def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1738.js - a = beeg_salt - e = compat_urllib_parse_unquote(key) - o = ''.join([ - compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) - for n in range(len(e))]) - return ''.join(split(o, 3)[::-1]) - - def decrypt_url(encrypted_url): - encrypted_url = self._proto_relative_url( - encrypted_url.replace('{DATA_MARKERS}', ''), 'https:') - key = self._search_regex( - r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) - if not key: - return encrypted_url - return encrypted_url.replace(key, decrypt_key(key)) - formats = [] for format_id, video_url in video.items(): if not video_url: @@ -108,18 +52,20 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': decrypt_url(video_url), + 'url': self._proto_relative_url( + video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), 'format_id': format_id, 'height': int(height), }) self._sort_formats(formats) title = video['title'] - video_id = video.get('id') or video_id + video_id = compat_str(video.get('id') or video_id) display_id = video.get('code') description = video.get('desc') + series = video.get('ps_name') - timestamp = parse_iso8601(video.get('date'), ' ') + timestamp = unified_timestamp(video.get('date')) duration = int_or_none(video.get('duration')) tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None @@ -129,6 +75,7 @@ class BeegIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'series': series, 'timestamp': timestamp, 'duration': duration, 'tags': tags, From 4e1ddc8da9b318873d71d0283838af6aec1bc98d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 21:05:07 +0700 Subject: [PATCH 099/226] [npo:live] Add support for npostart.nl (closes #18644) --- youtube_dl/extractor/npo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c2cb85a73..5a427c396 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -363,7 +363,7 @@ class NPOIE(NPOBaseIE): class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P[^/?#&]+))?' + _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.npo.nl/live/npo-1', @@ -380,6 +380,9 @@ class NPOLiveIE(NPOBaseIE): }, { 'url': 'http://www.npo.nl/live', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/live/npo-1', + 'only_matching': True, }] def _real_extract(self, url): From 373941c5f0c163bf9f2fbbe7f65ee549a228720f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 23:20:40 +0700 Subject: [PATCH 100/226] [bbc] Add support for another embed pattern (closes #18643) --- youtube_dl/extractor/bbc.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index abcfa301d..eac9a5a46 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -795,6 +795,15 @@ class BBCIE(BBCCoUkIE): 'uploader': 'Radio 3', 'uploader_id': 'bbc_radio_three', }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], }] @classmethod @@ -945,6 +954,15 @@ class BBCIE(BBCCoUkIE): if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, From 32ac3d49ae83fe933515096a74068a8033131f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 23:57:46 +0700 Subject: [PATCH 101/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 510013e89..31458dbd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version + +Extractors ++ [bbc] Add support for another embed pattern (#18643) ++ [npo:live] Add support for npostart.nl (#18644) +* [beeg] Fix extraction (#18610, #18626) +* [youtube] Unescape HTML for series (#18641) ++ [youtube] Extract more format metadata +* [youtube] Detect DRM protected videos (#1774) +* [youtube] Relax HTML5 player regular expressions (#18465, #18466) +* [youtube] Extend HTML5 player regular expression (#17516) ++ [liveleak] Add support for another embed type and restore original + format extraction ++ [crackle] Extract ISM and HTTP formats ++ [twitter] Pass Referer with card request (#18579) +* [mediasite] Extend URL regular expression (#18558) ++ [lecturio] Add support for lecturio.de (#18562) ++ [discovery] Add support for Scripps Networks watch domains (#17947) + + version 2018.12.17 Extractors From 9d9daed4647c0a0798adee5a580e9fd95478cf6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 31 Dec 2018 23:59:52 +0700 Subject: [PATCH 102/226] release 2018.12.31 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 10 +++++++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b84559932..b2fe6ee6a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.31*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.31** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.17 +[debug] youtube-dl version 2018.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cba87190d..24617264c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -153,15 +153,19 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! diff --git a/ChangeLog b/ChangeLog index 31458dbd9..72bfe2110 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.12.31 Extractors + [bbc] Add support for another embed pattern (#18643) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 31d20f255..e92064432 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -438,6 +438,7 @@ - **Lecture2Go** - **Lecturio** - **LecturioCourse** + - **LecturioDeCourse** - **LEGO** - **Lemonde** - **Lenta** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0461aac0c..87de5848d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.17' +__version__ = '2018.12.31' From 6b688b8942a0bd18b2a3835e69a903f1eeffee2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Jan 2019 18:12:44 +0700 Subject: [PATCH 103/226] [bitchute] Fix extraction (closes #18567) --- youtube_dl/extractor/bitchute.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 43b4732aa..aa034355a 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -5,7 +5,10 @@ import itertools import re from .common import InfoExtractor -from ..utils import urlencode_postdata +from ..utils import ( + orderedSet, + urlencode_postdata, +) class BitChuteIE(InfoExtractor): @@ -43,10 +46,15 @@ class BitChuteIE(InfoExtractor): 'description', webpage, 'title', default=None) or self._og_search_description(webpage) + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + formats = [ - {'url': mobj.group('url')} - for mobj in re.finditer( - r'addWebSeed\s*\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage)] + {'url': format_url} + for format_url in orderedSet(format_urls)] self._sort_formats(formats) description = self._html_search_regex( From 38d15ba7f981cc82db91edcbf05ea3844082d045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Jan 2019 20:31:48 +0700 Subject: [PATCH 104/226] [manyvids] Fix extraction (closes #18604, closes #18614) --- youtube_dl/extractor/manyvids.py | 62 +++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index b94b3c2ab..e8d7163e4 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + determine_ext, + int_or_none, + str_to_int, + urlencode_postdata, +) class ManyVidsIE(InfoExtractor): _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ + # preview video 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', 'info_dict': { @@ -17,7 +23,18 @@ class ManyVidsIE(InfoExtractor): 'view_count': int, 'like_count': int, }, - } + }, { + # full video + 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', + 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'info_dict': { + 'id': '935718', + 'ext': 'mp4', + 'title': 'MY FACE REVEAL', + 'view_count': int, + 'like_count': int, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -28,12 +45,41 @@ class ManyVidsIE(InfoExtractor): r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video URL', group='url') - title = '%s (Preview)' % self._html_search_regex( - r']+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + title = self._html_search_regex( + (r']+class=["\']item-title[^>]+>([^<]+)', + r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + + if any(p in webpage for p in ('preview_videos', '_preview.mp4')): + title += ' (Preview)' + + mv_token = self._search_regex( + r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'mv token', default=None, group='value') + + if mv_token: + # Sets some cookies + self._download_webpage( + 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', + video_id, fatal=False, data=urlencode_postdata({ + 'mvtoken': mv_token, + 'vid': video_id, + }), headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + }) + + if determine_ext(video_url) == 'm3u8': + formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{'url': video_url}] like_count = int_or_none(self._search_regex( r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = int_or_none(self._html_search_regex( + view_count = str_to_int(self._html_search_regex( r'(?s)]+class="views-wrapper"[^>]*>(.+?) Date: Sun, 30 Dec 2018 14:30:36 +0100 Subject: [PATCH 105/226] [rmcdecouverte] Update _VALID_URL (closes #18595) --- youtube_dl/extractor/rmcdecouverte.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index e921ca3e6..c44b90274 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -10,18 +10,18 @@ from ..compat import ( class RMCDecouverteIE(InfoExtractor): - _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P\d+)' + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/.+/program_(?P\d+)' _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', 'info_dict': { - 'id': '5419055995001', + 'id': '5983675500001', 'ext': 'mp4', - 'title': 'UN DELICIEUX PROJET', - 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', 'uploader_id': '1969646226001', - 'upload_date': '20170502', - 'timestamp': 1493745308, + 'upload_date': '20181226', + 'timestamp': 1545861635, }, 'params': { 'skip_download': True, From d9f1123c08e21c7170b906c7de2562b7293f7a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Jan 2019 20:49:13 +0700 Subject: [PATCH 106/226] [rmcdecouverte] Improve, bypass geo restriction and add support for live (closes #18697) --- youtube_dl/extractor/rmcdecouverte.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index c44b90274..c3623edcc 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -1,18 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE from ..compat import ( compat_parse_qs, compat_urlparse, ) +from ..utils import smuggle_url class RMCDecouverteIE(InfoExtractor): - _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/.+/program_(?P\d+)' + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P\d+)|(?Pmediaplayer-direct))' - _TEST = { + _TESTS = [{ 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', 'info_dict': { 'id': '5983675500001', @@ -27,12 +30,17 @@ class RMCDecouverteIE(InfoExtractor): 'skip_download': True, }, 'skip': 'only available for a week', - } + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('live_id') + webpage = self._download_webpage(url, display_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if brightcove_legacy_url: brightcove_id = compat_parse_qs(compat_urlparse.urlparse( @@ -41,5 +49,7 @@ class RMCDecouverteIE(InfoExtractor): brightcove_id = self._search_regex( r'data-video-id=["\'](\d+)', webpage, 'brightcove id') return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', - brightcove_id) + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) From 9b5c8751eeb10fd41c560696d73fec3401055ddc Mon Sep 17 00:00:00 2001 From: iwconfig Date: Tue, 1 Jan 2019 15:39:18 +0100 Subject: [PATCH 107/226] [extractors] Add missing age limits --- youtube_dl/extractor/cammodels.py | 2 ++ youtube_dl/extractor/camtube.py | 2 ++ youtube_dl/extractor/camwithher.py | 2 ++ youtube_dl/extractor/yourporn.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 79350817f..1eb81b75e 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -14,6 +14,7 @@ class CamModelsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.cammodels.com/cam/AutumnKnight/', 'only_matching': True, + 'age_limit': 18 }] def _real_extract(self, url): @@ -93,4 +94,5 @@ class CamModelsIE(InfoExtractor): 'title': self._live_title(user_id), 'is_live': True, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py index c7d40f849..b3be3bdcf 100644 --- a/youtube_dl/extractor/camtube.py +++ b/youtube_dl/extractor/camtube.py @@ -20,6 +20,7 @@ class CamTubeIE(InfoExtractor): 'duration': 1274, 'timestamp': 1528018608, 'upload_date': '20180603', + 'age_limit': 18 }, 'params': { 'skip_download': True, @@ -66,4 +67,5 @@ class CamTubeIE(InfoExtractor): 'like_count': like_count, 'creator': creator, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index afbc5ea26..bbc5205fd 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -25,6 +25,7 @@ class CamWithHerIE(InfoExtractor): 'comment_count': int, 'uploader': 'MileenaK', 'upload_date': '20160322', + 'age_limit': 18, }, 'params': { 'skip_download': True, @@ -84,4 +85,5 @@ class CamWithHerIE(InfoExtractor): 'comment_count': comment_count, 'uploader': uploader, 'upload_date': upload_date, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index a9951f3b8..432d1fff4 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -14,6 +14,7 @@ class YourPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'md5:c9f43630bd968267672651ba905a7d35', 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18 }, } @@ -38,4 +39,5 @@ class YourPornIE(InfoExtractor): 'url': video_url, 'title': title, 'thumbnail': thumbnail, + 'age_limit': 18 } From 0e713dbb116493181218c38dc03716feda2fa7d9 Mon Sep 17 00:00:00 2001 From: biwubo <45994985+biwubo@users.noreply.github.com> Date: Tue, 1 Jan 2019 15:48:06 +0000 Subject: [PATCH 108/226] [acast:channel] Add support for play.acast.com (closes #18587) --- youtube_dl/extractor/acast.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index b32f74a37..c4362be88 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -79,17 +79,27 @@ class ACastIE(InfoExtractor): class ACastChannelIE(InfoExtractor): IE_NAME = 'acast:channel' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/#?]+)' - _TEST = { - 'url': 'https://www.acast.com/condenasttraveler', + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/todayinfocus', 'info_dict': { - 'id': '50544219-29bb-499e-a083-6087f4cb7797', - 'title': 'Condé Nast Traveler Podcast', - 'description': 'md5:98646dee22a5b386626ae31866638fbd', + 'id': '4efc5294-5385-4847-98bd-519799ce5786', + 'title': 'Today in Focus', + 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', }, - 'playlist_mincount': 20, - } - _API_BASE_URL = 'https://www.acast.com/api/' + 'playlist_mincount': 35, + }, { + 'url': 'http://play.acast.com/s/ft-banking-weekly', + 'only_matching': True, + }] + _API_BASE_URL = 'https://play.acast.com/api/' _PAGE_SIZE = 10 @classmethod @@ -102,7 +112,7 @@ class ACastChannelIE(InfoExtractor): channel_slug, note='Download page %d of channel data' % page) for cast in casts: yield self.url_result( - 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']), + 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), 'ACast', cast['id']) def _real_extract(self, url): From 696f4e41149a8f91e963d33e2f6ea9bc835f2260 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Tue, 1 Jan 2019 23:13:39 +0700 Subject: [PATCH 109/226] [README.md] Add more guide lines for regular expressions --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b3c39bf66..bdc5faeec 100644 --- a/README.md +++ b/README.md @@ -1133,11 +1133,33 @@ title = meta.get('title') or self._og_search_title(webpage) This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. -### Make regular expressions flexible +### Regular expressions -When using regular expressions try to write them fuzzy and flexible. +#### Don't capture groups you don't use + +Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. + +##### Example + +Don't capture id attribute name here since you can't use it for anything anyway. + +Correct: + +```python +r'(?:id|ID)=(?P\d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: From 1d803085d730bf2aaf54e5cea2362133b3fdb831 Mon Sep 17 00:00:00 2001 From: v-delta <45652398+v-delta@users.noreply.github.com> Date: Tue, 1 Jan 2019 17:26:59 +0100 Subject: [PATCH 110/226] [yourporn] Fix extraction --- youtube_dl/extractor/yourporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 432d1fff4..c8dc29bd8 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -27,7 +27,7 @@ class YourPornIE(InfoExtractor): self._search_regex( r'data-vnfo=(["\'])(?P{.+?})\1', webpage, 'data info', group='data'), - video_id)[video_id]).replace('/cdn/', '/cdn2/') + video_id)[video_id]).replace('/cdn/', '/cdn3/') title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', From 8437f5089f2fac85a249c4368c2ea1415955d8b0 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Tue, 1 Jan 2019 23:50:02 +0700 Subject: [PATCH 111/226] [README.md] Add long lines policy to coding conventions --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index bdc5faeec..901595444 100644 --- a/README.md +++ b/README.md @@ -1192,6 +1192,25 @@ title = self._search_regex( webpage, 'title', group='title') ``` +### Long lines policy + +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + ### Use safe conversion functions Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. From d226c560a6250d03ff5f27d5b078f894d448a0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Tue, 1 Jan 2019 18:56:05 +0200 Subject: [PATCH 112/226] Refactor code to use url_result --- youtube_dl/extractor/audiomack.py | 2 +- youtube_dl/extractor/cnn.py | 12 ++---------- youtube_dl/extractor/freespeech.py | 7 ++----- youtube_dl/extractor/generic.py | 5 +---- youtube_dl/extractor/livestream.py | 5 +---- youtube_dl/extractor/savefrom.py | 7 ++----- youtube_dl/extractor/ted.py | 6 ++---- youtube_dl/extractor/testurl.py | 6 +----- youtube_dl/extractor/wimp.py | 6 +----- 9 files changed, 13 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 62049b921..cc7771354 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -62,7 +62,7 @@ class AudiomackIE(InfoExtractor): # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor if SoundcloudIE.suitable(api_response['url']): - return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} + return self.url_result(api_response['url'], SoundcloudIE.ie_key()) return { 'id': compat_str(api_response.get('id', album_url_tag)), diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5fc311f53..774b71055 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -119,11 +119,7 @@ class CNNBlogsIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return { - '_type': 'url', - 'url': cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result(cnn_url, CNNIE.ie_key()) class CNNArticleIE(InfoExtractor): @@ -145,8 +141,4 @@ class CNNArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return { - '_type': 'url', - 'url': 'http://cnn.com/video/?/video/' + cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 486a49c05..ea9c3e317 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .youtube import YoutubeIE class FreespeechIE(InfoExtractor): @@ -27,8 +28,4 @@ class FreespeechIE(InfoExtractor): r'data-video-url="([^"]+)"', webpage, 'youtube url') - return { - '_type': 'url', - 'url': youtube_url, - 'ie_key': 'Youtube', - } + return self.url_result(youtube_url, YoutubeIE.ie_key()) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 65b482333..067de28cd 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2197,10 +2197,7 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): if url.startswith('//'): - return { - '_type': 'url', - 'url': self.http_scheme() + url, - } + return self.url_result(self.http_scheme() + url) parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c4776bbf3..e55b1a202 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -363,7 +363,4 @@ class LivestreamShortenerIE(InfoExtractor): id = mobj.group('id') webpage = self._download_webpage(url, id) - return { - '_type': 'url', - 'url': self._og_search_url(webpage), - } + return self.url_result(self._og_search_url(webpage)) diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 30f9cf824..21e44b69a 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -30,8 +30,5 @@ class SaveFromIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = os.path.splitext(url.split('/')[-1])[0] - return { - '_type': 'url', - 'id': video_id, - 'url': mobj.group('url'), - } + + return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index f9b6aa48f..d3e4205f5 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -203,10 +203,8 @@ class TEDIE(InfoExtractor): ext_url = None if service.lower() == 'youtube': ext_url = external.get('code') - return { - '_type': 'url', - 'url': ext_url or external['uri'], - } + + return self.url_result(ext_url or external['uri']) resources_ = player_talk.get('resources') or talk_info.get('resources') diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py index 46918adb0..84a14a0bd 100644 --- a/youtube_dl/extractor/testurl.py +++ b/youtube_dl/extractor/testurl.py @@ -61,8 +61,4 @@ class TestURLIE(InfoExtractor): self.to_screen('Test URL: %s' % tc['url']) - return { - '_type': 'url', - 'url': tc['url'], - 'id': video_id, - } + return self.url_result(tc['url'], video_id=video_id) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 3dab9145b..ea234e3c5 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -40,11 +40,7 @@ class WimpIE(InfoExtractor): r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: - return { - '_type': 'url', - 'url': youtube_id, - 'ie_key': YoutubeIE.ie_key(), - } + return self.url_result(youtube_id, YoutubeIE.ie_key()) info_dict = self._extract_jwplayer_data( webpage, video_id, require_title=False) From 751e051557372ab3fa41d839e7e6a149aa2d04b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Jan 2019 17:26:15 +0700 Subject: [PATCH 113/226] [packtpub] Add support for subscription.packtpub.com (closes #18718) --- youtube_dl/extractor/packtpub.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 56a2a1083..1324137df 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -24,9 +24,9 @@ class PacktPubBaseIE(InfoExtractor): class PacktPubIE(PacktPubBaseIE): - _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', 'info_dict': { @@ -37,7 +37,10 @@ class PacktPubIE(PacktPubBaseIE): 'timestamp': 1490918400, 'upload_date': '20170331', }, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro', + 'only_matching': True, + }] _NETRC_MACHINE = 'packtpub' _TOKEN = None @@ -110,15 +113,18 @@ class PacktPubIE(PacktPubBaseIE): class PacktPubCourseIE(PacktPubBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+))' - _TEST = { + _VALID_URL = r'(?Phttps?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P\d+))' + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', 'info_dict': { 'id': '9781787122215', 'title': 'Learn Nodejs by building 12 projects [Video]', }, 'playlist_count': 90, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215', + 'only_matching': True, + }] @classmethod def suitable(cls, url): From 2122d7151deca9c7047426320cd73152cb907afa Mon Sep 17 00:00:00 2001 From: nyuszika7h Date: Wed, 2 Jan 2019 17:46:07 +0100 Subject: [PATCH 114/226] [discovery] Use geo verification headers --- youtube_dl/extractor/discovery.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 44fbc41bb..b70c307a7 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -94,11 +94,12 @@ class DiscoveryIE(DiscoveryGoBaseIE): })['access_token'] try: + headers = self.geo_verification_headers() + headers['Authorization'] = 'Bearer ' + access_token + stream = self._download_json( 'https://api.discovery.com/v1/streaming/video/' + video_id, - display_id, headers={ - 'Authorization': 'Bearer ' + access_token, - }) + display_id, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): e_description = self._parse_json( From aeb72b3a413c8a9d11e3c002b8ab2612b9075a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Jan 2019 23:51:23 +0700 Subject: [PATCH 115/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 72bfe2110..1a28d6be0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Extractors +* [discovery] Use geo verification headers (#17838) ++ [packtpub] Add support for subscription.packtpub.com (#18718) +* [yourporn] Fix extraction (#18583) ++ [acast:channel] Add support for play.acast.com (#18587) ++ [extractors] Add missing age limits (#18621) ++ [rmcdecouverte] Add support for live stream +* [rmcdecouverte] Bypass geo restriction +* [rmcdecouverte] Update URL regular expression (#18595, 18697) +* [manyvids] Fix extraction (#18604, #18614) +* [bitchute] Fix extraction (#18567) + + version 2018.12.31 Extractors From d7c3af7a72a1e6bd7f93321cce4daf2a13ebdf12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Jan 2019 23:52:54 +0700 Subject: [PATCH 116/226] release 2019.01.02 --- .github/ISSUE_TEMPLATE.md | 6 ++--- CONTRIBUTING.md | 47 ++++++++++++++++++++++++++++++++++++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b2fe6ee6a..3986bbc8e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.12.31*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.12.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.12.31 +[debug] youtube-dl version 2019.01.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 24617264c..a71b045d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -261,11 +261,33 @@ title = meta.get('title') or self._og_search_title(webpage) This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. -### Make regular expressions flexible +### Regular expressions -When using regular expressions try to write them fuzzy and flexible. +#### Don't capture groups you don't use + +Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. + +##### Example + +Don't capture id attribute name here since you can't use it for anything anyway. + +Correct: + +```python +r'(?:id|ID)=(?P\d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: @@ -298,6 +320,25 @@ title = self._search_regex( webpage, 'title', group='title') ``` +### Long lines policy + +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + ### Use safe conversion functions Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. diff --git a/ChangeLog b/ChangeLog index 1a28d6be0..41d190a72 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.01.02 Extractors * [discovery] Use geo verification headers (#17838) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 87de5848d..07a444706 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.12.31' +__version__ = '2019.01.02' From c87f65e43dea0f9edf1b5ed8979e274902fb60a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Jan 2019 22:21:53 +0700 Subject: [PATCH 117/226] [carambatv:page] Fix extraction (closes #18739) --- youtube_dl/extractor/carambatv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py index 9ba909a91..b57b86af7 100644 --- a/youtube_dl/extractor/carambatv.py +++ b/youtube_dl/extractor/carambatv.py @@ -82,6 +82,12 @@ class CarambaTVPageIE(InfoExtractor): webpage = self._download_webpage(url, video_id) videomore_url = VideomoreIE._extract_url(webpage) + if not videomore_url: + videomore_id = self._search_regex( + r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', + default=None) + if videomore_id: + videomore_url = 'videomore:%s' % videomore_id if videomore_url: title = self._og_search_title(webpage) return { From de0359c0af3667605464212e66ba4048b6ba093b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Jan 2019 03:40:41 +0700 Subject: [PATCH 118/226] [tvnow] Fix and rework extractors, prepare for a switch to the new API (closes #17245, closes #18499) --- youtube_dl/extractor/extractors.py | 4 +- youtube_dl/extractor/tvnow.py | 378 +++++++++++++++++++++-------- 2 files changed, 283 insertions(+), 99 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d72f52e36..3b1dfc451 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1193,7 +1193,9 @@ from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, - TVNowListIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, TVNowShowIE, ) from .tvp import ( diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 60937616f..3c6a60c39 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -10,8 +10,9 @@ from ..utils import ( int_or_none, parse_iso8601, parse_duration, - try_get, + str_or_none, update_url_query, + urljoin, ) @@ -24,8 +25,7 @@ class TVNowBaseIE(InfoExtractor): def _call_api(self, path, video_id, query): return self._download_json( - 'https://api.tvnow.de/v3/' + path, - video_id, query=query) + 'https://api.tvnow.de/v3/' + path, video_id, query=query) def _extract_video(self, info, display_id): video_id = compat_str(info['id']) @@ -108,6 +108,11 @@ class TVNowIE(TVNowBaseIE): (?!(?:list|jahr)(?:/|$))(?P[^/?\#&]+) ''' + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) + else super(TVNowIE, cls).suitable(url)) + _TESTS = [{ 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', 'info_dict': { @@ -116,7 +121,6 @@ class TVNowIE(TVNowBaseIE): 'ext': 'mp4', 'title': 'Der neue Porsche 911 GT 3', 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1495994400, 'upload_date': '20170528', 'duration': 5283, @@ -161,136 +165,314 @@ class TVNowIE(TVNowBaseIE): info = self._call_api( 'movies/' + display_id, display_id, query={ 'fields': ','.join(self._VIDEO_FIELDS), - 'station': mobj.group(1), }) return self._extract_video(info, display_id) -class TVNowListBaseIE(TVNowBaseIE): - _SHOW_VALID_URL = r'''(?x) - (?P - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ - (?P[^/]+) - ) +class TVNowNewIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?Phttps?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien))/ + (?P[^/]+)-\d+/ + [^/]+/ + episode-\d+-(?P[^/?$&]+)-(?P\d+) ''' - def _extract_list_info(self, display_id, show_id): - fields = list(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - return self._call_api( - 'formats/seo', display_id, query={ - 'fields': ','.join(fields), - 'name': show_id + '.php' - }) - - -class TVNowListIE(TVNowListBaseIE): - _VALID_URL = r'%s/(?:list|jahr)/(?P[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL - - _SHOW_FIELDS = ('title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', ) - _TESTS = [{ - 'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell', - 'info_dict': { - 'id': '28296', - 'title': '30 Minuten Deutschland - Aktuell', - }, - 'playlist_mincount': 1, - }, { - 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3', + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return (False if TVNowIE.suitable(url) - else super(TVNowListIE, cls).suitable(url)) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) + show, episode = mobj.group('show', 'episode') + return self.url_result( + # Rewrite new URLs to the old format and use extraction via old API + # at api.tvnow.de as a loophole for bypassing premium content checks + '%s/%s/%s' % (base_url, show, episode), + ie=TVNowIE.ie_key(), video_id=mobj.group('id')) + + +class TVNowNewBaseIE(InfoExtractor): + def _call_api(self, path, video_id, query={}): + result = self._download_json( + 'https://apigw.tvnow.de/module/' + path, video_id, query=query) + error = result.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + return result + + +""" +TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it +when api.tvnow.de is shut down. This version can't bypass premium checks though. +class TVNowIE(TVNowNewBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien)/[^/]+/ + (?:[^/]+/)+ + (?P[^/?$&]+)-(?P\d+) + ''' + + _TESTS = [{ + # episode with annual navigation + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2, episode with season navigation + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _extract_video(self, info, url, display_id): + config = info['config'] + source = config['source'] + + video_id = compat_str(info.get('id') or source['videoId']) + title = source['title'].strip() + + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: + if try_get(info, lambda x: x['rights']['isDrm']): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if try_get(config, lambda x: x['boards']['geoBlocking']['block']): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = source.get('description') + thumbnail = url_or_none(source.get('poster')) + timestamp = unified_timestamp(source.get('previewStart')) + duration = parse_duration(source.get('length')) + + series = source.get('format') + season_number = int_or_none(self._search_regex( + r'staffel-(\d+)', url, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'episode-(\d+)', url, 'episode number', default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'episode': title, + 'formats': formats, + } def _real_extract(self, url): - base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() + info = self._call_api('player/' + video_id, video_id) + return self._extract_video(info, video_id, display_id) +""" - list_info = self._extract_list_info(season_id, show_id) - season = next( - season for season in list_info['formatTabs']['items'] - if season.get('seoheadline') == season_id) +class TVNowListBaseIE(TVNowNewBaseIE): + _SHOW_VALID_URL = r'''(?x) + (?P + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ + [^/?#&]+-(?P\d+) + ) + ''' - title = list_info.get('title') - headline = season.get('headline') - if title and headline: - title = '%s - %s' % (title, headline) - else: - title = headline or title + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) + else super(TVNowListBaseIE, cls).suitable(url)) + + def _extract_items(self, url, show_id, list_id, query): + items = self._call_api( + 'teaserrow/format/episode/' + show_id, list_id, + query=query)['items'] entries = [] - for container in season['formatTabPages']['items']: - items = try_get( - container, lambda x: x['container']['movies']['items'], - list) or [] - for info in items: - seo_url = info.get('seoUrl') - if not seo_url: - continue - video_id = info.get('id') - entries.append(self.url_result( - '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(), - compat_str(video_id) if video_id else None)) + for item in items: + if not isinstance(item, dict): + continue + item_url = urljoin(url, item.get('url')) + if not item_url: + continue + video_id = str_or_none(item.get('id') or item.get('videoId')) + item_title = item.get('subheadline') or item.get('text') + entries.append(self.url_result( + item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, + video_title=item_title)) - return self.playlist_result( - entries, compat_str(season.get('id') or season_id), title) + return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) + + +class TVNowSeasonIE(TVNowListBaseIE): + _VALID_URL = r'%s/staffel-(?P\d+)' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', + 'info_dict': { + 'id': '1815/13', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + _, show_id, season_id = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, season_id, {'season': season_id}) + + +class TVNowAnnualIE(TVNowListBaseIE): + _VALID_URL = r'%s/(?P\d{4})-(?P\d{2})' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', + 'info_dict': { + 'id': '1669/2017-05', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + _, show_id, year, month = re.match(self._VALID_URL, url).groups() + return self._extract_items( + url, show_id, '%s-%s' % (year, month), { + 'year': int(year), + 'month': int(month), + }) class TVNowShowIE(TVNowListBaseIE): _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL - - _SHOW_FIELDS = ('id', 'title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - _VIDEO_FIELDS = () - _TESTS = [{ - 'url': 'https://www.tvnow.at/vox/ab-ins-beet', + # annual navigationType + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', 'info_dict': { - 'id': 'ab-ins-beet', - 'title': 'Ab ins Beet!', + 'id': '1669', }, - 'playlist_mincount': 7, + 'playlist_mincount': 73, }, { - 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/', - 'only_matching': True, + # season navigationType + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', + 'info_dict': { + 'id': '11471', + }, + 'playlist_mincount': 3, }] @classmethod def suitable(cls, url): - return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) else super(TVNowShowIE, cls).suitable(url)) def _real_extract(self, url): base_url, show_id = re.match(self._VALID_URL, url).groups() - list_info = self._extract_list_info(show_id, show_id) + result = self._call_api( + 'teaserrow/format/navigation/' + show_id, show_id) + + items = result['items'] entries = [] - for season_info in list_info['formatTabs']['items']: - season_url = season_info.get('seoheadline') - if not season_url: - continue - season_id = season_info.get('id') - entries.append(self.url_result( - '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(), - compat_str(season_id) if season_id else None, - season_info.get('headline'))) + navigation = result.get('navigationType') + if navigation == 'annual': + for item in items: + if not isinstance(item, dict): + continue + year = int_or_none(item.get('year')) + if year is None: + continue + months = item.get('months') + if not isinstance(months, list): + continue + for month_dict in months: + if not isinstance(month_dict, dict) or not month_dict: + continue + month_number = int_or_none(list(month_dict.keys())[0]) + if month_number is None: + continue + entries.append(self.url_result( + '%s/%04d-%02d' % (base_url, year, month_number), + ie=TVNowAnnualIE.ie_key())) + elif navigation == 'season': + for item in items: + if not isinstance(item, dict): + continue + season_number = int_or_none(item.get('season')) + if season_number is None: + continue + entries.append(self.url_result( + '%s/staffel-%d' % (base_url, season_number), + ie=TVNowSeasonIE.ie_key())) + else: + raise ExtractorError('Unknown navigationType') - return self.playlist_result(entries, show_id, list_info.get('title')) + return self.playlist_result(entries, show_id) From b7acc83550598b9facf1b31d3d9a1c2823b29cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Jan 2019 00:55:39 +0700 Subject: [PATCH 119/226] [utils] Add language codes replaced in 1989 revision of ISO 639 to ISO639Utils (closes #18765) --- youtube_dl/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 62e769fd5..64b524d2f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2968,6 +2968,7 @@ class ISO639Utils(object): 'gv': 'glv', 'ha': 'hau', 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision 'hi': 'hin', 'ho': 'hmo', 'hr': 'hrv', @@ -2977,6 +2978,7 @@ class ISO639Utils(object): 'hz': 'her', 'ia': 'ina', 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision 'ie': 'ile', 'ig': 'ibo', 'ii': 'iii', @@ -3091,6 +3093,7 @@ class ISO639Utils(object): 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', + 'ji': 'yid', # Replaced by he yi 1989 revision 'yo': 'yor', 'za': 'zha', 'zh': 'zho', From 04fb6928da5e34fb3c7eaf50592a141400e00487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Jan 2019 00:57:24 +0700 Subject: [PATCH 120/226] [postprocessor/ffmpeg] Embed subtitles with non-standard language codes (refs #18765) --- youtube_dl/postprocessor/ffmpeg.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 757b496a1..efcd39d57 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -384,9 +384,8 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): opts += ['-c:s', 'mov_text'] for (i, lang) in enumerate(sub_langs): opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = ISO639Utils.short2long(lang) - if lang_code is not None: - opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + lang_code = ISO639Utils.short2long(lang) or lang + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) temp_filename = prepend_extension(filename, 'temp') self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) From e9a50fba8603128dcb81fa1b59206a0cb106d540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 7 Jan 2019 01:02:34 +0700 Subject: [PATCH 121/226] [utils] Fix typo --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64b524d2f..d2d3c1a9f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3093,7 +3093,7 @@ class ISO639Utils(object): 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', - 'ji': 'yid', # Replaced by he yi 1989 revision + 'ji': 'yid', # Replaced by yi in 1989 revision 'yo': 'yor', 'za': 'zha', 'zh': 'zho', From bcc334a3c66f2465cd63be284e389cc4c7a78203 Mon Sep 17 00:00:00 2001 From: 4rensiker <4rensics@gmx.de> Date: Tue, 8 Jan 2019 02:44:42 +0100 Subject: [PATCH 122/226] [dtube] Fix extraction (closes #18741) --- youtube_dl/extractor/dtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py index 5887887e1..20190c9cc 100644 --- a/youtube_dl/extractor/dtube.py +++ b/youtube_dl/extractor/dtube.py @@ -48,7 +48,7 @@ class DTubeIE(InfoExtractor): def canonical_url(h): if not h: return None - return 'https://ipfs.io/ipfs/' + h + return 'https://video.dtube.top/ipfs/' + h formats = [] for q in ('240', '480', '720', '1080', ''): From 0266854f63baaa9669b30ee49c5e7e9f353970dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Jan 2019 08:46:34 +0700 Subject: [PATCH 123/226] [dtube] Update test --- youtube_dl/extractor/dtube.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py index 20190c9cc..114d2dbe3 100644 --- a/youtube_dl/extractor/dtube.py +++ b/youtube_dl/extractor/dtube.py @@ -15,16 +15,16 @@ from ..utils import ( class DTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P[0-9a-z.-]+)/(?P[0-9a-z]{8})' _TEST = { - 'url': 'https://d.tube/#!/v/benswann/zqd630em', - 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1', + 'md5': '9f29088fa08d699a7565ee983f56a06e', 'info_dict': { - 'id': 'zqd630em', + 'id': 'x380jtr1', 'ext': 'mp4', - 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', - 'description': 'md5:700d164e066b87f9eac057949e4227c2', - 'uploader_id': 'benswann', - 'upload_date': '20180222', - 'timestamp': 1519328958, + 'title': 'Lefty 3-Rings is Back Baby!! NCAA Picks', + 'description': 'md5:60be222088183be3a42f196f34235776', + 'uploader_id': 'broncnutz', + 'upload_date': '20190107', + 'timestamp': 1546854054, }, 'params': { 'format': '480p', From 8cb5c2181ac039e92a20247f266f48cee2411748 Mon Sep 17 00:00:00 2001 From: Awal Garg Date: Mon, 7 Jan 2019 17:41:12 +0530 Subject: [PATCH 124/226] [hungama] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hungama.py | 32 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 youtube_dl/extractor/hungama.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3b1dfc451..5ff34216f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -469,6 +469,7 @@ from .hrti import ( ) from .huajiao import HuajiaoIE from .huffpost import HuffPostIE +from .hungama import HungamaIE from .hypem import HypemIE from .iconosquare import IconosquareIE from .ign import ( diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py new file mode 100644 index 000000000..e51440299 --- /dev/null +++ b/youtube_dl/extractor/hungama.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class HungamaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)hungama\.com/song/[\w\d-]+/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', + 'md5': '396fa7e8e7e67aa25da0edc4cac9b785', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Kitni Haseen Zindagi', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_data = self._download_json('https://www.hungama.com/audio-player-data/track/%s?_country=IN' % video_id, video_id)[0] + title = player_data.get('song_name') or self._og_search_title(webpage) + track_data = self._download_json(player_data['file'], video_id) + media_url = track_data['response']['media_url'] + + return { + 'id': video_id, + 'title': title, + 'formats': self._extract_m3u8_formats(media_url, video_id, ext='mp4'), + } From 06b4b90c70636d8044b7a2f588503cadf8b202d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Jan 2019 09:09:49 +0700 Subject: [PATCH 125/226] [hungama] Fix code and extract more metadata (closes #18771) --- youtube_dl/extractor/hungama.py | 41 +++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py index e51440299..d9ad1281a 100644 --- a/youtube_dl/extractor/hungama.py +++ b/youtube_dl/extractor/hungama.py @@ -2,31 +2,54 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import int_or_none class HungamaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)hungama\.com/song/[\w\d-]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P[0-9]+)' _TEST = { 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', - 'md5': '396fa7e8e7e67aa25da0edc4cac9b785', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', 'info_dict': { 'id': '2931166', 'ext': 'mp4', - 'title': 'Kitni Haseen Zindagi', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_data = self._download_json('https://www.hungama.com/audio-player-data/track/%s?_country=IN' % video_id, video_id)[0] - title = player_data.get('song_name') or self._og_search_title(webpage) - track_data = self._download_json(player_data['file'], video_id) - media_url = track_data['response']['media_url'] + data = self._download_json( + 'https://www.hungama.com/audio-player-data/track/%s' % video_id, + video_id, query={'_country': 'IN'})[0] + + track = data['song_name'] + artist = data.get('singer_name') + + m3u8_url = self._download_json( + data.get('file') or data['preview_link'], + video_id)['response']['media_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + title = '%s - %s' % (artist, track) if artist else track + thumbnail = data.get('img_src') or data.get('album_image') return { 'id': video_id, 'title': title, - 'formats': self._extract_m3u8_formats(media_url, video_id, ext='mp4'), + 'thumbnail': thumbnail, + 'track': track, + 'artist': artist, + 'album': data.get('album_name'), + 'release_year': int_or_none(data.get('date')), + 'formats': formats, } From 391256dc0ee5e6ac8d3022455de207b8b02a5b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Jan 2019 10:02:00 +0700 Subject: [PATCH 126/226] [extractor/common] Add support for movies in _json_ld --- youtube_dl/extractor/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e5f8136fc..f507400cc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1250,6 +1250,13 @@ class InfoExtractor(object): part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Movie': + info.update({ + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('dateCreated')), + }) elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), From 440863ade10c70bec272c1ffe17b63c877e46ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Jan 2019 10:02:49 +0700 Subject: [PATCH 127/226] [extractor/common] Use episode name as title in _json_ld --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f507400cc..9e7febcad 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1239,11 +1239,14 @@ class InfoExtractor(object): if expected_type is not None and expected_type != item_type: return info if item_type in ('TVEpisode', 'Episode'): + episode_name = unescapeHTML(e.get('name')) info.update({ - 'episode': unescapeHTML(e.get('name')), + 'episode': episode_name, 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) + if not info.get('title') and episode_name: + info['title'] = episode_name part_of_season = e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) From 2543938bbe393ceef8dca6a69b441d54df099107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 Jan 2019 10:03:44 +0700 Subject: [PATCH 128/226] [hungama] Add support for videos (closes #17402) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/hungama.py | 78 +++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ff34216f..ddeb70284 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -469,7 +469,10 @@ from .hrti import ( ) from .huajiao import HuajiaoIE from .huffpost import HuffPostIE -from .hungama import HungamaIE +from .hungama import ( + HungamaIE, + HungamaSongIE, +) from .hypem import HypemIE from .iconosquare import IconosquareIE from .ign import ( diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py index d9ad1281a..3fdaac5b6 100644 --- a/youtube_dl/extractor/hungama.py +++ b/youtube_dl/extractor/hungama.py @@ -2,11 +2,73 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + urlencode_postdata, +) class HungamaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P[0-9]+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?hungama\.com/ + (?: + (?:video|movie)/[^/]+/| + tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + }, { + 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', + 'only_matching': True, + }, { + 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info = self._search_json_ld(webpage, video_id) + + m3u8_url = self._download_json( + 'https://www.hungama.com/index.php', video_id, + data=urlencode_postdata({'content_id': video_id}), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'c': 'common', + 'm': 'get_video_mdn_url', + })['stream_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'formats': formats, + }) + return info + + +class HungamaSongIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P\d+)' _TEST = { 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', @@ -22,21 +84,21 @@ class HungamaIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + audio_id = self._match_id(url) data = self._download_json( - 'https://www.hungama.com/audio-player-data/track/%s' % video_id, - video_id, query={'_country': 'IN'})[0] + 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, + audio_id, query={'_country': 'IN'})[0] track = data['song_name'] artist = data.get('singer_name') m3u8_url = self._download_json( data.get('file') or data['preview_link'], - video_id)['response']['media_url'] + audio_id)['response']['media_url'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) @@ -44,7 +106,7 @@ class HungamaIE(InfoExtractor): thumbnail = data.get('img_src') or data.get('album_image') return { - 'id': video_id, + 'id': audio_id, 'title': title, 'thumbnail': thumbnail, 'track': track, From 6089ff40e7cc7710e399db1be87fea103a190ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 Jan 2019 00:37:01 +0700 Subject: [PATCH 129/226] [youporn] Fix title and description extraction (closes #18748) --- youtube_dl/extractor/youporn.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index ea0bce784..d4eccb4b2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -68,11 +68,9 @@ class YouPornIE(InfoExtractor): request.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(request, display_id) - title = self._search_regex( - [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P(?:(?!\1).)+)\1', - r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'], - webpage, 'title', group='title', - default=None) or self._og_search_title( + title = self._html_search_regex( + r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', + webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'title', webpage, fatal=True) @@ -134,7 +132,11 @@ class YouPornIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', + default=None) or self._og_search_description( + webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') From 3c1089dba41f137dcc6c373daacdc19a24aef81c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 9 Jan 2019 14:23:26 +0100 Subject: [PATCH 130/226] [gaia] Add new extractor(#14605) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gaia.py | 98 ++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 youtube_dl/extractor/gaia.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ddeb70284..c7d04d366 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -411,6 +411,7 @@ from .funk import ( from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .fxnetworks import FXNetworksIE +from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gameone import ( GameOneIE, diff --git a/youtube_dl/extractor/gaia.py b/youtube_dl/extractor/gaia.py new file mode 100644 index 000000000..f2eef3f4c --- /dev/null +++ b/youtube_dl/extractor/gaia.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + strip_or_none, + try_get, +) + + +class GaiaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)' + _TESTS = [{ + 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature', + 'info_dict': { + 'id': '89356', + 'ext': 'mp4', + 'title': 'Connecting with Universal Consciousness', + 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', + 'upload_date': '20151116', + 'timestamp': 1447707266, + 'duration': 936, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview', + 'info_dict': { + 'id': '89351', + 'ext': 'mp4', + 'title': 'Connecting with Universal Consciousness', + 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', + 'upload_date': '20151116', + 'timestamp': 1447707266, + 'duration': 53, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id, vtype = re.search(self._VALID_URL, url).groups() + node_id = self._download_json( + 'https://brooklyn.gaia.com/pathinfo', display_id, query={ + 'path': 'video/' + display_id, + })['id'] + node = self._download_json( + 'https://brooklyn.gaia.com/node/%d' % node_id, node_id) + vdata = node[vtype] + media_id = compat_str(vdata['nid']) + title = node['title'] + + media = self._download_json( + 'https://brooklyn.gaia.com/media/' + media_id, media_id) + formats = self._extract_m3u8_formats( + media['mediaUrls']['bcHLS'], media_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + text_tracks = media.get('textTracks', {}) + for key in ('captions', 'subtitles'): + for lang, sub_url in text_tracks.get(key, {}).items(): + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + fivestar = node.get('fivestar', {}) + fields = node.get('fields', {}) + + def get_field_value(key, value_key='value'): + return try_get(fields, lambda x: x[key][0][value_key]) + + return { + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(get_field_value('body') or get_field_value('teaser')), + 'timestamp': int_or_none(node.get('created')), + 'subtitles': subtitles, + 'duration': int_or_none(vdata.get('duration')), + 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])), + 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])), + 'comment_count': int_or_none(node.get('comment_count')), + 'series': try_get(node, lambda x: x['series']['title'], compat_str), + 'season_number': int_or_none(get_field_value('season')), + 'season_id': str_or_none(get_field_value('series_nid', 'nid')), + 'episode_number': int_or_none(get_field_value('episode')), + } From 65615be36874fe98d5a972546e8024cda3879c86 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 9 Jan 2019 19:17:58 +0100 Subject: [PATCH 131/226] [globo] set GLBID cookie manually(closes #17346) --- youtube_dl/extractor/globo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c2140c362..fb8f7679b 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -72,7 +72,7 @@ class GloboIE(InfoExtractor): return try: - self._download_json( + glb_id = (self._download_json( 'https://login.globo.com/api/authentication', None, data=json.dumps({ 'payload': { 'email': email, @@ -81,7 +81,9 @@ class GloboIE(InfoExtractor): }, }).encode(), headers={ 'Content-Type': 'application/json; charset=utf-8', - }) + }) or {}).get('glbId') + if glb_id: + self._set_cookie('.globo.com', 'GLBID', glb_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json(e.cause.read(), None) From 4ad159c7b0ce476171aad1eaa8a36f7b9b1cce65 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 9 Jan 2019 20:39:48 +0100 Subject: [PATCH 132/226] [playplustv] Add new extractor(closes #18789) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/playplustv.py | 109 +++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 youtube_dl/extractor/playplustv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c7d04d366..7d7e4247a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -861,6 +861,7 @@ from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE from .plays import PlaysTVIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py new file mode 100644 index 000000000..419c2974b --- /dev/null +++ b/youtube_dl/extractor/playplustv.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.tv/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': email, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = re.match(self._VALID_URL, url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } From 96c186e1fd8af4943eeaa8c858f9cb0f15e8b265 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 Jan 2019 09:05:00 +0100 Subject: [PATCH 133/226] [fox] add support National Geographic(closes #17985)(closes #15333)(closes #14698) --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/fox.py | 93 +++++++------- youtube_dl/extractor/nationalgeographic.py | 135 --------------------- 3 files changed, 48 insertions(+), 186 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7d7e4247a..a01d99b85 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -687,11 +687,7 @@ from .myvi import ( MyviEmbedIE, ) from .myvidster import MyVidsterIE -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicIE, - NationalGeographicEpisodeGuideIE, -) +from .nationalgeographic import NationalGeographicVideoIE from .naver import NaverIE from .nba import NBAIE from .nbc import ( diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 11d6c9c32..b1c91f095 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -1,11 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +# import json +# import uuid + from .adobepass import AdobePassIE -from .uplynk import UplynkPreplayIE -from ..compat import compat_str from ..utils import ( - HEADRequest, int_or_none, parse_age_limit, parse_duration, @@ -16,7 +16,7 @@ from ..utils import ( class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?(?:fox\.com|nationalgeographic\.com/tv)/watch/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ -43,41 +43,47 @@ class FOXIE(AdobePassIE): # episode, geo-restricted, tv provided required 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', 'only_matching': True, + }, { + 'url': 'https://www.nationalgeographic.com/tv/watch/f690e05ebbe23ab79747becd0cc223d1/', + 'only_matching': True, }] + # _access_token = None + + # def _call_api(self, path, video_id, data=None): + # headers = { + # 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd', + # } + # if self._access_token: + # headers['Authorization'] = 'Bearer ' + self._access_token + # return self._download_json( + # 'https://api2.fox.com/v2.0/' + path, video_id, data=data, headers=headers) + + # def _real_initialize(self): + # self._access_token = self._call_api( + # 'login', None, json.dumps({ + # 'deviceId': compat_str(uuid.uuid4()), + # }).encode())['accessToken'] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + 'https://api.fox.com/fbc-content/v1_5/video/%s' % video_id, video_id, headers={ 'apikey': 'abdcbed02c124d393b39e818a4312055', 'Content-Type': 'application/json', 'Referer': url, }) + # video = self._call_api('vodplayer/' + video_id, video_id) title = video['name'] release_url = video['videoRelease']['url'] - - description = video.get('description') - duration = int_or_none(video.get('durationInSeconds')) or int_or_none( - video.get('duration')) or parse_duration(video.get('duration')) - timestamp = unified_timestamp(video.get('datePublished')) - rating = video.get('contentRating') - age_limit = parse_age_limit(rating) + # release_url = video['url'] data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} - creator = data.get('brand') or data.get('network') or video.get('network') - - series = video.get('seriesName') or data.get( - 'seriesName') or data.get('show') - season_number = int_or_none(video.get('seasonNumber')) - episode = video.get('name') - episode_number = int_or_none(video.get('episodeNumber')) - release_year = int_or_none(video.get('releaseYear')) - + rating = video.get('contentRating') if data.get('authRequired'): resource = self._get_mvpd_resource( 'fbc-fox', title, video.get('guid'), rating) @@ -86,6 +92,18 @@ class FOXIE(AdobePassIE): 'auth': self._extract_mvpd_auth( url, video_id, 'fbc-fox', resource) }) + m3u8_url = self._download_json(release_url, video_id)['playURL'] + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) + creator = data.get('brand') or data.get('network') or video.get('network') + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') subtitles = {} for doc_rel in video.get('documentReleases', []): @@ -98,36 +116,19 @@ class FOXIE(AdobePassIE): }] break - info = { + return { 'id': video_id, 'title': title, - 'description': description, + 'formats': formats, + 'description': video.get('description'), 'duration': duration, 'timestamp': timestamp, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(rating), 'creator': creator, 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'release_year': release_year, + 'season_number': int_or_none(video.get('seasonNumber')), + 'episode': video.get('name'), + 'episode_number': int_or_none(video.get('episodeNumber')), + 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } - - urlh = self._request_webpage(HEADRequest(release_url), video_id) - video_url = compat_str(urlh.geturl()) - - if UplynkPreplayIE.suitable(video_url): - info.update({ - '_type': 'url_transparent', - 'url': video_url, - 'ie_key': UplynkPreplayIE.ie_key(), - }) - else: - m3u8_url = self._download_json(release_url, video_id)['playURL'] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) - info['formats'] = formats - return info diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 4d2ee6408..165964ca0 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,15 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .adobepass import AdobePassIE -from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, url_basename, - update_url_query, - get_element_by_class, ) @@ -64,132 +58,3 @@ class NationalGeographicVideoIE(InfoExtractor): {'force_smil_url': True}), 'id': guid, } - - -class NationalGeographicIE(ThePlatformIE, AdobePassIE): - IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P<id>[^/?]+)' - - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', - 'md5': '518c9aa655686cf81493af5cc21e2a04', - 'info_dict': { - 'id': 'vKInpacll2pC', - 'ext': 'mp4', - 'title': 'Uncovering a Universal Knowledge', - 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', - 'timestamp': 1458680907, - 'upload_date': '20160322', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], - }, - { - 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', - 'md5': 'c4912f656b4cbe58f3e000c489360989', - 'info_dict': { - 'id': 'Pok5lWCkiEFA', - 'ext': 'mp4', - 'title': 'The Stunning Red Bird of Paradise', - 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', - 'timestamp': 1459362152, - 'upload_date': '20160330', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], - }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - release_url = self._search_regex( - r'video_auth_playlist_url\s*=\s*"([^"]+)"', - webpage, 'release url') - theplatform_path = self._search_regex(r'https?://link\.theplatform\.com/s/([^?]+)', release_url, 'theplatform path') - video_id = theplatform_path.split('/')[-1] - query = { - 'mbr': 'true', - } - is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) - if is_auth == 'auth': - auth_resource_id = self._search_regex( - r"video_auth_resourceId\s*=\s*'([^']+)'", - webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, video_id, 'natgeo', auth_resource_id) - - formats = [] - subtitles = {} - for key, value in (('switch', 'http'), ('manifest', 'm3u')): - tp_query = query.copy() - tp_query.update({ - key: value, - }) - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(release_url, tp_query), video_id, 'Downloading %s SMIL data' % value) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self._extract_theplatform_metadata(theplatform_path, display_id) - info.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'display_id': display_id, - }) - return info - - -class NationalGeographicEpisodeGuideIE(InfoExtractor): - IE_NAME = 'natgeo:episodeguide' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P<id>[^/]+)/episode-guide' - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', - 'info_dict': { - 'id': 'the-story-of-god-with-morgan-freeman-season-1', - 'title': 'The Story of God with Morgan Freeman - Season 1', - }, - 'playlist_mincount': 6, - }, - { - 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', - 'info_dict': { - 'id': 'underworld-inc-season-2', - 'title': 'Underworld, Inc. - Season 2', - }, - 'playlist_mincount': 7, - }, - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - show = get_element_by_class('show', webpage) - selected_season = self._search_regex( - r'<div[^>]+class="select-seasons[^"]*".*?<a[^>]*>(.*?)</a>', - webpage, 'selected season') - entries = [ - self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') - for entry_url in re.findall('(?s)<div[^>]+class="col-inner"[^>]*?>.*?<a[^>]+href="([^"]+)"', webpage)] - return self.playlist_result( - entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), - '%s - %s' % (show, selected_season)) From 7c072f00d66f3b3f44d301059443480a5c83d3c0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 Jan 2019 10:50:18 +0100 Subject: [PATCH 134/226] [jwplatform] use JW Platform Delivery API V2 and add support for more urls --- youtube_dl/extractor/jwplatform.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 63d0dc998..d19a6a774 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,8 +7,8 @@ from .common import InfoExtractor class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' - _TEST = { + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', 'info_dict': { @@ -19,7 +19,10 @@ class JWPlatformIE(InfoExtractor): 'upload_date': '20081127', 'timestamp': 1227796140, } - } + }, { + 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', + 'only_matching': True, + }] @staticmethod def _extract_url(webpage): @@ -34,5 +37,5 @@ class JWPlatformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id) return self._parse_jwplayer_data(json_data, video_id) From 432aba1c5e68cf8efccb868107785a15617d5109 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 Jan 2019 10:54:46 +0100 Subject: [PATCH 135/226] [outsidetv] Add new extractor(closes #18774) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/outsidetv.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 youtube_dl/extractor/outsidetv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a01d99b85..de38c6641 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -829,6 +829,7 @@ from .orf import ( ORFOE1IE, ORFIPTVIE, ) +from .outsidetv import OutsideTVIE from .packtpub import ( PacktPubIE, PacktPubCourseIE, diff --git a/youtube_dl/extractor/outsidetv.py b/youtube_dl/extractor/outsidetv.py new file mode 100644 index 000000000..c5333b08c --- /dev/null +++ b/youtube_dl/extractor/outsidetv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OutsideTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?outsidetv\.com/(?:[^/]+/)*?play/[a-zA-Z0-9]{8}/\d+/\d+/(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [{ + 'url': 'http://www.outsidetv.com/category/snow/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'md5': '192d968fedc10b2f70ec31865ffba0da', + 'info_dict': { + 'id': 'Hdg0jukV', + 'ext': 'mp4', + 'title': 'Home - Jackson Ep 1 | Arbor Snowboards', + 'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd', + 'upload_date': '20181225', + 'timestamp': 1545742800, + } + }, { + 'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'only_matching': True, + }] + + def _real_extract(self, url): + jw_media_id = self._match_id(url) + return self.url_result( + 'jwplatform:' + jw_media_id, 'JWPlatform', jw_media_id) From c3e543893bfba7faa7c13e53fbe6b60f936b81f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Jan 2019 22:46:53 +0700 Subject: [PATCH 136/226] [youtube] Extract live HLS URL from player response (closes #18799) --- youtube_dl/extractor/youtube.py | 55 +++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 539ff6ff2..29773877e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1931,31 +1931,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'http_chunk_size': 10485760, } formats.append(dct) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) else: - error_message = clean_html(video_info.get('reason', [None])[0]) - if not error_message: - error_message = extract_unavailable_message() - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) or + url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') # uploader video_uploader = try_get( From a4491dd55c5feaf02ac8969caafd637265cbeede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Jan 2019 23:23:19 +0700 Subject: [PATCH 137/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ChangeLog b/ChangeLog index 41d190a72..c7d090c6c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +version <unreleased> + +Core +* [extractor/common] Use episode name as title in _json_ld ++ [extractor/common] Add support for movies in _json_ld +* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes + (#18765) ++ [utils] Add language codes replaced in 1989 revision of ISO 639 + to ISO639Utils (#18765) + +Extractors +* [youtube] Extract live HLS URL from player response (#18799) ++ [outsidetv] Add support for outsidetv.com (#18774) +* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs ++ [fox] Add support National Geographic (#17985, #15333, #14698) ++ [playplustv] Add support for playplus.tv (#18789) +* [globo] Set GLBID cookie manually (#17346) ++ [gaia] Add support for gaia.com (#14605) +* [youporn] Fix title and description extraction (#18748) ++ [hungama] Add support for hungama.com (#17402, #18771) +* [dtube] Fix extraction (#18741) +* [tvnow] Fix and rework extractors and prepare for a switch to the new API + (#17245, #18499) +* [carambatv:page] Fix extraction (#18739) + + version 2019.01.02 Extractors From b64f6e690ff84052751c593a8826a40a63cb1d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Jan 2019 23:26:54 +0700 Subject: [PATCH 138/226] release 2019.01.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 11 ++++++++--- youtube_dl/version.py | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3986bbc8e..d6abdbcb7 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.02 +[debug] youtube-dl version 2019.01.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index c7d090c6c..3d60754c5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.10 Core * [extractor/common] Use episode name as title in _json_ld diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e92064432..c01409419 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -320,6 +320,7 @@ - **Fusion** - **Fux** - **FXNetworks** + - **Gaia** - **GameInformer** - **GameOne** - **gameone:playlist** @@ -370,6 +371,8 @@ - **HRTiPlaylist** - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post + - **Hungama** + - **HungamaSong** - **Hypem** - **Iconosquare** - **ign.com** @@ -540,8 +543,6 @@ - **MyviEmbed** - **MyVisionTV** - **n-tv.de** - - **natgeo** - - **natgeo:episodeguide** - **natgeo:video** - **Naver** - **NBA** @@ -642,6 +643,7 @@ - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek - **OsnatelTV** + - **OutsideTV** - **PacktPub** - **PacktPubCourse** - **PandaTV**: 熊猫TV @@ -666,6 +668,7 @@ - **Pinkbike** - **Pladform** - **play.fm** + - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -934,7 +937,9 @@ - **TVNet** - **TVNoe** - **TVNow** - - **TVNowList** + - **TVNowAnnual** + - **TVNowNew** + - **TVNowSeason** - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 07a444706..5ba61f489 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.02' +__version__ = '2019.01.10' From c469e8808c723d6ba090c0b33c292622a3cb555e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 Jan 2019 18:48:52 +0100 Subject: [PATCH 139/226] [playplustv] add support for playplus.com(#18789) --- youtube_dl/extractor/playplustv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py index 419c2974b..1e30ab23a 100644 --- a/youtube_dl/extractor/playplustv.py +++ b/youtube_dl/extractor/playplustv.py @@ -15,7 +15,7 @@ from ..utils import ( class PlayPlusTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playplus\.tv/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' _TEST = { 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', From a64646e417615173c9436887116402c9c4c9f6ee Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 11 Jan 2019 15:09:44 +0100 Subject: [PATCH 140/226] [postprocessor/ffmpeg] sanitize ffmpeg version for Ubuntu and Arch Linux systems(closes #18813) --- youtube_dl/postprocessor/ffmpeg.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index efcd39d57..173bdbe68 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -79,6 +79,19 @@ class FFmpegPostProcessor(PostProcessor): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] prefer_ffmpeg = True + def get_ffmpeg_version(path): + ver = get_exe_version(path, args=['-version']) + if ver: + regexs = [ + r'([0-9.]+)-0ubuntu0\.[0-9.]+$', # Ubuntu + r'n([0-9.]+)$', # Arch Linux + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + return ver + self.basename = None self.probe_basename = None @@ -110,11 +123,10 @@ class FFmpegPostProcessor(PostProcessor): self._paths = dict( (p, os.path.join(location, p)) for p in programs) self._versions = dict( - (p, get_exe_version(self._paths[p], args=['-version'])) - for p in programs) + (p, get_ffmpeg_version(self._paths[p])) for p in programs) if self._versions is None: self._versions = dict( - (p, get_exe_version(p, args=['-version'])) for p in programs) + (p, get_ffmpeg_version(p)) for p in programs) self._paths = dict((p, p) for p in programs) if prefer_ffmpeg is False: From 5caa531a1a7e070e3bef8cc60c3d00c3f7ab1e6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 11 Jan 2019 23:47:23 +0700 Subject: [PATCH 141/226] [postprocessor/ffmpeg] PEP 8 --- youtube_dl/postprocessor/ffmpeg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 173bdbe68..a9fafa363 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -83,8 +83,8 @@ class FFmpegPostProcessor(PostProcessor): ver = get_exe_version(path, args=['-version']) if ver: regexs = [ - r'([0-9.]+)-0ubuntu0\.[0-9.]+$', # Ubuntu - r'n([0-9.]+)$', # Arch Linux + r'([0-9.]+)-0ubuntu0\.[0-9.]+$', # Ubuntu + r'n([0-9.]+)$', # Arch Linux ] for regex in regexs: mobj = re.match(regex, ver) From cbdc688c41876714f1d7e2f82ba9d11f6695448a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 12 Jan 2019 00:30:06 +0700 Subject: [PATCH 142/226] [postprocessor/ffmpeg] Relax ubuntu ffmpeg version regex --- youtube_dl/postprocessor/ffmpeg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a9fafa363..39a905380 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -83,8 +83,9 @@ class FFmpegPostProcessor(PostProcessor): ver = get_exe_version(path, args=['-version']) if ver: regexs = [ - r'([0-9.]+)-0ubuntu0\.[0-9.]+$', # Ubuntu + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] r'n([0-9.]+)$', # Arch Linux + # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ ] for regex in regexs: mobj = re.match(regex, ver) From 60a899bb7ed583b4c40289925c8127e746c683ea Mon Sep 17 00:00:00 2001 From: Atlas Sullivan <AtlasJan@gmx.com> Date: Fri, 11 Jan 2019 19:15:48 +0000 Subject: [PATCH 143/226] [README.md] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 901595444..70bcfaccf 100644 --- a/README.md +++ b/README.md @@ -496,7 +496,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title From ed8db0a25c93f5611dfae81ee8f15cbc177bd0ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 12 Jan 2019 04:56:17 +0700 Subject: [PATCH 144/226] [wistia] Extend _VALID_URL (closes #18823) --- youtube_dl/extractor/wistia.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 01a51275e..fa142b974 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/iframe/)(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]+)' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -38,6 +38,9 @@ class WistiaIE(InfoExtractor): }, { 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', + 'only_matching': True, }] @staticmethod From d65f6e734b5c00acbb396f4569907f7957e1cbef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 13 Jan 2019 03:57:31 +0700 Subject: [PATCH 145/226] [bitchute] Check formats (#18833) --- youtube_dl/extractor/bitchute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index aa034355a..4f39424f5 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -55,6 +55,7 @@ class BitChuteIE(InfoExtractor): formats = [ {'url': format_url} for format_url in orderedSet(format_urls)] + self._check_formats(formats, video_id) self._sort_formats(formats) description = self._html_search_regex( From f1ab3b7de7667c0382bf6eb6364ccab4260c0654 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 13 Jan 2019 10:01:26 +0100 Subject: [PATCH 146/226] [downloader/hls] fix uplynk ad skipping(closes #18824) --- youtube_dl/downloader/hls.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index fd304527e..4def8e2d5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -75,10 +75,14 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - def is_ad_fragment(s): + def is_ad_fragment_start(s): return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s or + s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + media_frags = 0 ad_frags = 0 ad_frag_next = False @@ -87,12 +91,13 @@ class HlsFD(FragmentFD): if not line: continue if line.startswith('#'): - if is_ad_fragment(line): - ad_frags += 1 + if is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False continue if ad_frag_next: - ad_frag_next = False + ad_frags += 1 continue media_frags += 1 @@ -123,7 +128,6 @@ class HlsFD(FragmentFD): if line: if not line.startswith('#'): if ad_frag_next: - ad_frag_next = False continue frag_index += 1 if frag_index <= ctx['fragment_index']: @@ -196,8 +200,10 @@ class HlsFD(FragmentFD): 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } - elif is_ad_fragment(line): + elif is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False self._finish_frag_download(ctx) From 3b983ee471305946709dcb83fa3799fc26a7db03 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 13 Jan 2019 15:46:54 +0100 Subject: [PATCH 147/226] [curiositystream] add support for non app urls --- youtube_dl/extractor/curiositystream.py | 56 +++++++++++++------------ 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 35b1e7a34..e4a7fca6c 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -46,8 +46,24 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) self._auth_token = result['message']['auth_token'] - def _extract_media_info(self, media): - video_id = compat_str(media['id']) + +class CuriosityStreamIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'md5': '262bb2f257ff301115f1973540de8983', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('media/' + video_id, video_id) title = media['title'] formats = [] @@ -114,38 +130,21 @@ class CuriosityStreamBaseIE(InfoExtractor): } -class CuriosityStreamIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream' - _VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)' - _TEST = { - 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', - 'info_dict': { - 'id': '2', - 'ext': 'mp4', - 'title': 'How Did You Develop The Internet?', - 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - return self._extract_media_info(media) - - class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://app.curiositystream.com/collection/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 12, - } + 'playlist_mincount': 17, + }, { + 'url': 'https://curiositystream.com/series/2', + 'only_matching': True, + }] def _real_extract(self, url): collection_id = self._match_id(url) @@ -153,7 +152,10 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'collections/' + collection_id, collection_id) entries = [] for media in collection.get('media', []): - entries.append(self._extract_media_info(media)) + media_id = compat_str(media.get('id')) + entries.append(self.url_result( + 'https://curiositystream.com/video/' + media_id, + CuriosityStreamIE.ie_key(), media_id)) return self.playlist_result( entries, collection_id, collection.get('title'), collection.get('description')) From 10026329c2534635876921da229f382098b8f8e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 14 Jan 2019 23:23:51 +0700 Subject: [PATCH 148/226] [skylinewebcams] Fix extraction (closes #18853) --- youtube_dl/extractor/skylinewebcams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/skylinewebcams.py b/youtube_dl/extractor/skylinewebcams.py index 5b4aaac6f..b7f8ac736 100644 --- a/youtube_dl/extractor/skylinewebcams.py +++ b/youtube_dl/extractor/skylinewebcams.py @@ -26,7 +26,7 @@ class SkylineWebcamsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) stream_url = self._search_regex( - r'url\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, + r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, 'stream url', group='url') title = self._og_search_title(webpage) From 929ba3997b026fd9fafe15d447732d8ccdf367bb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 15 Jan 2019 10:23:59 +0100 Subject: [PATCH 149/226] [funimation] fix extraction(closes #14089) --- youtube_dl/extractor/funimation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 07d01caec..8bbedca26 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import random +import string + from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -87,7 +90,7 @@ class FunimationIE(InfoExtractor): video_id = title_data.get('id') or self._search_regex([ r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", - r'<iframe[^>]+src="/player/(\d+)"', + r'<iframe[^>]+src="/player/(\d+)', ], webpage, 'video_id', default=None) if not video_id: player_url = self._html_search_meta([ @@ -108,8 +111,10 @@ class FunimationIE(InfoExtractor): if self._TOKEN: headers['Authorization'] = 'Token %s' % self._TOKEN sources = self._download_json( - 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, - video_id, headers=headers)['items'] + 'https://www.funimation.com/api/showexperience/%s/' % video_id, + video_id, headers=headers, query={ + 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + })['items'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read(), video_id)['errors'][0] From 561b456e2d10e60b74a1bee6154bb7fdb0bfaf60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 01:12:58 +0700 Subject: [PATCH 150/226] [youtube] Extract DASH formats from player response (closes #18804) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 29773877e..cca149107 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1545,6 +1545,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + is_live = None view_count = None @@ -1602,6 +1609,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if isinstance(pl_response, dict): player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH @@ -1633,6 +1641,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): pl_response = get_video_info.get('player_response', [None])[0] if isinstance(pl_response, dict): player_response = pl_response + add_dash_mpd_pr(player_response) add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) From 2f483bc1c389709623117079439708783122b5ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 01:28:50 +0700 Subject: [PATCH 151/226] [youtube] Skip unsupported adaptive stream type (#18804) --- youtube_dl/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cca149107..5e93b5329 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1827,6 +1827,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url_data = compat_parse_qs(url_data_str) if 'itag' not in url_data or 'url' not in url_data: continue + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: + continue format_id = url_data['itag'][0] url = url_data['url'][0] From a16c7c033a161f310b69d444edc9dbf67cfc49ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 02:17:49 +0700 Subject: [PATCH 152/226] [test/helper] Add support for maxcount and count collection len test checkers --- test/helper.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/test/helper.py b/test/helper.py index aa9a1c9b2..e62aab11e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -153,15 +153,27 @@ def expect_value(self, got, expected, field): isinstance(got, compat_str), 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected): self.assertTrue( isinstance(got, (list, dict)), 'Expected field %s to be a list or a dict, but it is of type %s' % ( field, type(got).__name__)) - expected_num = int(expected.partition(':')[2]) - assertGreaterEqual( + op, _, expected_num = expected.partition(':') + expected_num = int(expected_num) + if op == 'mincount': + assert_func = assertGreaterEqual + msg_tmpl = 'Expected %d items in field %s, but only got %d' + elif op == 'maxcount': + assert_func = assertLessEqual + msg_tmpl = 'Expected maximum %d items in field %s, but got %d' + elif op == 'count': + assert_func = assertEqual + msg_tmpl = 'Expected exactly %d items in field %s, but got %d' + else: + assert False + assert_func( self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + msg_tmpl % (expected_num, field, len(got))) return self.assertEqual( expected, got, @@ -237,6 +249,20 @@ def assertGreaterEqual(self, got, expected, msg=None): self.assertTrue(got >= expected, msg) +def assertLessEqual(self, got, expected, msg=None): + if not (got <= expected): + if msg is None: + msg = '%r not less than or equal to %r' % (got, expected) + self.assertTrue(got <= expected, msg) + + +def assertEqual(self, got, expected, msg=None): + if not (got == expected): + if msg is None: + msg = '%r not equal to %r' % (got, expected) + self.assertTrue(got == expected, msg) + + def expect_warnings(ydl, warnings_re): real_warning = ydl.report_warning From 4fe54c128a11d394874505af75aaa5a2276aa3ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 02:18:27 +0700 Subject: [PATCH 153/226] [youtube] Update tests and add a tests for #18804 --- youtube_dl/extractor/youtube.py | 57 +++++++++++++++++---------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5e93b5329..730935657 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -498,7 +498,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -527,7 +526,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'license': 'Standard YouTube License', 'creator': 'Icona Pop', 'track': 'I Love It (feat. Charli XCX)', 'artist': 'Icona Pop', @@ -540,14 +538,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', 'alt_title': 'Tunnel Vision', - 'description': 'md5:64249768eec3bc4276236606ea996373', + 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', 'track': 'Tunnel Vision', 'artist': 'Justin Timberlake', @@ -566,7 +563,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SET India', 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'license': 'Standard YouTube License', 'age_limit': 18, } }, @@ -581,7 +577,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', - 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -605,7 +600,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', - 'license': 'Standard YouTube License', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -620,13 +614,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', - 'license': 'Standard YouTube License', }, 'params': { 'youtube_include_dash_manifest': True, @@ -640,13 +633,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'alt_title': 'Shake It Off', - 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'description': 'md5:bec2185232c05479482cb5a9b82719bf', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'license': 'Standard YouTube License', 'creator': 'Taylor Swift', }, 'params': { @@ -662,10 +653,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'TJ Kirk', + 'uploader': 'Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -683,7 +673,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -692,7 +681,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', 'duration': 246, @@ -700,7 +689,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', - 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -718,7 +706,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', - 'license': 'Standard YouTube License', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, @@ -736,7 +723,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', @@ -758,7 +744,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', - 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, @@ -792,7 +777,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', - 'license': 'Standard YouTube License', 'formats': 'mincount:31', }, 'skip': 'not actual anymore', @@ -808,7 +792,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Airtek', 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'license': 'Standard YouTube License', 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', }, 'params': { @@ -881,6 +864,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) @@ -917,7 +901,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'license': 'Standard YouTube License', 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', 'track': 'Dark Walk - Position Music', 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', @@ -1021,13 +1004,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:25b78d2f64ae81719f5c96319889b736', + 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', 'uploader_id': 'Vsauce', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'license': 'Standard YouTube License', 'series': 'Mind Field', 'season_number': 1, 'episode_number': 1, @@ -1053,7 +1035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'New Century Foundation', 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - 'license': 'Standard YouTube License', }, 'params': { 'skip_download': True, @@ -1081,6 +1062,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # DRM protected 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', 'only_matching': True, + }, + { + # Video with unsupported adaptive stream type formats + 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', + 'info_dict': { + 'id': 'Z4Vy8R84T1U', + 'ext': 'mp4', + 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 433, + 'upload_date': '20130923', + 'uploader': 'Amelia Putri Harwita', + 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', + 'formats': 'maxcount:10', + }, + 'params': { + 'skip_download': True, + 'youtube_include_dash_manifest': False, + }, } ] From b0d73a745656443baf3e2134d9ca03bd3a4a934c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 02:20:10 +0700 Subject: [PATCH 154/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 3d60754c5..d25169074 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core ++ [test/helper] Add support for maxcount and count collection len checkers +* [downloader/hls] Fix uplynk ad skipping (#18824) +* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) + +Extractors +* [youtube] Skip unsupported adaptive stream type (#18804) ++ [youtube] Extract DASH formats from player response (#18804) +* [funimation] Fix extraction (#14089) +* [skylinewebcams] Fix extraction (#18853) ++ [curiositystream] Add support for non app URLs ++ [bitchute] Check formats (#18833) +* [wistia] Extend URL regular expression (#18823) ++ [playplustv] Add support for playplus.com (#18789) + + version 2019.01.10 Core From bfc8eeea57251796d44f1e10b61d06760690bc25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Jan 2019 02:24:08 +0700 Subject: [PATCH 155/226] release 2019.01.16 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d6abdbcb7..650f78511 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.16** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.10 +[debug] youtube-dl version 2019.01.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d25169074..13019bf2b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.16 Core + [test/helper] Add support for maxcount and count collection len checkers diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5ba61f489..c13f3a38a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.10' +__version__ = '2019.01.16' From fa4ac365f69cbd51e4c9801984ebea49a12825b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Jan 2019 10:24:44 +0700 Subject: [PATCH 156/226] [youtube] Extend JS player signature function name regexes (closes #18890, closes #18891, closes #18893) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 730935657..c8bf98b58 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1198,8 +1198,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') From f53cecd796dbb698abbde6ac2f7f973dba78f8a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Jan 2019 10:25:50 +0700 Subject: [PATCH 157/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 13019bf2b..171034a75 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version <unreleased> + +Extractors +* [youtube] Extend JS player signature function name regular expressions + (#18890, #18891, #18893) + + version 2019.01.16 Core From 29639b363ddab7903ceae096912a0227c8017533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Jan 2019 10:27:17 +0700 Subject: [PATCH 158/226] release 2019.01.17 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 650f78511..841bca914 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.16*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.17** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.16 +[debug] youtube-dl version 2019.01.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 171034a75..902301765 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.17 Extractors * [youtube] Extend JS player signature function name regular expressions diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c13f3a38a..ea3f62928 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.16' +__version__ = '2019.01.17' From 79fec976b0c250446ea9a9eb7323fb2045ee37fe Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Jan 2019 09:44:08 +0100 Subject: [PATCH 159/226] [vimeo] fix extraction for password protected player URLs(closes #18889) --- youtube_dl/extractor/vimeo.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5e15f060b..fd37f919b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import json import re import itertools @@ -392,6 +393,22 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://player.vimeo.com/video/68375962', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', + 'title': 'youtube-dl password protected test video', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, + }, + 'params': { + 'videopassword': 'youtube-dl', + }, + }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', 'only_matching': True, @@ -452,7 +469,9 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata({'password': password}) + data = urlencode_postdata({ + 'password': base64.b64encode(password.encode()), + }) pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') From e2dd132f054df5b6c09b7c274752a77d8ba44f8d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Jan 2019 23:56:37 +0100 Subject: [PATCH 160/226] [cartoonnetwork] fix extraction(closes #15664)(closes #17224) --- youtube_dl/extractor/cartoonnetwork.py | 56 +++++++++++++++++--------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 6aeebd7b3..48b33617f 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .turner import TurnerBaseIE +from ..utils import int_or_none class CartoonNetworkIE(TurnerBaseIE): _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' _TEST = { - 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', 'info_dict': { - 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', 'ext': 'mp4', - 'title': 'Starfire the Cat Lady', - 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + 'title': 'How to Draw Upgrade', + 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', }, 'params': { # m3u8 download @@ -25,18 +24,39 @@ class CartoonNetworkIE(TurnerBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() - query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id - return self._extract_cvp_info( - 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', - 'tokenizer_src': 'https://token.vgtf.net/token/token_mobile', - }, - }, { + + def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): + metadata_re = '' + if content_re: + metadata_re = r'|video_metadata\.content_' + content_re + return self._search_regex( + r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + webpage, name, fatal=fatal) + + media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) + title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) + + info = self._extract_ngtv_info( + media_id, {'networkId': 'cartoonnetwork'}, { 'url': url, 'site_name': 'CartoonNetwork', - 'auth_required': self._search_regex( - r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);', - webpage, 'auth required', default='false') == 'true', + 'auth_required': find_field('authType', 'auth type') != 'unauth', }) + + series = find_field( + 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'series': series, + 'episode': title, + }) + + for field in ('season', 'episode'): + field_name = field + 'Number' + info[field + '_number'] = int_or_none(find_field( + field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) + + return info From 2bfc1d9d68dec097fd8093dc0284dd0cd64beb2e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 19 Jan 2019 21:25:15 +0100 Subject: [PATCH 161/226] [extractor/common] imporove HLS video only format detection(closes #18923) --- youtube_dl/extractor/common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9e7febcad..af621b74b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1596,6 +1596,7 @@ class InfoExtractor(object): # References: # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 2. https://github.com/rg3/youtube-dl/issues/12211 + # 3. https://github.com/rg3/youtube-dl/issues/18923 # We should try extracting formats only from master playlists [1, 4.3.4], # i.e. playlists that describe available qualities. On the other hand @@ -1667,11 +1668,16 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF inorder to have the + # chance to detect video only formats when EXT-X-STREAM-INF tags + # precede EXT-X-MEDIA tags in HLS manifest such as [3]. + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) elif line.startswith('#') or not line.strip(): continue else: From f28363ad1fb45b4450ae6f09ff9aff8f93227e40 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 19 Jan 2019 21:25:53 +0100 Subject: [PATCH 162/226] [ted] correct acodec for http formats(#18923) --- youtube_dl/extractor/ted.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d3e4205f5..645942dfd 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -265,6 +265,8 @@ class TEDIE(InfoExtractor): 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) + if f.get('acodec') == 'none': + del f['acodec'] formats.append(f) audio_download = talk_info.get('audioDownload') From 379306ef55b64c966392c072b17a450831fec252 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 19 Jan 2019 21:35:02 +0100 Subject: [PATCH 163/226] [extractor/common] fix typo --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index af621b74b..6e36e6778 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1668,7 +1668,7 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id - # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF inorder to have the + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the # chance to detect video only formats when EXT-X-STREAM-INF tags # precede EXT-X-MEDIA tags in HLS manifest such as [3]. for line in m3u8_doc.splitlines(): From 2cc779f497ae20d6e0e28fc546a25723cfea631a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jan 2019 13:48:09 +0700 Subject: [PATCH 164/226] [YoutubeDL] Add negation support for string comparisons in format selection expressions (closes #18600, closes #18805) --- README.md | 3 ++- test/test_YoutubeDL.py | 46 +++++++++++++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 9 +++++--- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 70bcfaccf..886696015 100644 --- a/README.md +++ b/README.md @@ -667,13 +667,14 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate -Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields: +Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields: - `ext`: File extension - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format +Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f0f5a8470..df8994b84 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -239,6 +239,52 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_string_ops(self): + formats = [ + {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # equals (=) + ydl = YDL({'format': '[format_id=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not equal (!=) + ydl = YDL({'format': '[format_id!=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # starts with (^=) + ydl = YDL({'format': '[format_id^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not start with (!^=) + ydl = YDL({'format': '[format_id!^=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # ends with ($=) + ydl = YDL({'format': '[format_id$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not end with (!$=) + ydl = YDL({'format': '[format_id!$=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # contains (*=) + ydl = YDL({'format': '[format_id*=-]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not contain (!*=) + ydl = YDL({'format': '[format_id!*=-]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4493fd0e1..a827414dc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1063,21 +1063,24 @@ class YoutubeDL(object): if not m: STR_OPERATORS = { '=': operator.eq, - '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op + else: + op = str_op if not m: raise ValueError('Invalid filter specification %r' % filter_spec) From 4e58d9fabbfef80b2ecc0d20959bdf9dd3705e73 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sun, 20 Jan 2019 14:23:35 +0700 Subject: [PATCH 165/226] [README.md] Fix formatting --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 886696015..4ba982907 100644 --- a/README.md +++ b/README.md @@ -674,6 +674,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. From fc746c3fdd7d40935a11e72b5cb69a8aee840e94 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 20 Jan 2019 09:04:51 +0100 Subject: [PATCH 166/226] [test/test_InfoExtractor] add test for #18923 --- test/test_InfoExtractor.py | 59 ++++++++++++++++++++++++++++++- test/testdata/m3u8/ted_18923.m3u8 | 28 +++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 test/testdata/m3u8/ted_18923.m3u8 diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 06be72616..75fa0bbb7 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -497,7 +497,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1280, 'height': 720, }] - ) + ), + ( + # https://github.com/rg3/youtube-dl/issues/18923 + # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa + 'ted_18923', + 'http://hls.ted.com/talks/31241.m3u8', + [{ + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '600k-Audio', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '68', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '163', + 'acodec': 'none', + 'width': 320, + 'height': 180, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '481', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '769', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '984', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1255', + 'acodec': 'none', + 'width': 640, + 'height': 360, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1693', + 'acodec': 'none', + 'width': 853, + 'height': 480, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '2462', + 'acodec': 'none', + 'width': 1280, + 'height': 720, + }] + ), ] for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: diff --git a/test/testdata/m3u8/ted_18923.m3u8 b/test/testdata/m3u8/ted_18923.m3u8 new file mode 100644 index 000000000..52a27118b --- /dev/null +++ b/test/testdata/m3u8/ted_18923.m3u8 @@ -0,0 +1,28 @@ +#EXTM3U +#EXT-X-VERSION:4 +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360 +/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180 +/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480 +/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720 +/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES +/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b + +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400 From 15870747f02effba9376fd1a1500c32d36d1b811 Mon Sep 17 00:00:00 2001 From: aviperes <avipr24@gmail.com> Date: Sun, 20 Jan 2019 10:15:01 +0200 Subject: [PATCH 167/226] [odnoklassniki] Detect paid videos --- youtube_dl/extractor/odnoklassniki.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 190d8af4d..114b93c07 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -115,6 +115,10 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', 'only_matching': True, + }, { + # Paid video + 'url': 'https://ok.ru/video/954886983203', + 'only_matching': True, }] def _real_extract(self, url): @@ -244,6 +248,11 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'flv', }) + if not formats: + payment_info = metadata.get('paymentInfo') + if payment_info: + raise ExtractorError('This video is paid, subscribe to download it', expected=True) + self._sort_formats(formats) info['formats'] = formats From 31fbedc06a349bc555ab934588544f75734e3a55 Mon Sep 17 00:00:00 2001 From: jhwgh1968 <jhwgh1968@users.noreply.github.com> Date: Sun, 20 Jan 2019 09:10:46 +0000 Subject: [PATCH 168/226] [instagram] Add base extractor for playlists and tag extractor --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/instagram.py | 136 +++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de38c6641..24361def4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -494,7 +494,11 @@ from .ina import InaIE from .inc import IncIE from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE -from .instagram import InstagramIE, InstagramUserIE +from .instagram import ( + InstagramIE, + InstagramUserIE, + InstagramTagIE, +) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 7e0e838f0..ffd87b55f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -227,44 +227,37 @@ class InstagramIE(InfoExtractor): } -class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' - IE_DESC = 'Instagram user profile' - IE_NAME = 'instagram:user' - _TEST = { - 'url': 'https://instagram.com/porsche', - 'info_dict': { - 'id': 'porsche', - 'title': 'porsche', - }, - 'playlist_count': 5, - 'params': { - 'extract_flat': True, - 'skip_download': True, - 'playlistend': 5, - } - } +class InstagramPlaylistIE(InfoExtractor): + # A superclass for handling any kind of query based on GraphQL which + # results in a playlist. - _gis_tmpl = None + _gis_tmpl = None # used to cache GIS request type - def _entries(self, data): + def _parse_graphql(self, webpage, item_id): + # Reads a webpage and returns its GraphQL data. + return self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + item_id) + + def _extract_graphql(self, data, url): + # Parses GraphQL queries containing videos and generates a playlist. def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' - self._set_cookie('instagram.com', 'ig_pr', '1') - cursor = '' for page_num in itertools.count(1): - variables = json.dumps({ - 'id': uploader_id, + variables = { 'first': 12, 'after': cursor, - }) + } + variables.update(self._query_vars_for(data)) + variables = json.dumps(variables) if self._gis_tmpl: gis_tmpls = [self._gis_tmpl] @@ -276,21 +269,26 @@ class InstagramUserIE(InfoExtractor): '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), ] + # try all of the ways to generate a GIS query, and not only use the + # first one that works, but cache it for future requests for gis_tmpl in gis_tmpls: try: - media = self._download_json( + json_data = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-Instagram-GIS': hashlib.md5( ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), }, query={ - 'query_hash': '42323d64886122307be10013ad2dcc44', + 'query_hash': self._QUERY_HASH, 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + }) + media = self._parse_timeline_from(json_data) self._gis_tmpl = gis_tmpl break except ExtractorError as e: + # if it's an error caused by a bad query, and there are + # more GIS templates to try, ignore it and keep trying if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue @@ -348,14 +346,80 @@ class InstagramUserIE(InfoExtractor): break def _real_extract(self, url): - username = self._match_id(url) + user_or_tag = self._match_id(url) + webpage = self._download_webpage(url, user_or_tag) + data = self._parse_graphql(webpage, user_or_tag) - webpage = self._download_webpage(url, username) - - data = self._parse_json( - self._search_regex( - r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), - username) + self._set_cookie('instagram.com', 'ig_pr', '1') return self.playlist_result( - self._entries(data), username, username) + self._extract_graphql(data, url), user_or_tag, user_or_tag) + + +class InstagramUserIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + _TEST = { + 'url': 'https://instagram.com/porsche', + 'info_dict': { + 'id': 'porsche', + 'title': 'porsche', + }, + 'playlist_count': 5, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 5, + } + } + + _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['user']['edge_owner_to_timeline_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + } + + +class InstagramTagIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' + IE_DESC = 'Instagram hashtag search' + IE_NAME = 'instagram:tag' + _TEST = { + 'url': 'https://instagram.com/explore/tags/lolcats', + 'info_dict': { + 'id': 'lolcats', + 'title': 'lolcats', + }, + 'playlist_count': 50, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 50, + } + } + + _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['hashtag']['edge_hashtag_to_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'tag_name': + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + } From 6ca3fa898cd066422daea3d5be53efdae22187d8 Mon Sep 17 00:00:00 2001 From: yonaikerlol <lawlietrs7@gmail.com> Date: Sun, 20 Jan 2019 05:24:21 -0400 Subject: [PATCH 169/226] [streamango] Add support for fruithosts.net --- youtube_dl/extractor/streamango.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index fcaa5ac0b..efb259f96 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -14,7 +14,7 @@ from ..utils import ( class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', 'md5': 'e992787515a182f55e38fc97588d802a', @@ -38,6 +38,9 @@ class StreamangoIE(InfoExtractor): }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, + }, { + 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', + 'only_matching': True, }] def _real_extract(self, url): From 289ef490f77cb35c43d98b9d2ea4cf529c24895e Mon Sep 17 00:00:00 2001 From: Anthony Fok <foka@debian.org> Date: Sun, 30 Dec 2018 02:44:40 -0700 Subject: [PATCH 170/226] [hketv] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hketv.py | 185 +++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 youtube_dl/extractor/hketv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 24361def4..574a47e6d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -452,6 +452,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py new file mode 100644 index 000000000..b5790cdee --- /dev/null +++ b/youtube_dl/extractor/hketv.py @@ -0,0 +1,185 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + merge_dicts, + str_or_none, + str_to_int, + try_get, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class HKETVIE(InfoExtractor): + IE_NAME = 'hketv' + IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['HK'] + _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hkedcity.net/etv/resource/2932360618', + 'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7', + 'info_dict': { + 'id': '2932360618', + 'ext': 'mp4', + 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', + 'description': '本節目輯錄了「閱讀滿Fun嘉年華」和「二○一八響應世界閱讀日――悅愛閱讀・愈讀愈愛」的活動花絮,並由學者、作家、演藝界人士等,分享培養子女閱讀興趣和習慣的方法,以及呼籲大家一同分享閱讀的樂趣。', + 'upload_date': '20181024', + 'duration': 900, + 'subtitles': { + 'en': [{ + 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=en', + 'ext': 'srt', + }], + 'zh-Hant': [{ + 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=qmt', + 'ext': 'srt', + }], + } + }, + }, { + 'url': 'https://www.hkedcity.net/etv/resource/972641418', + 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', + 'info_dict': { + 'id': '972641418', + 'ext': 'mp4', + 'title': '衣冠楚楚 (天使系列之一)', + 'description': '天國仙境,有兩位可愛的天使小姐妹。她們對幾千年來天使衣著一成不變頗有不滿。她們下望人世間:只見人們穿著七彩繽紛、款式各異的服裝,漂亮極了。天使姐妹決定下凡考察衣著,以設計天使新裝。 下到人間,姐妹試穿各式各樣的衣著,引發連串奇特有趣的情節:她們穿著校服在街上閒逛時,被女警誤認為逃學而送回學校,到校後又被體育老師誤認為是新同學,匆匆忙忙換上運動服後在操場上大顯神通。她們穿著護士服在醫院散步時,又被誤認為當班護士,而投入追尋失蹤病童、治病救人的工作中去。姐妹倆還到過玩具店,與布娃娃們談論衣著。她們也去過服裝設計學校,被當成小模特兒而試穿各式服裝。最令姐妹倆興奮的是一場盛大的民族服裝表演會。身穿盛裝的十二個民族的少女在台上翩翩起舞,各種服飾七彩繽紛、美不勝收。姐妹們情不自禁地穿上民族服裝,小天使變成了少數民族姑娘……最後天使姐妹回到天上,對於天使究竟穿甚麼樣的衣服好,她們還是拿不定主意。 節目通過天使姐妹的奇特經歷,反復示範各式衣服鞋襪的正確讀音及談論衣著時的常用句式,並以盛大的民族服裝表演活動,帶出有關服裝的文化知識。內容豐富而饒有趣味。', + 'upload_date': '20070109', + 'duration': 907, + 'subtitles': {}, + }, + }] + + _CC_LANGS = { + '中文(繁體中文)': 'zh-Hant', + '中文(简体中文)': 'zh-Hans', + 'English': 'en', + 'Bahasa Indonesia': 'id', + '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi', + '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne', + 'Tagalog': 'tl', + '\u0e44\u0e17\u0e22': 'th', + '\u0627\u0631\u062f\u0648': 'ur', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('ed_title', webpage, fatal=True) + + file_id = self._html_search_regex(r'post_var\["file_id"\]\s*=\s*(.+?);', webpage, 'file ID') + curr_url = self._html_search_regex(r'post_var\["curr_url"\]\s*=\s*"(.+?)";', webpage, 'curr URL') + data = { + 'action': 'get_info', + 'curr_url': curr_url, + 'file_id': file_id, + 'video_url': file_id, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' + handler_url = _APPS_BASE_URL + '/media/play/handler.php' + + response = self._download_json( + handler_url, video_id, + data=urlencode_postdata(data), + headers=merge_dicts({'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) + + result = response['result'] + + formats = [] + subtitles = {} + + if response.get('success') and response.get('access'): + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = try_get(result, lambda x: x['playlist'][0], dict) + fmts = playlist0.get('sources') + for fmt in fmts: + file_path = fmt.get('file') + if file_path: + file_url = urljoin(_APPS_BASE_URL, file_path) + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + + label = fmt.get('label') + w = None + h = None + if label == 'HD': + h = 720 + elif label == 'SD': + h = 360 + if h: + if width and height: + w = h * width // height + else: + w = h * 4 // 3 + + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + + tracks = playlist0.get('tracks', []) + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(_APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get(track_label, track_label), []).append({ + 'url': self._proto_relative_url(track_url), + 'ext': 'srt', + }) + + else: + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted(msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error) + + # Likes + emotion = self._download_json( + 'https://emocounter.hkedcity.net/handler.php', + video_id, + data=urlencode_postdata({ + 'action': 'get_emotion', + 'data[bucket_id]': 'etv', + 'data[identifier]': video_id, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}, + fatal=False) + like_count = int_or_none(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count'])) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta('ed_date', webpage, fatal=False), day_first=False), + 'duration': int_or_none(result.get('length')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': urljoin(_APPS_BASE_URL, result.get('image')), + 'view_count': str_to_int(result.get('view_count')), + 'like_count': like_count, + } From 73c19aaa9f74e9383a7aaf0dfb3c608727d5b6b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jan 2019 17:42:25 +0700 Subject: [PATCH 171/226] [hketv] Improve and simplify (closes #18696) --- youtube_dl/extractor/hketv.py | 180 ++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py index b5790cdee..b57927fc1 100644 --- a/youtube_dl/extractor/hketv.py +++ b/youtube_dl/extractor/hketv.py @@ -8,8 +8,8 @@ from ..utils import ( ExtractorError, int_or_none, merge_dicts, + parse_count, str_or_none, - str_to_int, try_get, unified_strdate, urlencode_postdata, @@ -30,20 +30,12 @@ class HKETVIE(InfoExtractor): 'id': '2932360618', 'ext': 'mp4', 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', - 'description': '本節目輯錄了「閱讀滿Fun嘉年華」和「二○一八響應世界閱讀日――悅愛閱讀・愈讀愈愛」的活動花絮,並由學者、作家、演藝界人士等,分享培養子女閱讀興趣和習慣的方法,以及呼籲大家一同分享閱讀的樂趣。', + 'description': 'md5:d5286d05219ef50e0613311cbe96e560', 'upload_date': '20181024', 'duration': 900, - 'subtitles': { - 'en': [{ - 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=en', - 'ext': 'srt', - }], - 'zh-Hant': [{ - 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=qmt', - 'ext': 'srt', - }], - } + 'subtitles': 'count:2', }, + 'skip': 'Geo restricted to HK', }, { 'url': 'https://www.hkedcity.net/etv/resource/972641418', 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', @@ -51,11 +43,15 @@ class HKETVIE(InfoExtractor): 'id': '972641418', 'ext': 'mp4', 'title': '衣冠楚楚 (天使系列之一)', - 'description': '天國仙境,有兩位可愛的天使小姐妹。她們對幾千年來天使衣著一成不變頗有不滿。她們下望人世間:只見人們穿著七彩繽紛、款式各異的服裝,漂亮極了。天使姐妹決定下凡考察衣著,以設計天使新裝。 下到人間,姐妹試穿各式各樣的衣著,引發連串奇特有趣的情節:她們穿著校服在街上閒逛時,被女警誤認為逃學而送回學校,到校後又被體育老師誤認為是新同學,匆匆忙忙換上運動服後在操場上大顯神通。她們穿著護士服在醫院散步時,又被誤認為當班護士,而投入追尋失蹤病童、治病救人的工作中去。姐妹倆還到過玩具店,與布娃娃們談論衣著。她們也去過服裝設計學校,被當成小模特兒而試穿各式服裝。最令姐妹倆興奮的是一場盛大的民族服裝表演會。身穿盛裝的十二個民族的少女在台上翩翩起舞,各種服飾七彩繽紛、美不勝收。姐妹們情不自禁地穿上民族服裝,小天使變成了少數民族姑娘……最後天使姐妹回到天上,對於天使究竟穿甚麼樣的衣服好,她們還是拿不定主意。 節目通過天使姐妹的奇特經歷,反復示範各式衣服鞋襪的正確讀音及談論衣著時的常用句式,並以盛大的民族服裝表演活動,帶出有關服裝的文化知識。內容豐富而饒有趣味。', + 'description': 'md5:10bb3d659421e74f58e5db5691627b0f', 'upload_date': '20070109', 'duration': 907, 'subtitles': {}, }, + 'params': { + 'geo_verification_proxy': '<HK proxy here>', + }, + 'skip': 'Geo restricted to HK', }] _CC_LANGS = { @@ -69,117 +65,127 @@ class HKETVIE(InfoExtractor): '\u0e44\u0e17\u0e22': 'th', '\u0627\u0631\u062f\u0648': 'ur', } + _FORMAT_HEIGHTS = { + 'SD': 360, + 'HD': 720, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('ed_title', webpage, fatal=True) - file_id = self._html_search_regex(r'post_var\["file_id"\]\s*=\s*(.+?);', webpage, 'file ID') - curr_url = self._html_search_regex(r'post_var\["curr_url"\]\s*=\s*"(.+?)";', webpage, 'curr URL') + title = ( + self._html_search_meta( + ('ed_title', 'search.ed_title'), webpage, default=None) or + self._search_regex( + r'data-favorite_title_(?:eng|chi)=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'title', default=None, group='url') or + self._html_search_regex( + r'<h1>([^<]+)</h1>', webpage, 'title', default=None) or + self._og_search_title(webpage) + ) + + file_id = self._search_regex( + r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);', + webpage, 'file ID') + curr_url = self._search_regex( + r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";', + webpage, 'curr URL') data = { 'action': 'get_info', 'curr_url': curr_url, 'file_id': file_id, 'video_url': file_id, } - _APPS_BASE_URL = 'https://apps.hkedcity.net' - handler_url = _APPS_BASE_URL + '/media/play/handler.php' response = self._download_json( - handler_url, video_id, + self._APPS_BASE_URL + '/media/play/handler.php', video_id, data=urlencode_postdata(data), - headers=merge_dicts({'Content-Type': 'application/x-www-form-urlencoded'}, - self.geo_verification_headers())) + headers=merge_dicts({ + 'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) result = response['result'] + if not response.get('success') or not response.get('access'): + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted( + msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error, expected=True) + formats = [] + + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = result['playlist'][0] + for fmt in playlist0['sources']: + file_url = urljoin(self._APPS_BASE_URL, fmt.get('file')) + if not file_url: + continue + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + label = fmt.get('label') + h = self._FORMAT_HEIGHTS.get(label) + w = h * width // height if h and width and height else None + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + self._sort_formats(formats) + subtitles = {} - - if response.get('success') and response.get('access'): - width = int_or_none(result.get('width')) - height = int_or_none(result.get('height')) - - playlist0 = try_get(result, lambda x: x['playlist'][0], dict) - fmts = playlist0.get('sources') - for fmt in fmts: - file_path = fmt.get('file') - if file_path: - file_url = urljoin(_APPS_BASE_URL, file_path) - # If we ever wanted to provide the final resolved URL that - # does not require cookies, albeit with a shorter lifespan: - # urlh = self._downloader.urlopen(file_url) - # resolved_url = urlh.geturl() - - label = fmt.get('label') - w = None - h = None - if label == 'HD': - h = 720 - elif label == 'SD': - h = 360 - if h: - if width and height: - w = h * width // height - else: - w = h * 4 // 3 - - formats.append({ - 'format_id': label, - 'ext': fmt.get('type'), - 'url': file_url, - 'width': w, - 'height': h, - }) - - tracks = playlist0.get('tracks', []) - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = str_or_none(track.get('kind')) - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(_APPS_BASE_URL, track.get('file')) - if not track_url: - continue - track_label = track.get('label') - subtitles.setdefault(self._CC_LANGS.get(track_label, track_label), []).append({ + tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(self._APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get( + track_label, track_label), []).append({ 'url': self._proto_relative_url(track_url), 'ext': 'srt', }) - else: - error = clean_html(response.get('access_err_msg')) - if 'Video streaming is not available in your country' in error: - self.raise_geo_restricted(msg=error, countries=self._GEO_COUNTRIES) - else: - raise ExtractorError(error) - # Likes emotion = self._download_json( - 'https://emocounter.hkedcity.net/handler.php', - video_id, + 'https://emocounter.hkedcity.net/handler.php', video_id, data=urlencode_postdata({ 'action': 'get_emotion', 'data[bucket_id]': 'etv', 'data[identifier]': video_id, }), headers={'Content-Type': 'application/x-www-form-urlencoded'}, - fatal=False) - like_count = int_or_none(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count'])) + fatal=False) or {} + like_count = int_or_none(try_get( + emotion, lambda x: x['data']['emotion_data'][0]['count'])) return { 'id': video_id, 'title': title, - 'description': self._html_search_meta('description', webpage, fatal=False), - 'upload_date': unified_strdate(self._html_search_meta('ed_date', webpage, fatal=False), day_first=False), + 'description': self._html_search_meta( + 'description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta( + 'ed_date', webpage, fatal=False), day_first=False), 'duration': int_or_none(result.get('length')), 'formats': formats, 'subtitles': subtitles, - 'thumbnail': urljoin(_APPS_BASE_URL, result.get('image')), - 'view_count': str_to_int(result.get('view_count')), + 'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')), + 'view_count': parse_count(result.get('view_count')), 'like_count': like_count, } From a1a460759815414c6194bc921ac77a5533b6e02e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jan 2019 18:21:31 +0700 Subject: [PATCH 172/226] [vimeo] Fix video password verification for videos protected by Referer HTTP header --- youtube_dl/extractor/vimeo.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fd37f919b..6215b3258 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -435,6 +435,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, } + # https://gettingthingsdone.com/workflowmap/ + # vimeo embed with check-password page protected by Referer header ] @staticmethod @@ -465,20 +467,22 @@ class VimeoIE(VimeoBaseInfoExtractor): urls = VimeoIE._extract_urls(url, webpage) return urls[0] if urls else None - def _verify_player_video_password(self, url, video_id): + def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) - pass_url = url + '/check-password' - password_request = sanitized_Request(pass_url, data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - return self._download_json( - password_request, video_id, - 'Verifying the password', 'Wrong password') + headers = merge_dicts(headers, { + 'Content-Type': 'application/x-www-form-urlencoded', + }) + checked = self._download_json( + url + '/check-password', video_id, + 'Verifying the password', data=data, headers=headers) + if checked is False: + raise ExtractorError('Wrong video password', expected=True) + return checked def _real_initialize(self): self._login() @@ -591,7 +595,7 @@ class VimeoIE(VimeoBaseInfoExtractor): cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id) + config = self._verify_player_video_password(redirect_url, video_id, headers) vod = config.get('video', {}).get('vod', {}) From 29cfcb43da8ac60e6c2eddad095a41c800d4d95a Mon Sep 17 00:00:00 2001 From: Alexandre Huot <alexandre.huot@usherbrooke.ca> Date: Sun, 20 Jan 2019 06:33:09 -0500 Subject: [PATCH 173/226] [radiocanada] Relax DRM check --- youtube_dl/extractor/radiocanada.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index b952e59b4..302f67d96 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -49,6 +49,16 @@ class RadioCanadaIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, } ] @@ -67,8 +77,10 @@ class RadioCanadaIE(InfoExtractor): el = find_xpath_attr(metadata, './/Meta', 'name', name) return el.text if el is not None else None + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/pull/18609). if get_meta('protectionType'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.') device_types = ['ipad'] if not smuggled_data: From 6945b9e78f38284eb4e440b7badea2fc60b66c2f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 20 Jan 2019 13:31:41 +0100 Subject: [PATCH 174/226] [extractor/common] improve jwplayer relative url handling(closes #18892) --- youtube_dl/extractor/common.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6e36e6778..95456b291 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2630,7 +2630,7 @@ class InfoExtractor(object): 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, @@ -2657,12 +2657,9 @@ class InfoExtractor(object): for source in jwplayer_sources_data: if not isinstance(source, dict): continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: + source_url = urljoin( + base_url, self._proto_relative_url(source.get('file'))) + if not source_url or source_url in urls: continue urls.append(source_url) source_type = source.get('type') or '' From fad4ceb53404227f471af2f3544c4c14a5df4acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jan 2019 20:21:24 +0700 Subject: [PATCH 175/226] [utils] Fix urljoin for paths with non-http(s) schemes --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9e28e008f..409482c3b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -507,6 +507,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ''), None) self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de') + self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de') def test_url_or_none(self): self.assertEqual(url_or_none(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d2d3c1a9f..d0cb65814 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1868,7 +1868,7 @@ def urljoin(base, path): path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None - if re.match(r'^(?:https?:)?//', path): + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode('utf-8') From 07f9febc4b86a9cd819329f3a7daafdbe9455f40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jan 2019 22:07:01 +0700 Subject: [PATCH 176/226] [tnaflix] Pass Referer in metadata request (closes #18925) --- youtube_dl/extractor/tnaflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 6798ef4c3..b3573c6e0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -96,7 +96,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands) + transform_source=fix_xml_ampersands, headers={'Referer': url}) formats = [] From 19d6991312405f5af108af28b3721966720fc72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Jan 2019 03:03:53 +0700 Subject: [PATCH 177/226] [videomore] Improve extraction and fix season extractor (closes #18908) --- youtube_dl/extractor/videomore.py | 96 ++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 9b56630de..e3eda3327 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -4,8 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, + orderedSet, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, xpath_element, xpath_text, ) @@ -13,7 +19,19 @@ from ..utils import ( class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' - _VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)' + _VALID_URL = r'''(?x) + videomore:(?P<sid>\d+)$| + https?://(?:player\.)?videomore\.ru/ + (?: + (?: + embed| + [^/]+/[^/]+ + )/| + [^/]*\?.*?\btrack_id= + ) + (?P<id>\d+) + (?:[/?#&]|\.(?:xml|json)|$) + ''' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '44455a346edc0d509ac5b5a5b531dc35', @@ -79,6 +97,9 @@ class VideomoreIE(InfoExtractor): }, { 'url': 'videomore:367617', 'only_matching': True, + }, { + 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', + 'only_matching': True, }] @staticmethod @@ -136,7 +157,7 @@ class VideomoreIE(InfoExtractor): class VideomoreVideoIE(InfoExtractor): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -176,6 +197,9 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', + 'only_matching': True, }] @classmethod @@ -196,13 +220,16 @@ class VideomoreVideoIE(InfoExtractor): r'track-id=["\'](\d+)', r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') video_url = 'videomore:%s' % video_id + else: + video_id = None - return self.url_result(video_url, VideomoreIE.ie_key()) + return self.url_result( + video_url, ie=VideomoreIE.ie_key(), video_id=video_id) class VideomoreSeasonIE(InfoExtractor): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ 'url': 'http://videomore.ru/molodezhka/sezon_promo', 'info_dict': { @@ -210,8 +237,16 @@ class VideomoreSeasonIE(InfoExtractor): 'title': 'Молодежка Промо', }, 'playlist_mincount': 12, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) + else super(VideomoreSeasonIE, cls).suitable(url)) + def _real_extract(self, url): display_id = self._match_id(url) @@ -219,9 +254,54 @@ class VideomoreSeasonIE(InfoExtractor): title = self._og_search_title(webpage) - entries = [ - self.url_result(item) for item in re.findall( - r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] + data = self._parse_json( + self._html_search_regex( + r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1', + webpage, 'data', default='{}', group='value'), + display_id, fatal=False) + + entries = [] + + if data: + episodes = data.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + if not isinstance(ep, dict): + continue + ep_id = int_or_none(ep.get('id')) + ep_url = url_or_none(ep.get('url')) + if ep_id: + e = { + 'url': 'videomore:%s' % ep_id, + 'id': compat_str(ep_id), + } + elif ep_url: + e = {'url': ep_url} + else: + continue + e.update({ + '_type': 'url', + 'ie_key': VideomoreIE.ie_key(), + 'title': str_or_none(ep.get('title')), + 'thumbnail': url_or_none(ep.get('image')), + 'duration': parse_duration(ep.get('duration')), + 'episode_number': int_or_none(ep.get('number')), + 'upload_date': unified_strdate(ep.get('date')), + }) + entries.append(e) + + if not entries: + entries = [ + self.url_result( + 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), + video_id=video_id) + for video_id in orderedSet(re.findall( + r':(?:id|key)=["\'](\d+)["\']', webpage))] + + if not entries: + entries = [ + self.url_result(item) for item in re.findall( + r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' + % display_id, webpage)] return self.playlist_result(entries, display_id, title) From 4b85f0f9db9329ef1774a66c3e2fd4da558a5201 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Jan 2019 14:38:29 +0100 Subject: [PATCH 178/226] [vrv] add support for authentication(closes #14307) --- youtube_dl/extractor/vrv.py | 95 ++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 483a3be3a..014513051 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -11,10 +11,12 @@ import time from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse_urlencode, compat_urllib_parse, ) from ..utils import ( + ExtractorError, float_or_none, int_or_none, ) @@ -24,29 +26,41 @@ class VRVBaseIE(InfoExtractor): _API_DOMAIN = None _API_PARAMS = {} _CMS_SIGNING = {} + _TOKEN = None + _TOKEN_SECRET = '' def _call_api(self, path, video_id, note, data=None): + # https://tools.ietf.org/html/rfc5849#section-3 base_url = self._API_DOMAIN + '/core/' + path - encoded_query = compat_urllib_parse_urlencode({ + query = { 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), 'oauth_signature_method': 'HMAC-SHA1', 'oauth_timestamp': int(time.time()), - 'oauth_version': '1.0', - }) + } + if self._TOKEN: + query['oauth_token'] = self._TOKEN + encoded_query = compat_urllib_parse_urlencode(query) headers = self.geo_verification_headers() if data: data = json.dumps(data).encode() headers['Content-Type'] = 'application/json' - method = 'POST' if data else 'GET' - base_string = '&'.join([method, compat_urllib_parse.quote(base_url, ''), compat_urllib_parse.quote(encoded_query, '')]) + base_string = '&'.join([ + 'POST' if data else 'GET', + compat_urllib_parse.quote(base_url, ''), + compat_urllib_parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( - (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') - return self._download_json( - '?'.join([base_url, encoded_query]), video_id, - note='Downloading %s JSON metadata' % note, headers=headers, data=data) + try: + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + raise def _call_cms(self, path, video_id, note): if not self._CMS_SIGNING: @@ -55,19 +69,22 @@ class VRVBaseIE(InfoExtractor): self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) - def _set_api_params(self, webpage, video_id): - if not self._API_PARAMS: - self._API_PARAMS = self._parse_json(self._search_regex( - r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>', - webpage, 'api config'), video_id)['cxApiParams'] - self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - def _get_cms_resource(self, resource_key, video_id): return self._call_api( 'cms_resource', video_id, 'resource path', data={ 'resource_key': resource_key, })['__links__']['cms_resource']['href'] + def _real_initialize(self): + webpage = self._download_webpage( + 'https://vrv.co/', None, headers=self.geo_verification_headers()) + self._API_PARAMS = self._parse_json(self._search_regex( + [ + r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)', + r'window\.__APP_CONFIG__\s*=\s*({.+})' + ], webpage, 'app config'), None)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + class VRVIE(VRVBaseIE): IE_NAME = 'vrv' @@ -86,6 +103,22 @@ class VRVIE(VRVBaseIE): 'skip_download': True, }, }] + _NETRC_MACHINE = 'vrv' + + def _real_initialize(self): + super(VRVIE, self)._real_initialize() + + email, password = self._get_login_info() + if email is None: + return + + token_credentials = self._call_api( + 'authenticate/by:credentials', None, 'Token Credentials', data={ + 'email': email, + 'password': password, + }) + self._TOKEN = token_credentials['oauth_token'] + self._TOKEN_SECRET = token_credentials['oauth_token_secret'] def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash'): @@ -116,28 +149,16 @@ class VRVIE(VRVBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, - headers=self.geo_verification_headers()) - media_resource = self._parse_json(self._search_regex( - [ - r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:</script>|;)', - r'window\.__INITIAL_STATE__\s*=\s*({.+})' - ], webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} - video_data = media_resource.get('json') - if not video_data: - self._set_api_params(webpage, video_id) - episode_path = self._get_cms_resource( - 'cms:/episodes/' + video_id, video_id) - video_data = self._call_cms(episode_path, video_id, 'video') + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) + video_data = self._call_cms(episode_path, video_id, 'video') title = video_data['title'] - streams_json = media_resource.get('streams', {}).get('json', {}) - if not streams_json: - self._set_api_params(webpage, video_id) - streams_path = video_data['__links__']['streams']['href'] - streams_json = self._call_cms(streams_path, video_id, 'streams') + streams_path = video_data['__links__'].get('streams', {}).get('href') + if not streams_path: + self.raise_login_required() + streams_json = self._call_cms(streams_path, video_id, 'streams') audio_locale = streams_json.get('audio_locale') formats = [] @@ -202,11 +223,7 @@ class VRVSeriesIE(VRVBaseIE): def _real_extract(self, url): series_id = self._match_id(url) - webpage = self._download_webpage( - url, series_id, - headers=self.geo_verification_headers()) - self._set_api_params(webpage, series_id) seasons_path = self._get_cms_resource( 'cms:/seasons?series_id=' + series_id, series_id) seasons_data = self._call_cms(seasons_path, series_id, 'seasons') From 503b604a316837b9dd6ef32045e4e9bbfb6a1363 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Jan 2019 18:21:37 +0100 Subject: [PATCH 179/226] [vrv] fix oauth signing for python 2(#14307) --- youtube_dl/extractor/vrv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 014513051..6c060ae76 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -32,14 +32,14 @@ class VRVBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, data=None): # https://tools.ietf.org/html/rfc5849#section-3 base_url = self._API_DOMAIN + '/core/' + path - query = { - 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], - 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'oauth_signature_method': 'HMAC-SHA1', - 'oauth_timestamp': int(time.time()), - } + query = [ + ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), + ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_signature_method', 'HMAC-SHA1'), + ('oauth_timestamp', int(time.time())), + ] if self._TOKEN: - query['oauth_token'] = self._TOKEN + query.append(('oauth_token', self._TOKEN)) encoded_query = compat_urllib_parse_urlencode(query) headers = self.geo_verification_headers() if data: From 278d061a0c5eae20963c0a6df4b9b13fd1537186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Jan 2019 03:51:29 +0700 Subject: [PATCH 180/226] [pornhub] Bypass scrape detection (closes #5930) --- youtube_dl/extractor/pornhub.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e377de196..f5f3e6593 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -10,7 +10,9 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, + compat_urllib_request, ) +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, int_or_none, @@ -126,6 +128,26 @@ class PornHubIE(InfoExtractor): 'only_matching': True, }] + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r'<body\b[^>]+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + @staticmethod def _extract_urls(webpage): return re.findall( From 6510a3aa971c00525969040ad654249c0c73f125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Jan 2019 03:55:41 +0700 Subject: [PATCH 181/226] [crunchyroll] Extend _VALID_URL (closes #18955) --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4a68d092b..5e2cbe41d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -144,7 +144,7 @@ class CrunchyrollBaseIE(InfoExtractor): class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -269,6 +269,9 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): }, { 'url': 'http://www.crunchyroll.com/media-723735', 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', + 'only_matching': True, }] _FORMAT_IDS = { From 71a1f61700789fb0d61fc6ad9681b6f0899d2f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Jan 2019 04:12:06 +0700 Subject: [PATCH 182/226] [pornhub] Apply scrape detection bypass for all extractors --- youtube_dl/extractor/pornhub.py | 46 +++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index f5f3e6593..be93d5d48 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -24,7 +24,29 @@ from ..utils import ( ) -class PornHubIE(InfoExtractor): +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r'<body\b[^>]+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' _VALID_URL = r'''(?x) https?:// @@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor): 'only_matching': True, }] - def _download_webpage_handle(self, *args, **kwargs): - def dl(*args, **kwargs): - return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs) - - webpage, urlh = dl(*args, **kwargs) - - if any(re.search(p, webpage) for p in ( - r'<body\b[^>]+\bonload=["\']go\(\)', - r'document\.cookie\s*=\s*["\']RNKEY=', - r'document\.location\.reload\(true\)')): - url_or_request = args[0] - url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) - else url_or_request) - phantom = PhantomJSwrapper(self, required_version='2.0') - phantom.get(url, html=webpage) - webpage, urlh = dl(*args, **kwargs) - - return webpage, urlh - @staticmethod def _extract_urls(webpage): return re.findall( @@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor): } -class PornHubPlaylistBaseIE(InfoExtractor): +class PornHubPlaylistBaseIE(PornHubBaseIE): def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see From 0670bdd8f2bca39fdeadc63e1cd53b5d5b3e638c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Jan 2019 04:43:55 +0700 Subject: [PATCH 183/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index 902301765..687796a0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version <unreleased> + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338 #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net (#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + version 2019.01.17 Extractors From 435e382423f860aca82a58d7c3db58cbfa242b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Jan 2019 04:46:55 +0700 Subject: [PATCH 184/226] release 2019.01.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 841bca914..db3ebaeed 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.17 +[debug] youtube-dl version 2019.01.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 687796a0e..55ad44315 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.23 Core * [utils] Fix urljoin for paths with non-http(s) schemes diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c01409419..d759d0273 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -361,6 +361,7 @@ - **hitbox** - **hitbox:live** - **HitRecord** + - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - **HornBunny** - **HotNewHipHop** - **hotstar** @@ -386,6 +387,7 @@ - **IndavideoEmbed** - **InfoQ** - **Instagram** + - **instagram:tag**: Instagram hashtag search - **instagram:user**: Instagram user profile - **Internazionale** - **InternetVideoArchive** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ea3f62928..d77949eed 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.17' +__version__ = '2019.01.23' From e118a8794ffe5a3a414afd489726f34d753b0b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Jan 2019 01:34:41 +0700 Subject: [PATCH 185/226] [YoutubeDL] Fix typo in string negation implementation and add more tests (closes #18961) --- test/test_YoutubeDL.py | 30 +++++++++++++++++++++++++++--- youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index df8994b84..1d7452744 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -242,6 +242,7 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_string_ops(self): formats = [ {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, + {'format_id': 'zxc-cxz', 'ext': 'webm', 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -253,6 +254,11 @@ class TestFormatSelection(unittest.TestCase): # does not equal (!=) ydl = YDL({'format': '[format_id!=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!=abc-cba][format_id!=zxc-cxz]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) # starts with (^=) @@ -262,7 +268,12 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'abc-cba') # does not start with (!^=) - ydl = YDL({'format': '[format_id!^=abc-cba]'}) + ydl = YDL({'format': '[format_id!^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!^=abc][format_id!^=zxc]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) # ends with ($=) @@ -272,16 +283,29 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'abc-cba') # does not end with (!$=) - ydl = YDL({'format': '[format_id!$=abc-cba]'}) + ydl = YDL({'format': '[format_id!$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!$=cba][format_id!$=cxz]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) # contains (*=) - ydl = YDL({'format': '[format_id*=-]'}) + ydl = YDL({'format': '[format_id*=bc-cb]'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'abc-cba') # does not contain (!*=) + ydl = YDL({'format': '[format_id!*=bc-cb]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!*=abc][format_id!*=zxc]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + ydl = YDL({'format': '[format_id!*=-]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a827414dc..80ed8d7e5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1078,7 +1078,7 @@ class YoutubeDL(object): comparison_value = m.group('value') str_op = STR_OPERATORS[m.group('op')] if m.group('negation'): - op = lambda attr, value: not str_op + op = lambda attr, value: not str_op(attr, value) else: op = str_op From 7d311586eda9eae9430da3f6d18932d79127daa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Jan 2019 01:44:09 +0700 Subject: [PATCH 186/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 55ad44315..a28c6e951 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Core +* [YoutubeDL] Fix negation for string operators in format selection (#18961) + + version 2019.01.23 Core From a1e171233d86a064865353cc820969c10cb1f251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Jan 2019 01:46:23 +0700 Subject: [PATCH 187/226] release 2019.01.24 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index db3ebaeed..63aefe013 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.24*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.23 +[debug] youtube-dl version 2019.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a28c6e951..1fda747bb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.24 Core * [YoutubeDL] Fix negation for string operators in format selection (#18961) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d77949eed..18c1f8d4c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.23' +__version__ = '2019.01.24' From 9713d1d1e0a7eff5c1b9873a2f4f054111a568ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Jan 2019 02:30:12 +0700 Subject: [PATCH 188/226] [openload] Add support for oload.club (closes #18969) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index cf51e4770..b713e78b8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -249,7 +249,7 @@ class OpenloadIE(InfoExtractor): (?:www\.)? (?: openload\.(?:co|io|link)| - oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun) + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club) ) )/ (?:f|embed)/ @@ -334,6 +334,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.fun/f/gb6G1H4sHXY', 'only_matching': True, + }, { + 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 118afcf52ff23726e5f0c436083710f5c63230fa Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 23 Jan 2019 22:16:21 +0100 Subject: [PATCH 189/226] [go] fix adobe pass requests for Disney Now(closes #18901) --- youtube_dl/extractor/go.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index ec9dd6e3a..206d89e82 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -25,15 +25,15 @@ class GoIE(AdobePassIE): }, 'watchdisneychannel': { 'brand': '004', - 'requestor_id': 'Disney', + 'resource_id': 'Disney', }, 'watchdisneyjunior': { 'brand': '008', - 'requestor_id': 'DisneyJunior', + 'resource_id': 'DisneyJunior', }, 'watchdisneyxd': { 'brand': '009', - 'requestor_id': 'DisneyXD', + 'resource_id': 'DisneyXD', } } _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\ @@ -130,8 +130,8 @@ class GoIE(AdobePassIE): 'device': '001', } if video_data.get('accesslevel') == '1': - requestor_id = site_info['requestor_id'] - resource = self._get_mvpd_resource( + requestor_id = site_info.get('requestor_id', 'DisneyChannels') + resource = site_info.get('resource_id') or self._get_mvpd_resource( requestor_id, title, video_id, None) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) From eb35b163adf61f8ff0ee6c504e98bc94db16e705 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 24 Jan 2019 20:23:04 +0100 Subject: [PATCH 190/226] [postprocessor/ffmpeg] fallback to ffmpeg/avconv for audio codec detection(closes #681) --- youtube_dl/postprocessor/ffmpeg.py | 53 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 39a905380..b952b0970 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -9,9 +9,6 @@ import re from .common import AudioConversionError, PostProcessor -from ..compat import ( - compat_subprocess_get_DEVNULL, -) from ..utils import ( encodeArgument, encodeFilename, @@ -165,27 +162,45 @@ class FFmpegPostProcessor(PostProcessor): return self._paths[self.probe_basename] def get_audio_codec(self, path): - if not self.probe_available: - raise PostProcessingError('ffprobe or avprobe not found. Please install one.') + if not self.probe_available and not self.available: + raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. Please install one.') try: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams'), - encodeFilename(self._ffmpeg_filename_argument(path), True)] + if self.probe_available: + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-show_streams')] + else: + cmd = [ + encodeFilename(self.executable, True), + encodeArgument('-i')] + cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) - output = handle.communicate()[0] - if handle.wait() != 0: + self._downloader.to_screen( + '[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen( + cmd, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout_data, stderr_data = handle.communicate() + expected_ret = 0 if self.probe_available else 1 + if handle.wait() != expected_ret: return None except (IOError, OSError): return None - audio_codec = None - for line in output.decode('ascii', 'ignore').split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec + output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') + if self.probe_available: + audio_codec = None + for line in output.split('\n'): + if line.startswith('codec_name='): + audio_codec = line.split('=')[1].strip() + elif line.strip() == 'codec_type=audio' and audio_codec is not None: + return audio_codec + else: + # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME + mobj = re.search( + r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)', + output) + if mobj: + return mobj.group(1) return None def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): From 0eba178fce80923515f4a9ac411e46648a19d78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Jan 2019 04:04:58 +0700 Subject: [PATCH 191/226] [nhk] Extend _VALID_URL (closes #18968) --- youtube_dl/extractor/nhk.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 5c8cd76dc..d4acbcc3e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -5,8 +5,8 @@ from ..utils import ExtractorError class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>[^/]+/[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/(?:vod|ondemand)/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ # Videos available only for a limited period of time. Visit # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', @@ -19,7 +19,10 @@ class NhkVodIE(InfoExtractor): 'episode': 'The Kimono as Global Fashion', }, 'skip': 'Videos available only for a limited period of time', - } + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', + 'only_matching': True, + }] _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' def _real_extract(self, url): From 1602a240a7742f9e0a02b3f0effd215d00d859f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Jan 2019 04:16:49 +0700 Subject: [PATCH 192/226] [drtv] Fix extraction (closes #18989) --- youtube_dl/extractor/drtv.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f757745ba..8d63ca433 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -77,10 +77,9 @@ class DRTVIE(InfoExtractor): r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), webpage, 'video id') - programcard = self._download_json( - 'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, - video_id, 'Downloading video JSON') - data = programcard['Data'][0] + data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id, + video_id, 'Downloading video JSON', query={'expanded': 'true'}) title = remove_end(self._og_search_title( webpage, default=None), ' | TV | DR') or data['Title'] @@ -97,7 +96,7 @@ class DRTVIE(InfoExtractor): formats = [] subtitles = {} - for asset in data['Assets']: + for asset in [data['PrimaryAsset']]: kind = asset.get('Kind') if kind == 'Image': thumbnail = asset.get('Uri') From ae18d58297c16a300a89693360efd19be4a97e92 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 25 Jan 2019 11:01:27 +0100 Subject: [PATCH 193/226] [usatoday] fix extraction for videos with custom brightcove partner id(closes #18990) --- youtube_dl/extractor/usatoday.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py index e5678dc78..b2103448d 100644 --- a/youtube_dl/extractor/usatoday.py +++ b/youtube_dl/extractor/usatoday.py @@ -3,21 +3,23 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, get_element_by_attribute, parse_duration, + try_get, update_url_query, - ExtractorError, ) from ..compat import compat_str class USATodayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)' - _TEST = { + _TESTS = [{ + # Brightcove Partner ID = 29906170001 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/', - 'md5': '4d40974481fa3475f8bccfd20c5361f8', + 'md5': '033587d2529dc3411a1ab3644c3b8827', 'info_dict': { - 'id': '81729424', + 'id': '4799374959001', 'ext': 'mp4', 'title': 'US, France warn Syrian regime ahead of new peace talks', 'timestamp': 1457891045, @@ -25,8 +27,20 @@ class USATodayIE(InfoExtractor): 'uploader_id': '29906170001', 'upload_date': '20160313', } - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s' + }, { + # ui-video-data[asset_metadata][items][brightcoveaccount] = 28911775001 + 'url': 'https://www.usatoday.com/story/tech/science/2018/08/21/yellowstone-supervolcano-eruption-stop-worrying-its-blow/973633002/', + 'info_dict': { + 'id': '5824495846001', + 'ext': 'mp4', + 'title': 'Yellowstone more likely to crack rather than explode', + 'timestamp': 1534790612, + 'description': 'md5:3715e7927639a4f16b474e9391687c62', + 'uploader_id': '28911775001', + 'upload_date': '20180820', + } + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) @@ -35,10 +49,11 @@ class USATodayIE(InfoExtractor): if not ui_video_data: raise ExtractorError('no video on the webpage', expected=True) video_data = self._parse_json(ui_video_data, display_id) + item = try_get(video_data, lambda x: x['asset_metadata']['items'], dict) or {} return { '_type': 'url_transparent', - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'], + 'url': self.BRIGHTCOVE_URL_TEMPLATE % (item.get('brightcoveaccount', '29906170001'), item.get('brightcoveid') or video_data['brightcove_id']), 'id': compat_str(video_data['id']), 'title': video_data['title'], 'thumbnail': video_data.get('thumbnail'), From 252abb1e8b881aa9d3942c436711ac33235b37cd Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sat, 26 Jan 2019 15:29:19 +0700 Subject: [PATCH 194/226] [README.md] Mention more convenience extraction functions --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ba982907..c1572f771 100644 --- a/README.md +++ b/README.md @@ -1213,7 +1213,7 @@ Incorrect: 'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' ``` -### Use safe conversion functions +### Use convenience conversion and parsing functions Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. @@ -1221,6 +1221,8 @@ Use `url_or_none` for safe URL processing. Use `try_get` for safe metadata extraction from parsed JSON. +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. #### More examples From 845333acf6280761d19f91b3e018c418d922a0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 27 Jan 2019 04:14:54 +0700 Subject: [PATCH 195/226] [wakanim] Add extractor (closes #14374) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wakanim.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/wakanim.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 574a47e6d..2ffcffa9e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1373,6 +1373,7 @@ from .vuclip import VuClipIE from .vvvvid import VVVVIDIE from .vyborymos import VyboryMosIE from .vzaar import VzaarIE +from .wakanim import WakanimIE from .walla import WallaIE from .washingtonpost import ( WashingtonPostIE, diff --git a/youtube_dl/extractor/wakanim.py b/youtube_dl/extractor/wakanim.py new file mode 100644 index 000000000..1d588bdd6 --- /dev/null +++ b/youtube_dl/extractor/wakanim.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + merge_dicts, + urljoin, +) + + +class WakanimIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', + 'info_dict': { + 'id': '2997', + 'ext': 'mp4', + 'title': 'Episode 02', + 'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d', + 'series': 'The Asterisk War (OmU.)', + 'season_number': 1, + 'episode': 'Episode 02', + 'episode_number': 2, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', + group='url')) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + (r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, group='title') + + return merge_dicts(info, { + 'id': video_id, + 'title': title, + 'formats': formats, + }) From 458fd30f56785d514862dcb8a604a329d8e29ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 27 Jan 2019 04:36:58 +0700 Subject: [PATCH 196/226] [extractor/common] Extract season in _json_ld --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 95456b291..c4ea2882f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1249,7 +1249,10 @@ class InfoExtractor(object): info['title'] = episode_name part_of_season = e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + info.update({ + 'season': unescapeHTML(part_of_season.get('name')), + 'season_number': int_or_none(part_of_season.get('seasonNumber')), + }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) From 30cd1a5f3920d7485225a5d57b6ce41be4cde672 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Jan 2019 22:52:54 +0100 Subject: [PATCH 197/226] [wakanim] detect DRM protected videos --- youtube_dl/extractor/wakanim.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wakanim.py b/youtube_dl/extractor/wakanim.py index 1d588bdd6..f9a2395d9 100644 --- a/youtube_dl/extractor/wakanim.py +++ b/youtube_dl/extractor/wakanim.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, merge_dicts, urljoin, ) @@ -10,7 +11,7 @@ from ..utils import ( class WakanimIE(InfoExtractor): _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', 'info_dict': { 'id': '2997', @@ -26,7 +27,11 @@ class WakanimIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, - } + }, { + # DRM Protected + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -36,6 +41,12 @@ class WakanimIE(InfoExtractor): m3u8_url = urljoin(url, self._search_regex( r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', group='url')) + # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls + encryption = self._search_regex( + r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', + m3u8_url, 'encryption', default=None) + if encryption and encryption in ('cenc', 'cbcs-aapl'): + raise ExtractorError('This video is DRM protected.', expected=True) formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', From 1fcc91663bc84a599cc613ff1fa0e4bc15f42a9e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 27 Jan 2019 10:53:38 +0100 Subject: [PATCH 198/226] [vice] fix extraction for locked videos(closes #16248) --- youtube_dl/extractor/vice.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 538258617..8fdfd743d 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -94,7 +94,6 @@ class ViceIE(AdobePassIE): 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', 'only_matching': True, }] - _PREPLAY_HOST = 'vms.vice' @staticmethod def _extract_urls(webpage): @@ -158,9 +157,8 @@ class ViceIE(AdobePassIE): }) try: - host = 'www.viceland' if is_locked else self._PREPLAY_HOST preplay = self._download_json( - 'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id), + 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): From bf8ebc9cfe1ae2b62baf2116f84748db03c4df7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 27 Jan 2019 21:25:43 +0700 Subject: [PATCH 199/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1fda747bb..9d5c25273 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version <unreleased> + +Core ++ [extractor/common] Extract season in _json_ld +* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection + (#681) + +Extractors +* [vice] Fix extraction for locked videos (#16248) ++ [wakanim] Detect DRM protected videos ++ [wakanim] Add support for wakanim.tv (#14374) +* [usatoday] Fix extraction for videos with custom brightcove partner id + (#18990) +* [drtv] Fix extraction (#18989) +* [nhk] Extend URL regular expression (#18968) +* [go] Fix Adobe Pass requests for Disney Now (#18901) ++ [openload] Add support for oload.club (#18969) + + version 2019.01.24 Core From e71be6ee9f239308765443d49d91358fa306e48a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 27 Jan 2019 21:28:09 +0700 Subject: [PATCH 200/226] release 2019.01.27 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 4 +++- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 63aefe013..f529e3f4b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.24*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.27*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.27** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.24 +[debug] youtube-dl version 2019.01.27 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a71b045d0..6c1739860 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -339,7 +339,7 @@ Incorrect: 'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' ``` -### Use safe conversion functions +### Use convenience conversion and parsing functions Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. @@ -347,6 +347,8 @@ Use `url_or_none` for safe URL processing. Use `try_get` for safe metadata extraction from parsed JSON. +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. #### More examples diff --git a/ChangeLog b/ChangeLog index 9d5c25273..d94fe36ec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.27 Core + [extractor/common] Extract season in _json_ld diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d759d0273..6377bf815 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1069,6 +1069,7 @@ - **VVVVID** - **VyboryMos** - **Vzaar** + - **Wakanim** - **Walla** - **WalyTV** - **washingtonpost** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 18c1f8d4c..ec89cfc64 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.24' +__version__ = '2019.01.27' From 2b3afe6b0fee226bb1e61027d815df088cb40757 Mon Sep 17 00:00:00 2001 From: Andrew Udvare <audvare@gmail.com> Date: Sun, 27 Jan 2019 22:24:37 -0500 Subject: [PATCH 201/226] [postprocessor/ffmpeg] Disable "Last message repeated" messages which cause non-zero exit status --- youtube_dl/postprocessor/ffmpeg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index b952b0970..8ef03f43b 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -218,6 +218,7 @@ class FFmpegPostProcessor(PostProcessor): encodeFilename(self._ffmpeg_filename_argument(path), True) ]) cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + + ['-loglevel', 'repeat+info'] + files_cmd + [encodeArgument(o) for o in opts] + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) From 7f903dd8bfbc9c2de129d5b0be23ef62d8ca3df3 Mon Sep 17 00:00:00 2001 From: Tatsh <Tatsh@users.noreply.github.com> Date: Mon, 28 Jan 2019 10:57:14 -0500 Subject: [PATCH 202/226] [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding subtitles (closes #19042) Related issue: https://trac.ffmpeg.org/ticket/6016 --- youtube_dl/postprocessor/ffmpeg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index b952b0970..fff2021ff 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -407,6 +407,9 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', + # Don't copy Apple TV chapters track, bin_data (see #19042, #19024, + # https://trac.ffmpeg.org/ticket/6016) + '-map', '-0:d', ] if information['ext'] == 'mp4': opts += ['-c:s', 'mov_text'] From 61ff92e11ea876532697451b1ed727f42274b109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 29 Jan 2019 01:59:56 +0700 Subject: [PATCH 203/226] [postprocessor/ffmpeg] Wrap loglevel args in encodeArgument --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 33dbcad9f..88b9ae9be 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -218,7 +218,7 @@ class FFmpegPostProcessor(PostProcessor): encodeFilename(self._ffmpeg_filename_argument(path), True) ]) cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + - ['-loglevel', 'repeat+info'] + + [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + files_cmd + [encodeArgument(o) for o in opts] + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) From a81daba2311cb4d6c5bc7e62b47438a78aa5c10f Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Mon, 28 Jan 2019 20:20:46 +0100 Subject: [PATCH 204/226] [zattoo] Add support for tv.salt.ch --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zattoo.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2ffcffa9e..9d776ff45 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1497,6 +1497,7 @@ from .zattoo import ( QuantumTVIE, QuicklineIE, QuicklineLiveIE, + SaltTVIE, SAKTVIE, VTXTVIE, WalyTVIE, diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 896276301..ee514666b 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -420,3 +420,14 @@ class EinsUndEinsTVIE(ZattooIE): 'url': 'https://www.1und1.tv/watch/abc/123-abc', 'only_matching': True, }] + + +class SaltTVIE(ZattooIE): + _NETRC_MACHINE = 'salttv' + _HOST = 'tv.salt.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'only_matching': True, + }] From 41c2c254d3c30afde395e8abbe0ced2c53485a78 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Jan 2019 22:39:08 +0100 Subject: [PATCH 205/226] [fox] fix extraction for free videos(#19060) --- youtube_dl/extractor/fox.py | 46 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index b1c91f095..2d6c97ec9 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -# import json -# import uuid +import json +import uuid from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, @@ -47,38 +48,31 @@ class FOXIE(AdobePassIE): 'url': 'https://www.nationalgeographic.com/tv/watch/f690e05ebbe23ab79747becd0cc223d1/', 'only_matching': True, }] - # _access_token = None + _access_token = None - # def _call_api(self, path, video_id, data=None): - # headers = { - # 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd', - # } - # if self._access_token: - # headers['Authorization'] = 'Bearer ' + self._access_token - # return self._download_json( - # 'https://api2.fox.com/v2.0/' + path, video_id, data=data, headers=headers) + def _call_api(self, path, video_id, data=None): + headers = { + 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd', + } + if self._access_token: + headers['Authorization'] = 'Bearer ' + self._access_token + return self._download_json( + 'https://api2.fox.com/v2.0/' + path, + video_id, data=data, headers=headers) - # def _real_initialize(self): - # self._access_token = self._call_api( - # 'login', None, json.dumps({ - # 'deviceId': compat_str(uuid.uuid4()), - # }).encode())['accessToken'] + def _real_initialize(self): + self._access_token = self._call_api( + 'login', None, json.dumps({ + 'deviceId': compat_str(uuid.uuid4()), + }).encode())['accessToken'] def _real_extract(self, url): video_id = self._match_id(url) - video = self._download_json( - 'https://api.fox.com/fbc-content/v1_5/video/%s' % video_id, - video_id, headers={ - 'apikey': 'abdcbed02c124d393b39e818a4312055', - 'Content-Type': 'application/json', - 'Referer': url, - }) - # video = self._call_api('vodplayer/' + video_id, video_id) + video = self._call_api('vodplayer/' + video_id, video_id) title = video['name'] - release_url = video['videoRelease']['url'] - # release_url = video['url'] + release_url = video['url'] data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} From 6df196f32e68ec22bd854c4d779b9d94e04e63b2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 29 Jan 2019 00:31:49 +0100 Subject: [PATCH 206/226] [fox] add support for locked videos using cookies(closes #19060) --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/fox.py | 50 +++++++++++----------- youtube_dl/extractor/nationalgeographic.py | 22 ++++++++++ 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9d776ff45..b40be42e6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -692,7 +692,10 @@ from .myvi import ( MyviEmbedIE, ) from .myvidster import MyVidsterIE -from .nationalgeographic import NationalGeographicVideoIE +from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) from .naver import NaverIE from .nba import NBAIE from .nbc import ( diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 2d6c97ec9..568656542 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -5,19 +5,23 @@ import json import uuid from .adobepass import AdobePassIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_duration, try_get, unified_timestamp, - update_url_query, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fox\.com|nationalgeographic\.com/tv)/watch/(?P<id>[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ -32,6 +36,7 @@ class FOXIE(AdobePassIE): 'upload_date': '20170901', 'creator': 'FOX', 'series': 'Gotham', + 'age_limit': 14, }, 'params': { 'skip_download': True, @@ -44,15 +49,14 @@ class FOXIE(AdobePassIE): # episode, geo-restricted, tv provided required 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', 'only_matching': True, - }, { - 'url': 'https://www.nationalgeographic.com/tv/watch/f690e05ebbe23ab79747becd0cc223d1/', - 'only_matching': True, }] + _HOME_PAGE_URL = 'https://www.fox.com/' + _API_KEY = 'abdcbed02c124d393b39e818a4312055' _access_token = None def _call_api(self, path, video_id, data=None): headers = { - 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd', + 'X-Api-Key': self._API_KEY, } if self._access_token: headers['Authorization'] = 'Bearer ' + self._access_token @@ -61,10 +65,16 @@ class FOXIE(AdobePassIE): video_id, data=data, headers=headers) def _real_initialize(self): - self._access_token = self._call_api( - 'login', None, json.dumps({ - 'deviceId': compat_str(uuid.uuid4()), - }).encode())['accessToken'] + if not self._access_token: + mvpd_auth = self._get_cookies(self._HOME_PAGE_URL).get('mvpd-auth') + if mvpd_auth: + self._access_token = (self._parse_json(compat_urllib_parse_unquote( + mvpd_auth.value), None, fatal=False) or {}).get('accessToken') + if not self._access_token: + self._access_token = self._call_api( + 'login', None, json.dumps({ + 'deviceId': compat_str(uuid.uuid4()), + }).encode())['accessToken'] def _real_extract(self, url): video_id = self._match_id(url) @@ -73,25 +83,15 @@ class FOXIE(AdobePassIE): title = video['name'] release_url = video['url'] - - data = try_get( - video, lambda x: x['trackingData']['properties'], dict) or {} - - rating = video.get('contentRating') - if data.get('authRequired'): - resource = self._get_mvpd_resource( - 'fbc-fox', title, video.get('guid'), rating) - release_url = update_url_query( - release_url, { - 'auth': self._extract_mvpd_auth( - url, video_id, 'fbc-fox', resource) - }) m3u8_url = self._download_json(release_url, video_id)['playURL'] formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( video.get('duration')) or parse_duration(video.get('duration')) timestamp = unified_timestamp(video.get('datePublished')) @@ -117,7 +117,7 @@ class FOXIE(AdobePassIE): 'description': video.get('description'), 'duration': duration, 'timestamp': timestamp, - 'age_limit': parse_age_limit(rating), + 'age_limit': parse_age_limit(video.get('contentRating')), 'creator': creator, 'series': series, 'season_number': int_or_none(video.get('seasonNumber')), diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 165964ca0..ee12e2b47 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .fox import FOXIE from ..utils import ( smuggle_url, url_basename, @@ -58,3 +59,24 @@ class NationalGeographicVideoIE(InfoExtractor): {'force_smil_url': True}), 'id': guid, } + + +class NationalGeographicTVIE(FOXIE): + _VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + 'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/', + 'info_dict': { + 'id': '6a875e6e734b479beda26438c9f21138', + 'ext': 'mp4', + 'title': 'Why Nat Geo? Valley of the Boom', + 'description': 'The lives of prominent figures in the tech world, including their friendships, rivalries, victories and failures.', + 'timestamp': 1542662458, + 'upload_date': '20181119', + 'age_limit': 14, + }, + 'params': { + 'skip_download': True, + }, + }] + _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' + _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' From a2d821d7112fb1423f99ddf309a843c80cc3be2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 29 Jan 2019 23:33:09 +0700 Subject: [PATCH 207/226] [drtv] Improve extraction (closes #19039) + Add support for EncryptedUri videos + Extract more metadata * Fix subtitles extraction --- youtube_dl/extractor/drtv.py | 133 +++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 8d63ca433..c5f211128 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,15 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals +import binascii +import hashlib +import re + + from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_urllib_parse_unquote from ..utils import ( + bytes_to_intlist, ExtractorError, int_or_none, + intlist_to_bytes, float_or_none, mimetype2ext, - parse_iso8601, - remove_end, + str_or_none, + unified_timestamp, update_url_query, + url_or_none, ) @@ -20,23 +30,31 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '7ae17b4e18eb5d29212f424a7511c184', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', 'title': 'Klassen - Dårlig taber (10)', 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', - 'timestamp': 1471991907, - 'upload_date': '20160823', + 'timestamp': 1539085800, + 'upload_date': '20181009', 'duration': 606.84, + 'series': 'Klassen', + 'season': 'Klassen I', + 'season_number': 1, + 'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b', + 'episode': 'Episode 10', + 'episode_number': 10, + 'release_year': 2016, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', 'info_dict': { - 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6', 'ext': 'mp4', - 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'title': 'christiania pusher street ryddes drdkrjpo', 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', @@ -45,17 +63,18 @@ class DRTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', 'info_dict': { 'id': 'historien-om-danmark-stenalder', 'ext': 'mp4', - 'title': 'Historien om Danmark: Stenalder (1)', + 'title': 'Historien om Danmark: Stenalder', 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', - 'timestamp': 1490401996, - 'upload_date': '20170325', - 'duration': 3502.04, + 'timestamp': 1546628400, + 'upload_date': '20190104', + 'duration': 3502.56, 'formats': 'mincount:20', }, 'params': { @@ -74,19 +93,26 @@ class DRTVIE(InfoExtractor): video_id = self._search_regex( (r'data-(?:material-identifier|episode-slug)="([^"]+)"', - r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), - webpage, 'video id') + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), + webpage, 'video id', default=None) + + if not video_id: + video_id = compat_urllib_parse_unquote(self._search_regex( + r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', + webpage, 'urn')) data = self._download_json( 'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id, video_id, 'Downloading video JSON', query={'expanded': 'true'}) - title = remove_end(self._og_search_title( - webpage, default=None), ' | TV | DR') or data['Title'] + title = str_or_none(data.get('Title')) or re.sub( + r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', + self._og_search_title(webpage)) description = self._og_search_description( webpage, default=None) or data.get('Description') - timestamp = parse_iso8601(data.get('CreatedTime')) + timestamp = unified_timestamp( + data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime')) thumbnail = None duration = None @@ -96,16 +122,51 @@ class DRTVIE(InfoExtractor): formats = [] subtitles = {} - for asset in [data['PrimaryAsset']]: + assets = [] + primary_asset = data.get('PrimaryAsset') + if isinstance(primary_asset, dict): + assets.append(primary_asset) + secondary_assets = data.get('SecondaryAssets') + if isinstance(secondary_assets, list): + for secondary_asset in secondary_assets: + if isinstance(secondary_asset, dict): + assets.append(secondary_asset) + + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) + + def decrypt_uri(e): + n = int(e[2:10], 16) + a = e[10 + n:] + data = bytes_to_intlist(hex_to_bytes(e[10:10 + n])) + key = bytes_to_intlist(hashlib.sha256( + ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()) + iv = bytes_to_intlist(hex_to_bytes(a)) + decrypted = aes_cbc_decrypt(data, key, iv) + return intlist_to_bytes( + decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0] + + for asset in assets: kind = asset.get('Kind') if kind == 'Image': - thumbnail = asset.get('Uri') + thumbnail = url_or_none(asset.get('Uri')) elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') + if not uri: + encrypted_uri = link.get('EncryptedUri') + if not encrypted_uri: + continue + try: + uri = decrypt_uri(encrypted_uri) + except Exception: + self.report_warning( + 'Unable to decrypt EncryptedUri', video_id) + continue + uri = url_or_none(uri) if not uri: continue target = link.get('Target') @@ -139,19 +200,22 @@ class DRTVIE(InfoExtractor): 'vcodec': 'none' if kind == 'AudioResource' else None, 'preference': preference, }) - subtitles_list = asset.get('SubtitlesList') - if isinstance(subtitles_list, list): - LANGS = { - 'Danish': 'da', - } - for subs in subtitles_list: - if not subs.get('Uri'): - continue - lang = subs.get('Language') or 'da' - subtitles.setdefault(LANGS.get(lang, lang), []).append({ - 'url': subs['Uri'], - 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' - }) + subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist') + if isinstance(subtitles_list, list): + LANGS = { + 'Danish': 'da', + } + for subs in subtitles_list: + if not isinstance(subs, dict): + continue + sub_uri = url_or_none(subs.get('Uri')) + if not sub_uri: + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': sub_uri, + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: self.raise_geo_restricted( @@ -169,6 +233,13 @@ class DRTVIE(InfoExtractor): 'duration': duration, 'formats': formats, 'subtitles': subtitles, + 'series': str_or_none(data.get('SeriesTitle')), + 'season': str_or_none(data.get('SeasonTitle')), + 'season_number': int_or_none(data.get('SeasonNumber')), + 'season_id': str_or_none(data.get('SeasonUrn')), + 'episode': str_or_none(data.get('EpisodeTitle')), + 'episode_number': int_or_none(data.get('EpisodeNumber')), + 'release_year': int_or_none(data.get('ProductionYear')), } From 41cff90c41006b30213c7f676bd3920a1612b717 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Mon, 28 Jan 2019 19:42:49 -0400 Subject: [PATCH 208/226] [yourporn] Fix extraction and extract duration (closes #18815, closes #18852) change cdn to cdn4 for the video_url --- youtube_dl/extractor/yourporn.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index c8dc29bd8..01e5f0c0e 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import urljoin +from ..utils import ( + parse_duration, + urljoin +) class YourPornIE(InfoExtractor): @@ -27,17 +30,21 @@ class YourPornIE(InfoExtractor): self._search_regex( r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info', group='data'), - video_id)[video_id]).replace('/cdn/', '/cdn3/') + video_id)[video_id]).replace('/cdn/', '/cdn4/') title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', default=None) or self._og_search_description(webpage)).strip() + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex(r'duration:[^0-9]*([0-9:]+)', + webpage, 'duration', default=None)) return { 'id': video_id, 'url': video_url, 'title': title, + 'duration': duration, 'thumbnail': thumbnail, 'age_limit': 18 } From 9868f1ab1853484d7a6c38cd6fa0d94a11914cae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 29 Jan 2019 23:56:42 +0700 Subject: [PATCH 209/226] [yourporn] Improve (closes #19061) --- youtube_dl/extractor/yourporn.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 01e5f0c0e..2c63f9752 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, - urljoin + urljoin, ) @@ -17,7 +17,11 @@ class YourPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'md5:c9f43630bd968267672651ba905a7d35', 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18 + 'duration': 165, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, }, } @@ -35,16 +39,16 @@ class YourPornIE(InfoExtractor): title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', default=None) or self._og_search_description(webpage)).strip() - thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration', + default=None)) - duration = parse_duration(self._search_regex(r'duration:[^0-9]*([0-9:]+)', - webpage, 'duration', default=None)) return { 'id': video_id, 'url': video_url, 'title': title, - 'duration': duration, 'thumbnail': thumbnail, - 'age_limit': 18 + 'duration': duration, + 'age_limit': 18, } From 5496754ae4c9097f37cfd9b307261cbbca438260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 00:03:19 +0700 Subject: [PATCH 210/226] [fox] Remove unused imports --- youtube_dl/extractor/fox.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 568656542..0ffceeb7c 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -6,12 +6,10 @@ import uuid from .adobepass import AdobePassIE from ..compat import ( - compat_HTTPError, compat_str, compat_urllib_parse_unquote, ) from ..utils import ( - ExtractorError, int_or_none, parse_age_limit, parse_duration, From ca01e5f9039dd6c0d5abff5c7139f82c5d1dfba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 00:05:32 +0700 Subject: [PATCH 211/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index d94fe36ec..8f5343b23 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core +* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding + subtitles (#19024, #19042) +* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) + +Extractors +* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061) +* [drtv] Improve extraction (#19039) + + Add support for EncryptedUri videos + + Extract more metadata + * Fix subtitles extraction ++ [fox] Add support for locked videos using cookies (#19060) +* [fox] Fix extraction for free videos (#19060) ++ [zattoo] Add support for tv.salt.ch (#19059) + + version 2019.01.27 Core From 1063b4c7073ce056f694b1690dd5d5a1a06fb347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 00:08:39 +0700 Subject: [PATCH 212/226] release 2019.01.30 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f529e3f4b..3944a4a38 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.27*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.30** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.27 +[debug] youtube-dl version 2019.01.30 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 8f5343b23..745fffeaa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.30 Core * [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6377bf815..2918520c3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -546,6 +546,7 @@ - **MyVisionTV** - **n-tv.de** - **natgeo:video** + - **NationalGeographicTV** - **Naver** - **NBA** - **NBC** @@ -776,6 +777,7 @@ - **safari:api** - **safari:course**: safaribooksonline.com online courses - **SAKTV** + - **SaltTV** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ec89cfc64..97818d0c7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.27' +__version__ = '2019.01.30' From ce52c7c111602f41d7f9c498f2915fd255ba2eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 06:15:23 +0700 Subject: [PATCH 213/226] [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (closes #19067) --- youtube_dl/postprocessor/ffmpeg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 88b9ae9be..5bcb00ac0 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -217,11 +217,13 @@ class FFmpegPostProcessor(PostProcessor): encodeArgument('-i'), encodeFilename(self._ffmpeg_filename_argument(path), True) ]) - cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + - [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + - files_cmd + - [encodeArgument(o) for o in opts] + - [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) + cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] + # avconv does not have repeat option + if self.basename == 'ffmpeg': + cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + cmd += (files_cmd + + [encodeArgument(o) for o in opts] + + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) From c2a0fe2ea7422c437a27c8fac57c7e865517354b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 06:17:25 +0700 Subject: [PATCH 214/226] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 745fffeaa..e6de6ca03 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Core +* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) + + version 2019.01.30 Core From 7b0f9df23d9842ddb2a545a0ceaf594daa0e12ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 30 Jan 2019 06:19:36 +0700 Subject: [PATCH 215/226] release 2019.01.30.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3944a4a38..423a08e4d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.30** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.30.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.30.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.30 +[debug] youtube-dl version 2019.01.30.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e6de6ca03..4872cd9fc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.01.30.1 Core * [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 97818d0c7..be3bbdd73 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.30' +__version__ = '2019.01.30.1' From 645c4885cf38ecb244412dffda2760f4c0e72033 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 30 Jan 2019 14:43:44 +0100 Subject: [PATCH 216/226] [crackle] authorize media detail request(closes #16931) --- youtube_dl/extractor/crackle.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f73ef6b63..49bf3a4f9 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals, division +import hashlib +import hmac import re +import time from .common import InfoExtractor from ..compat import compat_HTTPError @@ -74,13 +77,16 @@ class CrackleIE(InfoExtractor): for country in countries: try: + # Authorization generation algorithm is reverse engineered from: + # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js + media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country) + timestamp = time.strftime('%Y%m%d%H%M', time.gmtime()) + h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper() media = self._download_json( - 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' - % (video_id, country), video_id, - 'Downloading media JSON as %s' % country, - 'Unable to download media JSON', query={ - 'disableProtocols': 'true', - 'format': 'json' + media_detail_url, video_id, 'Downloading media JSON as %s' % country, + 'Unable to download media JSON', headers={ + 'Accept': 'application/json', + 'Authorization': '|'.join([h, timestamp, '117', '1']), }) except ExtractorError as e: # 401 means geo restriction, trying next country From 15e832ff2a1bee42b299d3498439cf789c16fffa Mon Sep 17 00:00:00 2001 From: Batuhan's Unmaintained Account <batuhanosmantaskaya@gmail.com> Date: Wed, 30 Jan 2019 19:39:02 +0300 Subject: [PATCH 217/226] [openload] Add support for oload.info --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b713e78b8..747aa298a 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -249,7 +249,7 @@ class OpenloadIE(InfoExtractor): (?:www\.)? (?: openload\.(?:co|io|link)| - oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club) + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info) ) )/ (?:f|embed)/ @@ -337,6 +337,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', 'only_matching': True, + }, { + 'url': 'https://oload.info/f/5NEAbI2BDSk', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 9613e14a92429046b162145dfd40dee5795ca409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 31 Jan 2019 00:15:45 +0700 Subject: [PATCH 218/226] [openload] Add support for openload.pw and oload.pw (closes #18930) --- youtube_dl/extractor/openload.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 747aa298a..a2ae25272 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -248,8 +248,8 @@ class OpenloadIE(InfoExtractor): (?P<host> (?:www\.)? (?: - openload\.(?:co|io|link)| - oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info) + openload\.(?:co|io|link|pw)| + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw) ) )/ (?:f|embed)/ @@ -340,6 +340,12 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.info/f/5NEAbI2BDSk', 'only_matching': True, + }, { + 'url': 'https://openload.pw/f/WyKgK8s94N0', + 'only_matching': True, + }, { + 'url': 'https://oload.pw/f/WyKgK8s94N0', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 49fe4175ae165527a0b06b8d97cdc85d83041fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 1 Feb 2019 01:48:10 +0700 Subject: [PATCH 219/226] [drtv] Improve preference (closes #19079) --- youtube_dl/extractor/drtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index c5f211128..0c7e350f0 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -171,10 +171,13 @@ class DRTVIE(InfoExtractor): continue target = link.get('Target') format_id = target or '' - preference = None - if asset_target in ('SpokenSubtitles', 'SignLanguage'): + if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): preference = -1 format_id += '-%s' % asset_target + elif asset_target == 'Default': + preference = 1 + else: + preference = None if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', From 3ef2da2d21061bd44df0b0a0d27e82a365209662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Feb 2019 04:00:29 +0700 Subject: [PATCH 220/226] [soundcloud] Fix paged playlists extraction, add support for albums and update client id --- youtube_dl/extractor/soundcloud.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 81c81c8d5..030840fd8 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -34,7 +34,7 @@ class SoundcloudIE(InfoExtractor): (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?!stations/track) (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -157,7 +157,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' + _CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK' @staticmethod def _extract_urls(webpage): @@ -368,7 +368,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): - _API_BASE = 'https://api.soundcloud.com' _API_V2_BASE = 'https://api-v2.soundcloud.com' def _extract_playlist(self, base_url, playlist_id, playlist_title): @@ -389,8 +388,12 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): next_href, playlist_id, 'Downloading track page %s' % (i + 1)) collection = response['collection'] - if not collection: - break + + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href def resolve_permalink_url(candidates): for cand in candidates: @@ -429,7 +432,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): (?:(?:www|m)\.)?soundcloud\.com/ (?P<user>[^/]+) (?:/ - (?P<rsrc>tracks|sets|reposts|likes|spotlight) + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) )? /?(?:[?#].*)?$ ''' @@ -476,13 +479,17 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): 'title': 'Grynpyret (Spotlight)', }, 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/albums', + 'only_matching': True, }] _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, + 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } @@ -490,6 +497,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _TITLE_MAP = { 'all': 'All', 'tracks': 'Tracks', + 'albums': 'Albums', 'sets': 'Playlists', 'reposts': 'Reposts', 'likes': 'Likes', From b6423e6ca215e1583e013cf7b2c1faf8d3dcace7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Feb 2019 04:11:32 +0700 Subject: [PATCH 221/226] [soundcloud:user] Update tests --- youtube_dl/extractor/soundcloud.py | 48 ++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 030840fd8..13463ae4f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -438,40 +438,47 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-akashic-chronicler', + 'url': 'https://soundcloud.com/soft-cell-official', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (All)', + 'id': '207965082', + 'title': 'Soft Cell (All)', }, - 'playlist_mincount': 74, + 'playlist_mincount': 28, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', + 'url': 'https://soundcloud.com/soft-cell-official/tracks', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Tracks)', + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', }, - 'playlist_mincount': 37, + 'playlist_mincount': 27, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'url': 'https://soundcloud.com/soft-cell-official/albums', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Playlists)', + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Playlists)', }, 'playlist_mincount': 2, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'url': 'https://soundcloud.com/jcv246/reposts', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Reposts)', + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', }, - 'playlist_mincount': 7, + 'playlist_mincount': 6, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'url': 'https://soundcloud.com/clalberg/likes', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Likes)', + 'id': '11817582', + 'title': 'clalberg (Likes)', }, - 'playlist_mincount': 321, + 'playlist_mincount': 5, }, { 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { @@ -479,9 +486,6 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): 'title': 'Grynpyret (Spotlight)', }, 'playlist_mincount': 1, - }, { - 'url': 'https://soundcloud.com/soft-cell-official/albums', - 'only_matching': True, }] _BASE_URL_MAP = { From e9fef7ee4e666b60bc7a757391f16e2be76f6cbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Feb 2019 05:44:31 +0700 Subject: [PATCH 222/226] [YoutubeDL] Fallback to ie_key of matching extractor while making download archive id when no explicit ie_key is provided (#19022) --- youtube_dl/YoutubeDL.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 80ed8d7e5..c168415ce 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2060,15 +2060,21 @@ class YoutubeDL(object): self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist - if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(info_dict['url']): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2076,7 +2082,7 @@ class YoutubeDL(object): return False vid_id = self._make_archive_id(info_dict) - if vid_id is None: + if not vid_id: return False # Incomplete video information try: From b9bc1cff721b6f63e733c6ababeec45b92f1484b Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Fri, 1 Feb 2019 19:04:00 -0400 Subject: [PATCH 223/226] [drtuber] Extract duration --- youtube_dl/extractor/drtuber.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 5c41c8022..2baea585b 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -4,7 +4,9 @@ import re from .common import InfoExtractor from ..utils import ( + int_or_none, NO_DEFAULT, + parse_duration, str_to_int, ) @@ -65,6 +67,9 @@ class DrTuberIE(InfoExtractor): }) self._sort_formats(formats) + duration = int_or_none(video_data.get('duration')) or parse_duration( + video_data.get('duration_format')) + title = self._html_search_regex( (r'<h1[^>]+class=["\']title[^>]+>([^<]+)', r'<title>([^<]+)\s*@\s+DrTuber', @@ -103,4 +108,5 @@ class DrTuberIE(InfoExtractor): 'comment_count': comment_count, 'categories': categories, 'age_limit': self._rta_search(webpage), + 'duration': duration, } From 6cc6e0c34d0f67747be7bac91690820f47b26acb Mon Sep 17 00:00:00 2001 From: Cory Hall <corydantehall@gmail.com> Date: Thu, 31 Jan 2019 20:51:37 -0500 Subject: [PATCH 224/226] [soundcloud:pagedplaylist] Add ie and title to entries (#19022) rel: https://github.com/rg3/youtube-dl/issues/19022 --- youtube_dl/extractor/soundcloud.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 13463ae4f..1c8d3c53b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -395,18 +395,20 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): # Empty collection may be returned, in this case we proceed # straight to next_href - def resolve_permalink_url(candidates): - for cand in candidates: + def append_url_result(entries, item): + for cand in (item, item.get('track'), item.get('playlist')): if isinstance(cand, dict): permalink_url = cand.get('permalink_url') - entry_id = self._extract_id(cand) if permalink_url and permalink_url.startswith('http'): - return permalink_url, entry_id + return entries.append( + self.url_result( + permalink_url, + ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + video_id=self._extract_id(cand), + video_title=cand.get('title'))) for e in collection: - permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url, video_id=entry_id)) + append_url_result(entries, e) next_href = response.get('next_href') if not next_href: From 7c5307f4c4e91ef6551d70cd844b93fbdc5c3cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Feb 2019 23:40:06 +0700 Subject: [PATCH 225/226] [soundcloud:pagedplaylist] Improve (closes #19086) --- youtube_dl/extractor/soundcloud.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 1c8d3c53b..5536e7851 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, unified_strdate, update_url_query, + url_or_none, ) @@ -395,20 +396,23 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): # Empty collection may be returned, in this case we proceed # straight to next_href - def append_url_result(entries, item): - for cand in (item, item.get('track'), item.get('playlist')): - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - if permalink_url and permalink_url.startswith('http'): - return entries.append( - self.url_result( - permalink_url, - ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, - video_id=self._extract_id(cand), - video_title=cand.get('title'))) + def resolve_entry(candidates): + for cand in candidates: + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + video_id=self._extract_id(cand), + video_title=cand.get('title')) for e in collection: - append_url_result(entries, e) + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) next_href = response.get('next_href') if not next_href: From 0efcb5a2fe0c3024d3e5affe74b3d0d416413ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Feb 2019 00:33:45 +0700 Subject: [PATCH 226/226] [vporn] Remove extractor (closes #16276) Handled by generic extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/vporn.py | 123 ----------------------------- 2 files changed, 124 deletions(-) delete mode 100644 youtube_dl/extractor/vporn.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b40be42e6..693c16e49 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1362,7 +1362,6 @@ from .voxmedia import ( VoxMediaVolumeIE, VoxMediaIE, ) -from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE from .vrv import ( diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py deleted file mode 100644 index 858ac9e71..000000000 --- a/youtube_dl/extractor/vporn.py +++ /dev/null @@ -1,123 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - str_to_int, - urljoin, -) - - -class VpornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', - 'md5': 'facf37c1b86546fa0208058546842c55', - 'info_dict': { - 'id': '497944', - 'display_id': 'violet-on-her-th-birthday', - 'ext': 'mp4', - 'title': 'Violet on her 19th birthday', - 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'kileyGrope', - 'categories': ['Masturbation', 'Teen'], - 'duration': 393, - 'age_limit': 18, - 'view_count': int, - }, - 'skip': 'video removed', - }, - { - 'url': 'http://www.vporn.com/female/hana-shower/523564/', - 'md5': 'ced35a4656198a1664cf2cda1575a25f', - 'info_dict': { - 'id': '523564', - 'display_id': 'hana-shower', - 'ext': 'mp4', - 'title': 'Hana Shower', - 'description': 'Hana showers at the bathroom.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Hmmmmm', - 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'], - 'duration': 588, - 'age_limit': 18, - 'view_count': int, - } - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!' - if errmsg in webpage: - raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) - - title = self._html_search_regex( - r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() - description = self._html_search_regex( - r'class="(?:descr|description_txt)">(.*?)</div>', - webpage, 'description', fatal=False) - thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', - default=None)) - - uploader = self._html_search_regex( - r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - - categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage) - - duration = parse_duration(self._search_regex( - r'Runtime:\s*</span>\s*(\d+ min \d+ sec)', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'class="views">([\d,\.]+) [Vv]iews<', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r"'Comments \(([\d,\.]+)\)'", - webpage, 'comment count', default=None)) - - formats = [] - - for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage): - video_url = video[1] - fmt = { - 'url': video_url, - 'format_id': video[0], - } - m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url) - if m: - fmt.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'vbr': int(m.group('vbr')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'categories': categories, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - }