diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index cf8e6e411..73f46ec04 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.15 + [debug] youtube-dl version 2020.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index babbda464..7e3c9f669 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 5498983ff..b9bb3bd11 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d46735951..265ea80c1 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.15 + [debug] youtube-dl version 2020.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 748b64756..e71778a3d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index cc7fc4323..94aa9f327 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version 2020.01.24 + +Extractors +* [youtube] Fix sigfunc name extraction (#23819) +* [stretchinternet] Fix extraction (#4319) +* [voicerepublic] Fix extraction +* [azmedien] Fix extraction (#23783) +* [businessinsider] Fix jwplatform id extraction (#22929, #22954) ++ [24video] Add support for 24video.vip (#23753) +* [ivi:compilation] Fix entries extraction (#23770) +* [ard] Improve extraction (#23761) + * Simplify extraction + + Extract age limit and series + * Bypass geo-restriction ++ [nbc] Add support for nbc multi network URLs (#23049) +* [americastestkitchen] Fix extraction +* [zype] Improve extraction + + Extract subtitles (#21258) + + Support URLs with alternative keys/tokens (#21258) + + Extract more metadata +* [orf:tvthek] Improve geo restricted videos detection (#23741) +* [soundcloud] Restore previews extraction (#23739) + + version 2020.01.15 Extractors diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a0b09f5b1..a1372d389 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -238,7 +238,7 @@ class SoundcloudIE(InfoExtractor): 'ext': 'mp3', 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Giovanni Sarani', + 'uploader': 'Micronie', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -524,7 +524,17 @@ class SoundcloudIE(InfoExtractor): class SoundcloudPlaylistBaseIE(SoundcloudIE): - def _extract_track_entries(self, tracks, token=None): + 
def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }) entries = [] for track in tracks: track_id = str_or_none(track.get('id')) @@ -537,7 +547,10 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE): url += '?secret_token=' + token entries.append(self.url_result( url, SoundcloudIE.ie_key(), track_id)) - return entries + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -548,6 +561,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'info_dict': { 'id': '2284613', 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', }, 'playlist_mincount': 5, }, { @@ -570,13 +584,10 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = self._extract_track_entries(info['tracks'], token) - - return self.playlist_result( - entries, str_or_none(info.get('id')), info.get('title')) + return self._extract_set(info, token) -class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { 'limit': 2000000000, @@ -774,10 +785,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query) - entries = self._extract_track_entries(data['tracks'], token) - - return self.playlist_result( - entries, playlist_id, 
data.get('title'), data.get('description')) + return self._extract_set(data, token) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index a3c35a899..378fc7568 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -13,36 +13,18 @@ from ..utils import ( class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' _TESTS = [{ - 'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', + 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', 'ext': 'mp4', - 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', - 'categories': ['Badminton'], + 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', + 'categories': ['Badminton-Deutschland'], 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'timestamp': int, - 'upload_date': 're:^201408[23][0-9]$', + 'upload_date': '20200201', + 'description': 're:.*', # meaningless description for THIS video }, - 'params': { - 'skip_download': 'Live stream', - }, - }, { - 'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'info_dict': { - 'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', - 'ext': 'mp4', - 'upload_date': '20140825', - 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', - 'timestamp': 1408976060, - 'duration': 2732, - 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 
Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', - 'thumbnail': r're:^https?://.*\.jpg$', - 'view_count': int, - 'categories': ['Li-Ning Badminton WM 2014'], - - } }] def _real_extract(self, url): @@ -50,7 +32,7 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( + api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) req = sanitized_Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 0901c3163..e12389cad 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,19 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( determine_ext, dict_get, int_or_none, - orderedSet, + str_or_none, strip_or_none, try_get, - urljoin, - compat_str, ) @@ -237,23 +232,23 @@ class SVTPlayIE(SVTPlayBaseIE): class SVTSeriesIE(SVTPlayBaseIE): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)(?:.+?\btab=(?P[^&#]+))?' 
_TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { - 'id': 'rederiet', + 'id': '14445680', 'title': 'Rederiet', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, 'playlist_mincount': 318, }, { - 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', 'info_dict': { - 'id': 'rederiet-sasong2', + 'id': 'season-2-14445680', 'title': 'Rederiet - Säsong 2', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, - 'playlist_count': 12, + 'playlist_mincount': 12, }] @classmethod @@ -261,83 +256,87 @@ class SVTSeriesIE(SVTPlayBaseIE): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - series_id = self._match_id(url) + series_slug, season_id = re.match(self._VALID_URL, url).groups() - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - season_slug = qs.get('tab', [None])[0] - - if season_slug: - series_id += '-%s' % season_slug - - webpage = self._download_webpage( - url, series_id, 'Downloading series page') - - root = self._parse_json( - self._search_regex( - self._SVTPLAY_RE, webpage, 'content', group='json'), - series_id) + series = self._download_json( + 'https://api.svt.se/contento/graphql', series_slug, + 'Downloading series page', query={ + 'query': '''{ + listablesBySlug(slugs: ["%s"]) { + associatedContent(include: [productionPeriod, season]) { + items { + item { + ... 
on Episode { + videoSvtId + } + } + } + id + name + } + id + longDescription + name + shortDescription + } +}''' % series_slug, + })['data']['listablesBySlug'][0] season_name = None entries = [] - for season in root['relatedVideoContent']['relatedVideosAccordion']: + for season in series['associatedContent']: if not isinstance(season, dict): continue - if season_slug: - if season.get('slug') != season_slug: + if season_id: + if season.get('id') != season_id: continue season_name = season.get('name') - videos = season.get('videos') - if not isinstance(videos, list): + items = season.get('items') + if not isinstance(items, list): continue - for video in videos: - content_url = video.get('contentUrl') - if not content_url or not isinstance(content_url, compat_str): + for item in items: + video = item.get('item') or {} + content_id = video.get('videoSvtId') + if not content_id or not isinstance(content_id, compat_str): continue - entries.append( - self.url_result( - urljoin(url, content_url), - ie=SVTPlayIE.ie_key(), - video_title=video.get('title') - )) + entries.append(self.url_result( + 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) - metadata = root.get('metaData') - if not isinstance(metadata, dict): - metadata = {} - - title = metadata.get('title') - season_name = season_name or season_slug + title = series.get('name') + season_name = season_name or season_id if title and season_name: title = '%s - %s' % (title, season_name) - elif season_slug: - title = season_slug + elif season_id: + title = season_id return self.playlist_result( - entries, series_id, title, metadata.get('description')) + entries, season_id or series.get('id'), title, + dict_get(series, ('longDescription', 'shortDescription'))) class SVTPageIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P(?:[^/]+/)*(?P[^/?&#]+))' _TESTS = [{ - 'url': 
'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', 'info_dict': { - 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', - 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + 'id': '25298267', + 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', }, - 'playlist_count': 7, + 'playlist_count': 4, }, { - 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', 'info_dict': { - 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', - 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + 'id': '24243746', + 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', }, - 'playlist_count': 1, + 'playlist_count': 2, }, { # only programTitle 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', 'info_dict': { - 'id': '2900353', + 'id': '8439V2K', 'ext': 'mp4', 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, @@ -356,16 +355,26 @@ class SVTPageIE(InfoExtractor): return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): - playlist_id = self._match_id(url) + path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + article = self._download_json( + 'https://api.svt.se/nss-api/page/' + path, display_id, + query={'q': 'articles'})['articles']['content'][0] - entries = [ - self.url_result( - 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r'data-video-id=["\'](\d+)', webpage))] + entries = [] - title = strip_or_none(self._og_search_title(webpage, default=None)) + def _process_content(content): + if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): + 
video_id = compat_str(content['image']['svtId']) + entries.append(self.url_result( + 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) - return self.playlist_result(entries, playlist_id, title) + for media in article.get('media', []): + _process_content(media) + + for obj in article.get('structuredBody', []): + _process_content(obj.get('content') or {}) + + return self.playlist_result( + entries, str_or_none(article.get('id')), + strip_or_none(article.get('title'))) diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py index 88b6baa31..b7fe082b9 100644 --- a/youtube_dl/extractor/tv5mondeplus.py +++ b/youtube_dl/extractor/tv5mondeplus.py @@ -3,31 +3,51 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, extract_attributes, - get_element_by_class, int_or_none, parse_duration, - parse_iso8601, ) class TV5MondePlusIE(InfoExtractor): IE_DESC = 'TV5MONDE+' - _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', - 'md5': '12130fc199f020673138a83466542ec6', + _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P[^/?#]+)' + _TESTS = [{ + # movie + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit', + 'md5': '8cbde5ea7b296cf635073e27895e227f', 'info_dict': { - 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'id': '822a4756-0712-7329-1859-a13ac7fd1407', + 'display_id': 'rendez-vous-a-atlit', 'ext': 'mp4', - 'title': 'Tdah, mon amour - Enfants', - 'description': 'md5:230e3aca23115afcf8006d1bece6df74', - 'upload_date': '20170401', - 'timestamp': 1491022860, - } - } + 'title': 'Rendez-vous à Atlit', + 'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb', + 'upload_date': '20200130', + }, + }, { + # 
series episode + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree', + 'info_dict': { + 'id': '0df7007c-4900-3936-c601-87a13a93a068', + 'display_id': 'c-est-la-vie-ennemie-juree', + 'ext': 'mp4', + 'title': "C'est la vie - Ennemie jurée", + 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e', + 'upload_date': '20200130', + 'series': "C'est la vie", + 'episode': 'Ennemie jurée', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', + 'only_matching': True, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', + 'only_matching': True, + }] _GEO_BYPASS = False def _real_extract(self, url): @@ -37,11 +57,7 @@ class TV5MondePlusIE(InfoExtractor): if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: self.raise_geo_restricted(countries=['FR']) - series = get_element_by_class('video-detail__title', webpage) - title = episode = get_element_by_class( - 'video-detail__subtitle', webpage) or series - if series and series != title: - title = '%s - %s' % (series, title) + title = episode = self._html_search_regex(r'

([^<]+)', webpage, 'title') vpl_data = extract_attributes(self._search_regex( r'(<[^>]+class="video_player_loader"[^>]+>)', webpage, 'video player loader')) @@ -65,15 +81,37 @@ class TV5MondePlusIE(InfoExtractor): }) self._sort_formats(formats) + description = self._html_search_regex( + r'(?s)]+class=["\']episode-texte[^>]+>(.+?)', webpage, + 'description', fatal=False) + + series = self._html_search_regex( + r']+class=["\']episode-emission[^>]+>([^<]+)', webpage, + 'series', default=None) + + if series and series != title: + title = '%s - %s' % (series, title) + + upload_date = self._search_regex( + r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', + webpage, 'upload date', default=None) + if upload_date: + upload_date = upload_date.replace('_', '') + + video_id = self._search_regex( + (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', + default=display_id) + return { - 'id': display_id, + 'id': video_id, 'display_id': display_id, 'title': title, - 'description': clean_html(get_element_by_class('video-detail__description', webpage)), + 'description': description, 'thumbnail': vpl_data.get('data-image'), 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), + 'upload_date': upload_date, 'formats': formats, - 'episode': episode, 'series': series, + 'episode': episode, } diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 0b863df2f..443f46e8a 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -9,8 +9,8 @@ from ..utils import ( class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P\d+)' - _TEST = { + _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P\d+)' + _TESTS = [{ 'url': 'https://videos.tva.ca/details/_5596811470001', 'info_dict': { 'id': 
'5596811470001', @@ -24,7 +24,10 @@ class TVAIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://video.tva.ca/details/_5596811470001', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a8c2502af..0db2dca41 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -575,8 +575,8 @@ class TwitchStreamIE(TwitchBaseIE): channel_id = self._match_id(url) stream = self._call_api( - 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, - 'Downloading stream JSON').get('stream') + 'kraken/streams/%s?stream_type=all' % channel_id.lower(), + channel_id, 'Downloading stream JSON').get('stream') if not stream: raise ExtractorError('%s is offline' % channel_id, expected=True) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 851ad936c..d6b92b1c8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,28 +1,62 @@ from __future__ import unicode_literals -import base64 +import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - clean_html, - determine_ext, int_or_none, - js_to_json, parse_age_limit, - parse_duration, - try_get, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' + _API_BASE = 'https://prod-api.viewlift.com/' + _DOMAINS_REGEX = 
r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' + _SITE_MAP = { + 'ftfnext': 'lax', + 'funnyforfree': 'snagfilms', + 'hoichoi': 'hoichoitv', + 'kiddovid': 'snagfilms', + 'laxsportsnetwork': 'lax', + 'legapallacanestro': 'lnp', + 'marquee': 'marquee-tv', + 'monumentalsportsnetwork': 'monumental-network', + 'moviespree': 'bingeflix', + 'pflmma': 'pfl', + 'snagxtreme': 'snagfilms', + 'theidentitytb': 'tampabay', + 'vayafilm': 'snagfilms', + } + _TOKENS = {} + + def _call_api(self, site, path, video_id, query): + token = self._TOKENS.get(site) + if not token: + token_query = {'site': site} + email, password = self._get_login_info(netrc_machine=site) + if email: + resp = self._download_json( + self._API_BASE + 'identity/signin', video_id, + 'Logging in', query=token_query, data=json.dumps({ + 'email': email, + 'password': password, + }).encode()) + else: + resp = self._download_json( + self._API_BASE + 'identity/anonymous-token', video_id, + 'Downloading authorization token', query=token_query) + self._TOKENS[site] = token = resp['authorizationToken'] + return self._download_json( + self._API_BASE + path, video_id, + headers={'Authorization': token}, query=query) class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift:embed' + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -30,6 +64,9 @@ class 
ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', + 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', + 'timestamp': 1334350096, + 'upload_date': '20120413', } }, { # invalid labels, 360p is better that 480p @@ -39,7 +76,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', 'ext': 'mp4', 'title': 'Life in Limbo', - } + }, + 'skip': 'The video does not exist', }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -54,67 +92,68 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): return mobj.group('url') def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if '>This film is not playable in your area.<' in webpage: - raise ExtractorError( - 'Film %s is not playable in your area.' % video_id, expected=True) + domain, film_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + try: + content_data = self._call_api( + site, 'entitlement/video/status', film_id, { + 'id': film_id + })['video'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') + if error_message == 'User does not have a valid subscription or has not purchased this content.': + self.raise_login_required() + raise ExtractorError(error_message, expected=True) + raise + gist = content_data['gist'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] formats = [] - has_bitrate = False - sources = self._parse_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, - 'sources', default='[]'), video_id, js_to_json) - for source in sources: - file_ = source.get('file') - if not file_: + mpeg_video_assets = video_assets.get('mpeg') 
or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset_url: continue - type_ = source.get('type') - ext = determine_ext(file_) - format_id = source.get('label') or ext - if all(v in ('m3u8', 'hls') for v in (type_, ext)): - formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - bitrate = int_or_none(self._search_regex( - [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], - file_, 'bitrate', default=None)) - if not has_bitrate and bitrate: - has_bitrate = True - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append({ - 'url': file_, - 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), - 'tbr': bitrate, - 'height': height, - }) - if not formats: - hls_url = self._parse_json(self._search_regex( - r'filmInfo\.src\s*=\s*({.+?});', - webpage, 'src'), video_id, js_to_json)['src'] - formats = self._extract_m3u8_formats( - hls_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') - self._sort_formats(formats, field_preference) + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - title = self._search_regex( - [r"title\s*:\s*'([^']+)'", r'([^<]+)'], - webpage, 'title') + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) - return { - 'id': video_id, + info = { + 'id': film_id, 'title': title, +
'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info class ViewLiftIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift' + _VALID_URL = r'https?://(?:www\.)?(?P%s)(?P(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -151,10 +190,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'id': '00000148-7b53-de26-a9fb-fbf306f70020', 'display_id': 'augie_alone/s_2_ep_12_love', 'ext': 'mp4', - 'title': 'Augie, Alone:S. 2 Ep. 12 - Love', - 'description': 'md5:db2a5c72d994f16a780c1eb353a8f403', + 'title': 'S. 2 Ep. 
12 - Love', + 'description': 'Augie finds love.', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 107, + 'upload_date': '20141012', + 'timestamp': 1413129540, + 'age_limit': 17, }, 'params': { 'skip_download': True, @@ -177,6 +219,9 @@ class ViewLiftIE(ViewLiftBaseIE): # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', 'only_matching': True, + }, { + 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', + 'only_matching': True, }] @classmethod @@ -184,119 +229,22 @@ class ViewLiftIE(ViewLiftBaseIE): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - if ">Sorry, the Film you're looking for is not available.<" in webpage: - raise ExtractorError( - 'Film %s is not available.' % display_id, expected=True) - - initial_store_state = self._search_regex( - r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", - webpage, 'Initial Store State', default=None) - if initial_store_state: - modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( - initial_store_state).decode()), display_id)['page']['data']['modules'] - content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') - gist = content_data['gist'] - film_id = gist['id'] - title = gist['title'] - video_assets = try_get( - content_data, lambda x: x['streamingInfo']['videoAssets'], dict) - if not video_assets: - token = self._download_json( - 'https://prod-api.viewlift.com/identity/anonymous-token', - film_id, 'Downloading authorization token', - query={'site': 'snagfilms'})['authorizationToken'] - video_assets = self._download_json( - 'https://prod-api.viewlift.com/entitlement/video/status', - film_id, headers={ - 'Authorization': token, - 'Referer': url, - }, query={ - 'id': 
film_id - })['video']['streamingInfo']['videoAssets'] - - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: - video_asset_url = video_asset.get('url') - if not video_asset: - continue - bitrate = int_or_none(video_asset.get('bitrate')) - height = int_or_none(self._search_regex( - r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), - 'height', default=None)) - formats.append({ - 'url': video_asset_url, - 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), - 'tbr': bitrate, - 'height': height, - 'vcodec': video_asset.get('codec'), - }) - - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'format_id')) - - info = { - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': gist.get('description'), - 'thumbnail': gist.get('videoImageUrl'), - 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating')), - 'timestamp': int_or_none(gist.get('publishDate'), 1000), - 'formats': formats, - } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info - else: - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), - display_id) - - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break - else: - title = self._html_search_regex( - 
(r'itemprop="title">([^<]+)<', - r'(?s)itemprop="title">(.+?)(.+?)', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) - - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + domain, path, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + modules = self._call_api( + site, 'content/pages', display_id, { + 'includeContent': 'true', + 'moduleOffset': 1, + 'path': path, + 'site': site, + })['modules'] + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index baa46d5f3..f378aa283 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -841,33 +841,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._TITLE or self._html_search_regex( self._TITLE_RE, webpage, 'list title', fatal=False) - def _login_list_password(self, page_url, list_id, webpage): - login_form = self._search_regex( - r'(?s)]+?id="pw_form"(.*?)', - webpage, 'login form', default=None) - if not login_form: - return webpage - - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This album is protected by a password, use the 
--video-password option', expected=True) - fields = self._hidden_inputs(login_form) - token, vuid = self._extract_xsrft_and_vuid(webpage) - fields['token'] = token - fields['password'] = password - post = urlencode_postdata(fields) - password_path = self._search_regex( - r'action="([^"]+)"', login_form, 'password URL') - password_url = compat_urlparse.urljoin(page_url, password_path) - password_request = sanitized_Request(password_url, post) - password_request.add_header('Content-type', 'application/x-www-form-urlencoded') - self._set_vimeo_cookie('vuid', vuid) - self._set_vimeo_cookie('xsrft', token) - - return self._download_webpage( - password_request, list_id, - 'Verifying the password', 'Wrong password') - def _title_and_entries(self, list_id, base_url): for pagenum in itertools.count(1): page_url = self._page_url(base_url, pagenum) @@ -876,7 +849,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): 'Downloading page %s' % pagenum) if pagenum == 1: - webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) # Try extracting href first since not all videos are available via @@ -923,7 +895,7 @@ class VimeoUserIE(VimeoChannelIE): _BASE_URL_TEMPL = 'https://vimeo.com/%s' -class VimeoAlbumIE(VimeoChannelIE): +class VimeoAlbumIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:album' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'