From 34724e37bbe6fcfe31dbfb372989c64295c39779 Mon Sep 17 00:00:00 2001
From: John Hawkinson
Date: Sat, 4 Mar 2017 21:13:30 -0500
Subject: [PATCH] [cbs.com:playlist] Extract playlists of CBS shows.

---
Review notes:
 - The post-image blob ids on the "index" lines below come from the original
   submission and no longer match the revised content.
 - Indentation of the context lines was reconstructed from a
   whitespace-collapsed copy of this patch; confirm it against the tree
   before applying.

 youtube_dl/extractor/cbs.py        | 119 ++++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |   5 +-
 2 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 58f258c54..c709df097 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,12 +1,15 @@
 from __future__ import unicode_literals
 
+from .common import InfoExtractor
 from .theplatform import ThePlatformFeedIE
 from ..utils import (
     int_or_none,
+    js_to_json,
     find_xpath_attr,
     xpath_element,
     xpath_text,
     update_url_query,
+    urljoin,
 )
 
 
@@ -20,6 +23,122 @@ class CBSBaseIE(ThePlatformFeedIE):
             }]
         } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
 
+
+class CBSShowIE(InfoExtractor):
+    """Extract a CBS show page as a playlist of its full episodes and clips.
+
+    Episodes and clips are listed by two JSON "carousel" endpoints; each item
+    is handed to the CBS video extractor through a url_result entry.
+    """
+
+    IE_DESC = 'CBS show playlists, including full episodes and clips'
+    IE_NAME = 'cbs.com:playlist'
+    _VALID_URL = r'(?i)https?://(?:www\.)?cbs\.com/shows/(?P<id>[\w-]+)'
+    _TEST = {
+        'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert',
+        'info_dict': {
+            'id': '61456254',
+            'title': 'The Late Show with Stephen Colbert',
+        },
+        'playlist_mincount': 14,
+        # No per-entry 'playlist' check here: with one, the test harness tries
+        # to download all 30 playlist entries. Even limited to 10KB each, this
+        # can take 15 minutes. Not reasonable.
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        # /shows/<show>/video/<id> URLs are single videos and belong to CBSIE.
+        return False if CBSIE.suitable(url) else super(CBSShowIE, cls).suitable(url)
+
+    def carousel_playlist(self, url, type, video_id=None):
+        """Download one carousel and return its items as a playlist result.
+
+        url is the carousel JSON endpoint, type a label for the progress
+        message ('episodes' or 'clips'), and video_id is passed on to
+        _download_json as the video id. An empty carousel yields a playlist
+        with no entries and no title.
+        """
+        carousel = self._download_json(
+            url, video_id, 'Downloading %s carousel' % type)
+        episodes = carousel['result']['data'] or []
+
+        entries = [
+            self.url_result(
+                urljoin(url, ep['app_url']), 'CBS',
+                ep.get('content_id'), ep.get('episode_title'))
+            for ep in episodes]
+
+        # Take the series title from the first item of the carousel.
+        carousel_title = episodes[0].get('series_title') if episodes else None
+        return self.playlist_result(entries, playlist_title=carousel_title)
+
+    def _real_extract(self, url):
+        show_name = self._match_id(url)
+        webpage = self._download_webpage(url, show_name)
+
+        # not-quite JSON, no double-quotes:
+        # var show = new CBS.Show({id:61456254});
+        show = self._parse_json(
+            self._search_regex(
+                r'new CBS\.Show\(([^)]*)\);', webpage, 'show_id'),
+            show_name, transform_source=js_to_json)
+        show_id = show['id']
+
+        # Found in http://www.cbs.com/assets/min/js/min/com.cbs.min.js?20170303-224247
+        # unminified at http://www.cbs.com/assets/js/min/com.cbs.js
+        # http://www.cbs.com/carousels/shows/61456254/offset/0/limit/15/xs/0/
+        # => {id: 240172, title: "Full Episodes",
+        episodes_url = urljoin(
+            url, '/carousels/shows/%s/offset/0/limit/15/xs/0/' % show_id)
+
+        # The clip carousel id is passed to a widget in the page, if any:
+        # var loader = new CBS.V2.CarouselLoader({
+        #     'video-preview-carousel': function(element) {
+        #         element.videoCarousel({
+        #             id : 241426,
+        #             templates : 'carousels/videoAdaptive',
+        #             scroll : 3,
+        #             layout : 3,
+        #             start : 0,
+        #             saveState : false
+        #         });
+        #     }
+        # A show page without that widget simply contributes no clips.
+        clip_entries = []
+        clip_carousel = self._search_regex(
+            r'element\.videoCarousel\(([^)]*)\);', webpage,
+            'clip carousel', default=None)
+        if clip_carousel:
+            clipdata = self._parse_json(
+                clip_carousel, show_name, transform_source=js_to_json)
+            # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/
+            # => {id: 241426, title: "Clips",
+            clips_url = urljoin(
+                url,
+                '/carousels/videosBySection/%s/offset/0/limit/15/xs/0/' % clipdata['id'])
+            clip_entries = self.carousel_playlist(
+                clips_url, 'clips', show_name)['entries']
+
+        # We separately retrieve a carousel of full episodes, and also one of clips.
+        # Clips are identifiable as such because they lack an "episode_number" field,
+        # unlike full episodes.
+        #
+        # It might be desirable to specify only retrieving a playlist of one or the other,
+        # but there isn't a good way for users to pass such parameters to InfoExtractors
+        # (custom URLs, maybe? With cbs: URLs?).
+        #
+        # But since the playlist is filterable, only full episodes can be returned with:
+        # youtube_dl --match-filter 'episode_number' http://...
+        # and similarly, only clips can be returned with:
+        # youtube_dl --match-filter '!episode_number' http://...
+
+        playlist = self.carousel_playlist(episodes_url, 'episodes', show_name)
+        playlist['entries'] += clip_entries
+        # Playlist ids are strings, as in _TEST above.
+        playlist['id'] = '%s' % show_id
+        return playlist
+
 
 class CBSIE(CBSBaseIE):
     _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index caf1dc766..ec3c48ad9 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -150,7 +150,10 @@ from .cbc import (
     CBCWatchVideoIE,
     CBCWatchIE,
 )
-from .cbs import CBSIE
+from .cbs import (
+    CBSIE,
+    CBSShowIE,
+)
 from .cbslocal import CBSLocalIE
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (