2014-07-22 17:34:34 +02:00
from __future__ import unicode_literals
2017-03-05 03:13:30 +01:00
from . common import InfoExtractor
2016-06-20 00:40:00 +02:00
from . theplatform import ThePlatformFeedIE
2015-11-21 17:18:17 +01:00
from . . utils import (
2016-04-01 08:33:37 +02:00
int_or_none ,
2017-03-05 03:13:30 +01:00
js_to_json ,
2016-04-01 08:33:37 +02:00
find_xpath_attr ,
2017-03-05 03:13:30 +01:00
RegexNotFoundError ,
2016-09-22 20:27:57 +02:00
xpath_element ,
xpath_text ,
update_url_query ,
2017-03-05 03:13:30 +01:00
urljoin ,
2015-11-21 17:18:17 +01:00
)
2013-12-16 03:53:43 +01:00
2016-06-20 00:40:00 +02:00
class CBSBaseIE(ThePlatformFeedIE):
    """Base class for CBS extractors built on top of ThePlatformFeedIE."""

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Return the subtitles advertised by a SMIL document.

        Looks for a ``param`` element (resolved against ``namespace``) whose
        ``name`` attribute is ``ClosedCaptionURL`` and exposes its ``value``
        attribute as a TTML subtitle track.

        :param smil: parsed SMIL XML element to search.
        :param namespace: optional XML namespace applied to the XPath.
        :param subtitles_lang: language key used for the subtitle track.
        :returns: ``{subtitles_lang: [{'ext': 'ttml', 'url': ...}]}`` when a
            non-empty caption URL is present, otherwise an empty dict.
        """
        closed_caption_e = find_xpath_attr(
            smil, self._xpath_ns('.//param', namespace),
            'name', 'ClosedCaptionURL')
        caption_url = None
        if closed_caption_e is not None:
            caption_url = closed_caption_e.attrib.get('value')
        if not caption_url:
            # Always hand back a dict so the result has one consistent type
            # regardless of whether captions were found.
            return {}
        return {
            subtitles_lang: [{
                'ext': 'ttml',
                'url': caption_url,
            }],
        }
2017-03-05 03:13:30 +01:00
class CBSShowIE(InfoExtractor):
    """Extract a cbs.com show page as a playlist of full episodes and clips."""

    IE_DESC = 'CBS show playlists, including full episodes and clips'
    IE_NAME = 'cbs.com:playlist'

    # The "www." prefix is optional and the dot in "cbs.com" is escaped so
    # that only the literal host name matches.
    _VALID_URL = r'(?i)https?://(?:www\.)?cbs\.com/shows/(?P<id>[\w-]+)/?$'

    _TEST = {
        'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert',
        'info_dict': {
            'id': 61456254,
            'title': 'The Late Show with Stephen Colbert',
        },
        'playlist_mincount': 14,
        # A 'playlist' key is deliberately omitted: with it, the test harness
        # tries to download all 30 playlist entries. Even limited to 10KB
        # each, this can take 15 minutes. Not reasonable.
    }

    def carousel_playlist(self, url, type, video_id=None):
        """Download one carousel JSON document and turn it into a playlist.

        :param url: absolute URL of the carousel JSON endpoint.
        :param type: human-readable carousel kind used in the progress note
            (e.g. ``'episodes'`` or ``'clips'``). The name shadows the
            builtin but is kept so existing keyword callers keep working.
        :param video_id: optional identifier shown alongside the progress
            note while downloading.
        :returns: a playlist result whose entries are URL results handled by
            the ``CBS`` extractor. The playlist title is the
            ``series_title`` of the first item; an empty carousel yields an
            empty, untitled playlist.
        """
        carousel = self._download_json(
            url, video_id, 'Downloading %s carousel' % type)
        episodes = carousel['result']['data']
        if not episodes:
            # Nothing to take a title from; avoid indexing an empty list.
            return self.playlist_result([])

        entries = [
            self.url_result(
                urljoin(url, ep['app_url']),
                'CBS',
                ep['content_id'],
                ep['episode_title'])
            for ep in episodes
        ]
        return self.playlist_result(
            entries, playlist_title=episodes[0]['series_title'])

    def _real_extract(self, url):
        """Build the combined episodes + clips playlist for a show page.

        :param url: show page URL matching ``_VALID_URL``.
        :returns: playlist dict whose ``id`` is the numeric show id found in
            the page and whose entries are the full episodes followed by
            the clips (if the page has a clip carousel).
        """
        show_name = self._match_id(url)
        webpage = self._download_webpage(url, show_name)

        # The show id is embedded as a JS object literal, i.e. not-quite
        # JSON (no double quotes):
        #   var show = new CBS.Show({id:61456254});
        show_id_json = self._search_regex(
            r'new CBS\.Show\(([^)]*)\);', webpage, 'show_id')
        show = self._parse_json(
            show_id_json, show_name, transform_source=js_to_json)

        # Found in http://www.cbs.com/assets/min/js/min/com.cbs.min.js?20170303-224247
        # unminified at http://www.cbs.com/assets/js/min/com.cbs.js
        # http://www.cbs.com/carousels/shows/61456254/offset/0/limit/15/xs/0/
        # => {id: 240172, title: "Full Episodes",
        episodes_url = urljoin(
            url, '/carousels/shows/%d/offset/0/limit/15/xs/0/' % show['id'])

        # The clip carousel id comes from a snippet such as:
        #   var loader = new CBS.V2.CarouselLoader({
        #       'video-preview-carousel': function(element) {
        #           element.videoCarousel({
        #               id        : 241426,
        #               templates : 'carousels/videoAdaptive',
        #               scroll    : 3,
        #               layout    : 3,
        #               start     : 0,
        #               saveState : false
        #           });
        #       }
        try:
            # Only the regex lookup is expected to fail on pages without a
            # clip carousel, so nothing else lives inside the try body.
            clip_carousel_json = self._search_regex(
                r'element\.videoCarousel\(([^)]*)\);', webpage,
                'clip carousel')
        except RegexNotFoundError:
            clips = {'entries': []}
        else:
            clipdata = self._parse_json(
                clip_carousel_json, show_name, transform_source=js_to_json)
            # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/
            # => {id: 241426, title: "Clips",
            clips_url = urljoin(
                url,
                '/carousels/videosBySection/%d/offset/0/limit/15/xs/0'
                % clipdata['id'])
            clips = self.carousel_playlist(clips_url, 'clips', show_name)

        # We separately retrieve a carousel of full episodes, and also one of
        # clips. Clips are identifiable as such because they lack an
        # "episode_number" field, unlike full episodes.
        #
        # It might be desirable to specify only retrieving a playlist of one
        # or the other, but there isn't a good way for users to pass such
        # parameters to InfoExtractors (custom URLs, maybe? With cbs: URLs?).
        #
        # But since the playlist is filterable, only full episodes can be
        # returned with:
        #   youtube_dl --match-filter 'episode_number' http://...
        # and similarly, only clips can be returned with:
        #   youtube_dl --match-filter '!episode_number' http://...
        playlist = self.carousel_playlist(episodes_url, 'episodes', show_name)
        playlist['entries'] += clips['entries']
        if not playlist.get('title') and clips.get('title'):
            # The episodes carousel was empty; fall back to the clips title.
            playlist['title'] = clips['title']
        playlist['id'] = show['id']
        return playlist
2016-04-01 11:12:29 +02:00
class CBSIE(CBSBaseIE):
    """Extract a single CBS / colbertlateshow.com video by content id."""

    _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
        'info_dict': {
            'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
            'ext': 'mp4',
            'title': 'Connect Chat feat. Garth Brooks',
            'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
            'duration': 1495,
            'timestamp': 1385585425,
            'upload_date': '20131127',
            'uploader': 'CBSI-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        '_skip': 'Blocked outside the US',
    }, {
        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
        'only_matching': True,
    }, {
        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
        'only_matching': True,
    }]

    def _extract_video_info(self, content_id):
        """Collect formats, subtitles and metadata for one content id.

        Downloads the video player service XML for ``content_id``, then
        requests one SMIL document per distinct ``assetType`` listed there
        and merges the resulting formats and subtitles.

        :param content_id: CBS content id (the ``id`` group of ``_VALID_URL``).
        :returns: info dict containing thePlatform metadata updated with id,
            title, series/season/episode data, duration, thumbnail, formats
            and subtitles.
        :raises: whatever the fatal XML lookups raise when the service
            response has no ``item`` element or no ``videoTitle``.
        """
        items_data = self._download_xml(
            'http://can.cbs.com/thunder/player/videoPlayerService.php',
            content_id, query={'partner': 'cbs', 'contentId': content_id})
        # Fatal lookup: fail with a clear "element not found" error instead
        # of passing None on to the xpath_text() calls below.
        video_data = xpath_element(items_data, './/item', 'item', True)
        title = xpath_text(video_data, 'videoTitle', 'title', True)

        tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id
        tp_release_url = 'http://link.theplatform.com/s/' + tp_path

        seen_asset_types = set()  # each asset type is requested only once
        subtitles = {}
        formats = []
        for item in items_data.findall('.//item'):
            asset_type = xpath_text(item, 'assetType')
            if not asset_type or asset_type in seen_asset_types:
                continue
            seen_asset_types.add(asset_type)

            query = {
                'mbr': 'true',
                'assetTypes': asset_type,
            }
            # Ask for the container formats that go with the asset type.
            if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
                query['formats'] = 'MPEG4,M3U'
            elif asset_type in ('RTMP', 'WIFI', '3G'):
                query['formats'] = 'MPEG4,FLV'

            tp_formats, tp_subtitles = self._extract_theplatform_smil(
                update_url_query(tp_release_url, query), content_id,
                'Downloading %s SMIL data' % asset_type)
            formats.extend(tp_formats)
            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
        self._sort_formats(formats)

        info = self._extract_theplatform_metadata(tp_path, content_id)
        info.update({
            'id': content_id,
            'title': title,
            'series': xpath_text(video_data, 'seriesTitle'),
            'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
            # videoLength is scaled down by 1000 (presumably ms -> s; the
            # unit is not visible here).
            'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
            'thumbnail': xpath_text(video_data, 'previewImageURL'),
            'formats': formats,
            'subtitles': subtitles,
        })
        return info

    def _real_extract(self, url):
        """Extract the content id from ``url`` and return its video info."""
        content_id = self._match_id(url)
        return self._extract_video_info(content_id)