2014-03-22 23:05:31 +01:00
from __future__ import unicode_literals
2019-02-20 23:59:07 +01:00
import json
2013-06-23 20:07:51 +02:00
import re
from . common import InfoExtractor
2014-12-13 12:24:42 +01:00
from . . compat import (
2013-06-23 20:07:51 +02:00
compat_parse_qs ,
2019-02-20 23:59:07 +01:00
compat_urllib_parse ,
2015-07-17 19:41:47 +02:00
compat_urllib_parse_unquote ,
2014-12-13 12:24:42 +01:00
)
from . . utils import (
2013-07-17 01:14:30 +02:00
determine_ext ,
2013-06-23 20:07:51 +02:00
ExtractorError ,
2014-08-22 01:36:07 +02:00
int_or_none ,
2016-07-06 03:19:55 +02:00
get_element_by_attribute ,
mimetype2ext ,
2013-06-23 20:07:51 +02:00
)
2014-03-22 23:05:31 +01:00
class MetacafeIE(InfoExtractor):
    """Extractor for ``metacafe.com/watch/<video_id>/<display_id>`` pages.

    Videos whose id carries the ``yt-`` or ``cb-`` prefix are delegated to
    the Youtube and ThePlatform extractors respectively; every other page is
    downloaded and probed with several strategies to locate the media URL(s).
    """

    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = 'metacafe'
    _TESTS = [
        # Youtube video
        {
            'add_ie': ['Youtube'],
            'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
            'info_dict': {
                'id': '_aUehQsCQtM',
                'ext': 'mp4',
                'upload_date': '20090102',
                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
                'uploader': 'PBS',
                'uploader_id': 'PBS'
            }
        },
        # Normal metacafe video
        {
            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
            'info_dict': {
                'id': '11121940',
                'ext': 'mp4',
                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
                'uploader': 'ign',
                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
            'skip': 'Page is temporarily unavailable.',
        },
        # metacafe video with family filter
        {
            'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/',
            'md5': 'b06082c5079bbdcde677a6291fbdf376',
            'info_dict': {
                'id': '2155630',
                'ext': 'mp4',
                'title': 'Adult Art By David Hart 156',
                'uploader': '63346',
                'description': 'md5:9afac8fc885252201ad14563694040fc',
            },
            'params': {
                'skip_download': True,
            },
        },
        # AnyClip video
        {
            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
            'info_dict': {
                'id': 'an-dVVXnuY7Jh77J',
                'ext': 'mp4',
                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
                'uploader': 'AnyClip',
                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
            },
        },
        # age-restricted video
        {
            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
            'info_dict': {
                'id': '5186653',
                'ext': 'mp4',
                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
                'uploader': 'Dwayne Pipe',
                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
                'age_limit': 18,
            },
        },
        # cbs video
        {
            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
            'info_dict': {
                'id': '8VD4r_Zws8VP',
                'ext': 'flv',
                'title': 'Open: This is Face the Nation, February 9',
                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
                'duration': 96,
                'uploader': 'CBSI-NEW',
                'upload_date': '20140209',
                'timestamp': 1391959800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
        # Movieclips.com video
        {
            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
            'info_dict': {
                'id': 'mv-Wy7ZU',
                'ext': 'mp4',
                'title': 'My Week with Marilyn - Do You Love Me?',
                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
                'uploader': 'movie_trailers',
                'duration': 176,
            },
            'params': {
                'skip_download': 'requires rtmpdump',
            }
        }
    ]

    def report_disclaimer(self):
        """Print a status line announcing that the disclaimer is being fetched."""
        self.to_screen('Retrieving disclaimer')

    def _real_extract(self, url):
        """Build the info dict (or a delegating url_result) for a watch URL.

        The media location is looked up with the following strategies, in
        order, stopping at the first one that yields something:

        1. ``&mediaURL=`` / ``&videoURL=`` (plus optional ``&gdaKey=``) in
           the page source;
        2. a ``<video src="...">`` tag;
        3. a ``name="flashvars"`` form value carrying ``mediaData``;
        4. a ``swfobject.embedSWF`` player whose config points to a SMIL file;
        5. a ``flashvars = {...};`` JSON object with a ``sources`` list.

        Raises ExtractorError when the page reports "not found", when the
        ``flashvars`` form value lacks usable media data, or when none of
        the strategies applies.
        """
        # Extract id and simplified title from URL
        video_id, display_id = re.match(self._VALID_URL, url).groups()

        # the video may come from an external site
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            # Check if video comes from YouTube
            if prefix == 'yt':
                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
            # CBS videos use theplatform.com
            if prefix == 'cb':
                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')

        headers = {
            # Disable family filter
            'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
        }

        # AnyClip videos require the flashversion cookie so that we get the link
        # to the mp4 file
        if video_id.startswith('an-'):
            headers['Cookie'] += 'flashVersion=0; '

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id, headers=headers)

        error = get_element_by_attribute(
            'class', 'notfound-page-title', webpage)
        if error:
            raise ExtractorError(error, expected=True)

        # Prefer the social-media meta tags; fall back to the page heading.
        video_title = self._html_search_meta(
            ['og:title', 'twitter:title'], webpage, 'title',
            default=None) or self._search_regex(
                r'<h1>(.*?)</h1>', webpage, 'title')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)

        # Stays None until one of the strategies below produces a list of
        # format dicts.
        formats = None

        # Strategy 1: media URL embedded as a query-string style parameter.
        mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
        if mobj is not None:
            media_url = compat_urllib_parse_unquote(mobj.group(1))
            video_ext = determine_ext(media_url)

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = media_url
            else:
                video_url = '%s?__gda__=%s' % (media_url, mobj.group(1))
            formats = [{
                'url': video_url,
                'ext': video_ext,
            }]

        # Strategy 2: plain HTML5 <video> tag.
        if formats is None:
            mobj = re.search(r'<video src="([^"]+)"', webpage)
            if mobj:
                formats = [{
                    'url': mobj.group(1),
                    'ext': 'mp4',
                }]

        # Strategy 3: url-encoded flashvars form value holding mediaData.
        if formats is None:
            flashvars = self._search_regex(
                r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
                default=None)
            if flashvars:
                vardict = compat_parse_qs(flashvars)
                if 'mediaData' not in vardict:
                    raise ExtractorError('Unable to extract media URL')
                mobj = re.search(
                    r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"',
                    vardict['mediaData'][0])
                if mobj is None:
                    raise ExtractorError('Unable to extract media URL')
                # The URL is JSON-escaped inside mediaData: undo the "\/".
                media_url = mobj.group('mediaURL').replace('\\/', '/')
                video_url = '%s?__gda__=%s' % (media_url, mobj.group('key'))
                formats = [{
                    'url': video_url,
                    'ext': determine_ext(video_url),
                }]

        # Strategy 4: SWF player -> XML config -> SMIL document.
        if formats is None:
            player_url = self._search_regex(
                r"swfobject\.embedSWF\('([^']+)'",
                webpage, 'config URL', default=None)
            if player_url:
                config_url = self._search_regex(
                    r'config=(.+)$', player_url, 'config URL')
                config_doc = self._download_xml(
                    config_url, video_id,
                    note='Downloading video config')
                smil_url = config_doc.find('.//properties').attrib['smil_file']
                smil_doc = self._download_xml(
                    smil_url, video_id,
                    note='Downloading SMIL document')
                base_url = smil_doc.find('./head/meta').attrib['base']
                formats = []
                for vn in smil_doc.findall('.//video'):
                    br = int(vn.attrib['system-bitrate'])
                    play_path = vn.attrib['src']
                    formats.append({
                        'format_id': 'smil-%d' % br,
                        'url': base_url,
                        'play_path': play_path,
                        'page_url': url,
                        'player_url': player_url,
                        # The text before the first ":" of the play path is
                        # used as the extension.
                        'ext': play_path.partition(':')[0],
                    })

        # Strategy 5: JSON flashvars object with a list of sources.
        if formats is None:
            flashvars_json = self._search_regex(
                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
                default=None)
            # Only parse when the regex matched, so that a page without the
            # object falls through to the "Unsupported video type" error
            # below instead of handing None to the JSON parser.
            flashvars = self._parse_json(
                flashvars_json, video_id,
                fatal=False) if flashvars_json else None
            if flashvars:
                formats = []
                # "sources" may be absent; treat that as no sources.
                for source in flashvars.get('sources') or []:
                    source_url = source.get('src')
                    if not source_url:
                        continue
                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
                    if ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            source_url, video_id, 'mp4',
                            'm3u8_native', m3u8_id='hls', fatal=False))
                    else:
                        formats.append({
                            'url': source_url,
                            'ext': ext,
                        })

        if formats is None:
            raise ExtractorError('Unsupported video type')

        description = self._html_search_meta(
            ['og:description', 'twitter:description', 'description'],
            webpage, 'description', fatal=False)
        thumbnail = self._html_search_meta(
            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, 'uploader nickname', fatal=False)
        duration = int_or_none(
            self._html_search_meta('video:duration', webpage, default=None))
        # The page flags restricted content through either of two markers.
        age_limit = (
            18
            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
            else 0)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'description': description,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'age_limit': age_limit,
            'formats': formats,
            'duration': duration,
        }