diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c2bd5d8ae..128e6e681 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.01 +[debug] youtube-dl version 2018.06.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index fbf7cecb2..f064a0d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ youtube-dl.zsh *.iml tmp/ +venv/ diff --git a/ChangeLog b/ChangeLog index 916b8edb8..8eb7469d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,210 @@ +version 2018.06.25 + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + +version 2018.06.19 + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + +version 2018.06.18 + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + +version 2018.06.11 + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + +version 2018.06.04 + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + +version 2018.06.02 + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + +version 2018.05.30 + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + +version 2018.05.26 + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection(#10127) +* [viewlift] fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + +version 2018.05.18 + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + +version 2018.05.09 + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +* Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams (#16347) +* [itv] Improve extraction (#16253) + + version 2018.05.01 Core diff --git a/README.md b/README.md index 5af0f387b..499a0c206 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable experimental SOCKS proxy, specify - a proper scheme. For example + To enable SOCKS proxy, specify a proper + scheme. For example socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds @@ -106,16 +106,18 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the - options is not present) is used for the + option is not present) is used for the actual downloading. --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header - (experimental) --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 - country code (experimental) + country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with + explicitly provided IP block in CIDR + notation ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -206,7 +208,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size (experimental) + expected file size --hls-prefer-native Use the native HLS downloader instead of ffmpeg --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py index e6c3abc8d..61487f925 100755 --- a/devscripts/gh-pages/update-copyright.py +++ b/devscripts/gh-pages/update-copyright.py @@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() - newc = re.sub(r'(?PCopyright © 2006-)(?P[0-9]{4})', 'Copyright © 2006-' + year, content) + newc = re.sub(r'(?PCopyright © 2011-)(?P[0-9]{4})', 'Copyright © 2011-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c5a48002b..a78fabb02 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -100,6 +100,7 @@ - **Beatport** - **Beeg** - **BehindKink** + - **Bellator** - **BellMedia** - **Bet** - **Bigflix** @@ -122,10 +123,13 @@ - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **BusinessInsider** - **BuzzFeed** - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** + - **CamTube** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr @@ -163,6 +167,7 @@ - **ClipRs** - **Clipsyndicate** - **CloserToTruth** + - **CloudflareStream** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -232,6 +237,7 @@ - **DrTuber** - **drtv** - **drtv:live** + - **DTube** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -260,6 +266,7 @@ - **Europa** - **EveryonesMixtape** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -283,7 +290,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** @@ -361,7 +367,6 @@ - **ImgurAlbum** - **Ina** - **Inc** - - **Indavideo** - **IndavideoEmbed** - **InfoQ** - **Instagram** @@ -373,6 +378,7 @@ - **Ir90Tv** - **ITTF** - **ITV** + - **ITVBTCC** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV @@ -445,11 +451,12 @@ - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakersChannel** - **MakerTV** - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA @@ -483,7 +490,6 @@ - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** - - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** - **MotherlessGroup** @@ -505,6 +511,7 @@ - **mva:course**: Microsoft Virtual Academy courses - **Mwave** - **MwaveMeetGreet** + - **MyChannels** - **MySpace** - **MySpace:album** - **MySpass** @@ -522,6 +529,7 @@ - **nbcolympics** - **nbcolympics:stream** - **NBCSports** + - **NBCSportsStream** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -548,9 +556,6 @@ - **nfl.com** - **NhkVod** - **nhl.com** - - **nhl.com:news**: NHL news - - **nhl.com:videocenter** - - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** - **nickelodeon:br** @@ -615,11 +620,13 @@ - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV + - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **PearVideo** + - **PeerTube** - **People** - **PerformGroup** - **periscope**: Periscope @@ -786,7 +793,7 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - - **Spike** + - **sport.francetvinfo.fr** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** @@ -888,6 +895,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - **TVNowList** diff --git a/setup.cfg b/setup.cfg index 5208f7ae2..af9a554c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv ignore = E402,E501,E731,E741 diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..e63af0166 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -361,6 +361,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') + self.assertEqual(determine_ext('foobar', None), None) def test_find_xpath_attr(self): testxml = ''' @@ -519,6 +520,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_age_limit('PG-13'), 13) self.assertEqual(parse_age_limit('TV-14'), 14) self.assertEqual(parse_age_limit('TV-MA'), 17) + self.assertEqual(parse_age_limit('TV14'), 14) + self.assertEqual(parse_age_limit('TV_G'), 0) def test_parse_duration(self): self.assertEqual(parse_duration(None), None) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ad3598805..38ba43a97 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -211,7 +211,7 @@ class YoutubeDL(object): At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. (Experimental) + on geo-restricted sites. socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -259,7 +259,7 @@ class YoutubeDL(object): - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) - source_address: (Experimental) Client-side IP address to bind to. + source_address: Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download when @@ -281,11 +281,14 @@ class YoutubeDL(object): match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For - HTTP header (experimental) + HTTP header geo_bypass_country: Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. @@ -302,8 +305,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. @@ -1479,23 +1482,28 @@ class YoutubeDL(object): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') subtitles = info_dict.get('subtitles') - if subtitles: - for _, subtitle in subtitles.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: - self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return + info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, - info_dict.get('automatic_captions')) + info_dict['id'], subtitles, automatic_captions) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9bb952457..ba435ea42 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -430,6 +430,7 @@ def _real_main(argv=None): 'config_location': opts.config_location, 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, # just for deprecation check 'autonumber': opts.autonumber if opts.autonumber is True else None, 'usetitle': opts.usetitle if opts.usetitle is True else None, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..7b770340f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index edd125ee2..5979833c0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,7 +45,6 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a22875f69..5b1e96013 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -217,10 +217,11 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - if ctx.tmpfilename != '-': + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: ctx.stream.close() ctx.stream = None - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index b823b5171..fbb7f51b0 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -29,66 +29,68 @@ class RtmpFD(FileDownloader): proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True proc_stderr_closed = False - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + try: + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) if mobj: downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', + 'eta': eta, 'elapsed': time_now - start, 'speed': speed, }) cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - proc.wait() + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + finally: + proc.wait() if not cursor_in_new_line: self.to_screen('') return proc.returncode @@ -163,7 +165,15 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 - retval = run_rtmpdump(args) + started = time.time() + + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') if retval == RD_NO_CONNECT: self.report_error('[rtmpdump] Could not connect to RTMP server.') @@ -171,7 +181,7 @@ class RtmpFD(FileDownloader): while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed args = basic_args + ['--resume'] if retval == RD_FAILED: @@ -188,13 +198,14 @@ class RtmpFD(FileDownloader): break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, 'total_bytes': fsize, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - started, }) return True else: diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 512f04684..4ac323bf6 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -105,22 +105,22 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZY9247A021S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Gaston's Visit", + 'title': "Gaston's Birthday", 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:18db170ad71cf161e006a4c688e33155', - 'upload_date': '20180318', + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', 'uploader_id': 'abc4kids', - 'timestamp': 1521400959, + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_params = self._parse_json(self._search_regex( - r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) - title = video_params.get('title') or video_params['seriesTitle'] - stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - house_number = video_params.get('episodeHouseNumber') - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( int(time.time()), house_number) sig = hmac.new( - 'android.content.res.Resources'.encode('utf-8'), + b'android.content.res.Resources', path.encode('utf-8'), hashlib.sha256).hexdigest() token = self._download_webpage( 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) @@ -169,18 +168,26 @@ class ABCIViewIE(InfoExtractor): 'ext': 'vtt', }] + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + return { 'id': video_id, - 'title': unescapeHTML(title), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), - 'episode': self._html_search_meta('episode_title', webpage, default=None), + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 041c61aff..1eb99c39a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import binascii import json import os +import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -12,9 +15,12 @@ from ..compat import ( ) from ..utils import ( bytes_to_intlist, + bytes_to_long, ExtractorError, float_or_none, intlist_to_bytes, + long_to_bytes, + pkcs1pad, srt_subtitles_timecode, strip_or_none, urljoin, @@ -35,6 +41,7 @@ class ADNIE(InfoExtractor): } } _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -42,16 +49,14 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False, headers={ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - }) + video_id, fatal=False) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -112,11 +117,24 @@ class ADNIE(InfoExtractor): error = None if not links: links_url = player_config.get('linksurl') or options['videoUrl'] - links_data = self._download_json(urljoin( - self._BASE_URL, links_url), video_id) + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, headers={ + 'Authorization': 'Bearer ' + authorization, + }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') + sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index e4fa72f46..1fe5d5e56 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -52,7 +52,7 @@ class AnimeOnDemandIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 7a29cd2c6..f6a78eb5d 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -277,7 +277,9 @@ class AnvatoIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py new file mode 100644 index 000000000..a30a935aa --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + js_to_json, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = source.get('file') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 1a31ebe08..ae1c09427 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -74,7 +74,7 @@ class AtresPlayerIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index aa6925623..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -5,13 +5,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, ) class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { @@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor): 'duration': 74022, 'view_count': int, } - } - # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) - _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex([ - r'class="amtv-embed"[^>]+id="([^"]+)"', - r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', ], webpage, 'raw payload') - _, stage_mode, video_id, lang = raw_payload.split('-') + _, stage_mode, video_id, _ = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): - request = sanitized_Request( - 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), - headers={'X-Auth-Token': self._AUTH_TOKEN}) - json_data = self._download_json(request, video_id)['results'] + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] formats = [] - stream_url_hls = json_data.get('stream_url_hls') + stream_url_hls = video_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - stream_url_hds = json_data.get('stream_url_hds') + stream_url_hds = video_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) - for video_version in json_data.get('video_versions'): + for video_version in video_data.get('video_versions', []): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue @@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor): return { 'id': video_id, - 'title': json_data['title'], - 'description': json_data.get('subtitle'), - 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), - 'timestamp': parse_iso8601(json_data.get('publication_date')), - 'duration': int_or_none(json_data.get('duration')), - 'view_count': int_or_none(json_data.get('view_count')), + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), 'formats': formats, } diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index f3bd4d444..62049b921 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -65,7 +65,7 @@ class AudiomackIE(InfoExtractor): return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} return { - 'id': api_response.get('id', album_url_tag), + 'id': compat_str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 633c57553..34f1b3d83 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -44,7 +44,7 @@ class BambuserIE(InfoExtractor): } def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8b20c03d6..293d82b0f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..utils import ( float_or_none, get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, try_get, @@ -20,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, compat_urlparse, ) @@ -333,14 +333,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): @@ -772,6 +767,17 @@ class BBCIE(BBCCoUkIE): # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } }] @classmethod @@ -994,6 +1000,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 8820a3914..f36a2452d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -12,7 +12,7 @@ class BellMediaIE(InfoExtractor): (?: ctv| tsn| - bnn| + bnn(?:bloomberg)?| thecomedynetwork| discovery| discoveryvelocity| @@ -27,17 +27,16 @@ class BellMediaIE(InfoExtractor): much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, }, - 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', 'only_matching': True, @@ -70,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3e3348ef5..4d6b051fe 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -114,7 +114,7 @@ class BiliBiliIE(InfoExtractor): if 'anime/' not in url: cid = self._search_regex( - r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0e4eaef65..14f9a14ed 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = json_data.get('errors') if not formats and errors: @@ -669,7 +678,10 @@ class BrightcoveNewIE(AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py new file mode 100644 index 000000000..dfcf9bc6b --- /dev/null +++ b/youtube_dl/extractor/businessinsider.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'hZRllCfw', + 'ext': 'mp4', + 'title': "Here's how much radiation you're exposed to in everyday life", + 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', + 'upload_date': '20170709', + 'timestamp': 1499606400, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'only_matching': True, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 000000000..ee0165dba --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = media.get('location') + if not media_url or not isinstance(media_url, compat_str): + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'preference': -1, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 000000000..c7d40f849 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 54b4b9be9..43f95c739 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -17,9 +17,11 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + orderedSet, parse_duration, parse_iso8601, parse_age_limit, + strip_or_none, int_or_none, ExtractorError, ) @@ -129,15 +131,23 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'([^<]+)', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r']+src="[^"]+?mediaId=(\d+)"', + r']+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) entries.extend([ self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) + for media_id in orderedSet(media_ids)]) return self.playlist_result( - entries, display_id, - self._og_search_title(webpage, fatal=False), + entries, display_id, strip_or_none(title), self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index e3eba4be9..e2b828d8a 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url, video_id, headers=self.geo_verification_headers()) m3u8_urls = [] diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py new file mode 100644 index 000000000..e6d92cca2 --- /dev/null +++ b/youtube_dl/extractor/cloudflarestream.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloudflareStreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:watch\.)?cloudflarestream\.com/| + embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo= + ) + (?P[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', + 'only_matching': True, + }, { + 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://cloudflarestream.com/%s/manifest/video.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_mpd_formats( + 'https://cloudflarestream.com/%s/manifest/video.mpd' % video_id, + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a9939b0fd..78f053f18 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -339,15 +340,17 @@ class InfoExtractor(object): _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. (experimental) + country code provided with geo_bypass_country. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted countries for this extractor. One of these countries will be used by geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. (experimental) + geo restriction, of course, if the mechanism is not disabled. - NB: both these geo attributes are experimental and may change in future - or be completely removed. + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. @@ -358,6 +361,7 @@ class InfoExtractor(object): _x_forwarded_for_ip = None _GEO_BYPASS = True _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None _WORKING = True def __init__(self, downloader=None): @@ -392,12 +396,15 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass(self._GEO_COUNTRIES) + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) if not self._ready: self._real_initialize() self._ready = True - def _initialize_geo_bypass(self, countries): + def _initialize_geo_bypass(self, geo_bypass_context): """ Initialize geo restriction bypass mechanism. @@ -408,28 +415,82 @@ class InfoExtractor(object): HTTP requests. This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES. + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. - You may also manually call it from extractor's code if geo countries + You may also manually call it from extractor's code if geo bypass information is not available beforehand (e.g. obtained during - extraction) or due to some another reason. + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + """ if not self._x_forwarded_for_ip: - country_code = self._downloader.params.get('geo_bypass_country', None) - # If there is no explicit country for geo bypass specified and - # the extractor is known to be geo restricted let's fake IP - # as X-Forwarded-For right away. - if (not country_code and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - countries): - country_code = random.choice(countries) - if country_code: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + + # Geo bypass mechanism is explicitly disabled by user + if not self._downloader.params.get('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. + + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self._downloader.params.get('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s as X-Forwarded-For.' + % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self._downloader.params.get('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) if self._downloader.params.get('verbose', False): self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) + % (self._x_forwarded_for_ip, country.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -488,8 +549,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -518,6 +597,10 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + return err.fp + if errnote is False: return False if errnote is None: @@ -530,13 +613,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -625,13 +712,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. + """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -647,11 +773,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -659,15 +791,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. + """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -685,11 +823,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -700,11 +844,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): @@ -1955,7 +2106,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) @@ -2286,6 +2451,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index fc014f8b5..f4a616455 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -19,8 +19,8 @@ from ..utils import ( class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' - _TEST = { + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' + _TESTS = [{ # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { @@ -45,7 +45,10 @@ class CrackleIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 3efdc8c21..311da515d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -49,7 +49,7 @@ class CrunchyrollBaseIE(InfoExtractor): }) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 55a127b76..03f8cefb7 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -11,10 +11,10 @@ class CTVNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', - 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'md5': '9b8624ba66351a23e0b6e1391971f9af', 'info_dict': { 'id': '901995', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Extended: \'That person cannot be me\' Johnson says', 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 8e45923e3..35b1e7a34 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -35,7 +35,7 @@ class CuriosityStreamBaseIE(InfoExtractor): return result['data'] def _real_initialize(self): - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return result = self._download_json( diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0e7d587dd..9a74906cb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,12 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import hashlib import itertools +import json +import random +import re +import string from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, @@ -64,7 +68,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -167,6 +170,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + if metadata.get('error', {}).get('type') == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(compat_struct_pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) + self._check_error(metadata) formats = [] @@ -180,9 +194,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue ext = mimetype2ext(type_) or determine_ext(media_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + f['url'] = f['url'].split('#')[0] + formats.append(f) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) @@ -299,8 +316,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..dc0c41b8a 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,31 +21,49 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( @@ -60,27 +80,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', }] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = image.get('url') + if not image_url or not isinstance(image_url, compat_str): + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 91449dcd8..3589bd428 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -5,7 +5,10 @@ import re import string from .discoverygo import DiscoveryGoBaseIE -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, try_get, @@ -55,15 +58,27 @@ class DiscoveryIE(DiscoveryGoBaseIE): video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] video_id = video['id'] - access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ - 'authRel': 'authorization', - 'client_id': try_get( - react_data, lambda x: x['application']['apiClientId'], - compat_str) or '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, - })['access_token'] + access_token = None + cookies = self._get_cookies(url) + + # prefer Affiliate Auth Token over Anonymous Auth Token + auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') + if auth_storage_cookie and auth_storage_cookie.value: + auth_storage = self._parse_json(compat_urllib_parse_unquote( + compat_urllib_parse_unquote(auth_storage_cookie.value)), + video_id, fatal=False) or {} + access_token = auth_storage.get('a') or auth_storage.get('access_token') + + if not access_token: + access_token = self._download_json( + 'https://www.%s.com/anonymous' % site, display_id, query={ + 'authRel': 'authorization', + 'client_id': try_get( + react_data, lambda x: x['application']['apiClientId'], + compat_str) or '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, + })['access_token'] try: stream = self._download_json( @@ -72,7 +87,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'Authorization': 'Bearer ' + access_token, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): e_description = self._parse_json( e.cause.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P\d+)|(?:[^/]+/)*videos/(?P[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index b73446773..fe47f6dce 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -97,12 +97,83 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain') - self._initialize_geo_bypass([mobj.group('country').upper()]) + self._initialize_geo_bypass({ + 'countries': [mobj.group('country').upper()], + }) webpage = self._download_webpage(url, display_id) @@ -111,72 +182,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' + host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ffbd2623d..ab32ba4ff 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -42,7 +42,7 @@ class DramaFeverBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index f138025d5..8d31258c1 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -8,7 +8,6 @@ from ..utils import ( unified_strdate, xpath_text, determine_ext, - qualities, float_or_none, ExtractorError, ) @@ -16,7 +15,8 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', @@ -43,7 +43,8 @@ class DreiSatIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): param_groups = {} for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) params = {} for param in param_group: params[param.get('name')] = param.get('value') @@ -54,7 +55,7 @@ class DreiSatIE(InfoExtractor): src = video.get('src') if not src: continue - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) group_id = video.get('paramGroup') param_group = param_groups[group_id] for proto in param_group['protocols'].split(','): @@ -75,66 +76,36 @@ class DreiSatIE(InfoExtractor): note='Downloading video info', errnote='Failed to download video info') - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': message = 'Video %s is not available' % video_id else: - message = '%s returned error: %s' % (self.IE_NAME, code) + message = '%s returned error: %s' % (self.IE_NAME, status_code) raise ExtractorError(message, expected=True) - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + title = xpath_text(doc, './/information/title', 'title', True) - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) - - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] + urls = [] formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - if not is_available: + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') format_m = re.match(r'''(?x) (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) ''', format_id) ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue if ext == 'meta': continue @@ -147,24 +118,23 @@ class DreiSatIE(InfoExtractor): if video_url.startswith('https://'): continue formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id=format_id, fatal=False)) else: - proto = format_m.group('proto').lower() + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr formats.append({ 'format_id': format_id, @@ -174,31 +144,50 @@ class DreiSatIE(InfoExtractor): 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), }) - format_ids.append(format_id) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self._sort_formats(formats) + thumbnails = [] + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), 'upload_date': upload_date, 'formats': formats, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + video_id = self._match_id(url) + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py new file mode 100644 index 000000000..4ca97f860 --- /dev/null +++ b/youtube_dl/extractor/dtube.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +from socket import timeout + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' + _TEST = { + 'url': 'https://d.tube/#!/v/benswann/zqd630em', + 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'info_dict': { + 'id': 'zqd630em', + 'ext': 'mp4', + 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', + 'description': 'md5:700d164e066b87f9eac057949e4227c2', + 'uploader_id': 'benswann', + 'upload_date': '20180222', + 'timestamp': 1519328958, + }, + 'params': { + 'format': '480p', + }, + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({ + 'jsonrpc': '2.0', + 'method': 'get_content', + 'params': [uploader_id, video_id], + }).encode())['result'] + + metadata = json.loads(result['json_metadata']) + video = metadata['video'] + content = video['content'] + info = video.get('info', {}) + title = info.get('title') or result['title'] + + def canonical_url(h): + if not h: + return None + return 'https://ipfs.io/ipfs/' + h + + formats = [] + for q in ('240', '480', '720', '1080', ''): + video_url = canonical_url(content.get('video%shash' % q)) + if not video_url: + continue + format_id = (q + 'p') if q else 'Source' + try: + self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) + self._downloader._opener.open(video_url, timeout=5).close() + except timeout as e: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, format_id)) + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'height': int_or_none(q), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': canonical_url(info.get('snaphash')), + 'tags': content.get('tags') or metadata.get('tags'), + 'duration': info.get('duration'), + 'formats': formats, + 'timestamp': parse_iso8601(result.get('created')), + 'uploader_id': uploader_id, + } diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 3f760888e..20996962a 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -91,17 +91,6 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, - }, { - 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', - 'md5': '87defe16681b1429c91f7a74809823c6', - 'info_dict': { - 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', - 'ext': 'mp4', - 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', - }, - 'params': { - 'skip_download': True, - }, }] def _parse_video_metadata(self, js, video_id, live_js=None): diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py new file mode 100644 index 000000000..f61178012 --- /dev/null +++ b/youtube_dl/extractor/expressen.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, + unified_timestamp, +) + + +class ExpressenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', + 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'info_dict': { + 'id': '8690962', + 'ext': 'mp4', + 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', + 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 788, + 'timestamp': 1526639109, + 'upload_date': '20180518', + }, + }, { + 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + def extract_data(name): + return self._parse_json( + self._search_regex( + r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name, + webpage, 'info', group='value'), + display_id, transform_source=unescapeHTML) + + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + self._sort_formats(formats) + + title = info.get('titleRaw') or data['title'] + description = info.get('descriptionRaw') + thumbnail = info.get('socialMediaImage') or data.get('image') + duration = int_or_none(info.get('videoTotalSecondsDuration') or + data.get('totalSecondsDuration')) + timestamp = unified_timestamp(info.get('publishDate')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f90a549b3..95927dd7b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( @@ -137,6 +138,7 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE @@ -144,6 +146,8 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .cammodels import CamModelsIE +from .camtube import CamTubeIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE @@ -195,6 +199,7 @@ from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE @@ -281,6 +286,7 @@ from .drtv import ( DRTVIE, DRTVLiveIE, ) +from .dtube import DTubeIE from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE @@ -329,6 +335,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE +from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE from .facebook import ( @@ -366,7 +373,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE @@ -376,6 +382,7 @@ from .francetv import ( FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVInfoSportIE, FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, @@ -466,10 +473,7 @@ from .imgur import ( ) from .ina import InaIE from .inc import IncIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) +from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internazionale import InternazionaleIE @@ -477,7 +481,10 @@ from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE -from .itv import ITVIE +from .itv import ( + ITVIE, + ITVBTCCIE, +) from .ivi import ( IviIE, IviCompilationIE @@ -576,13 +583,16 @@ from .mailru import ( MailRuMusicIE, MailRuMusicSearchIE, ) -from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE @@ -619,7 +629,6 @@ from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE -from .moniker import MonikerIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, @@ -640,6 +649,7 @@ from .mtv import ( from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE +from .mychannels import MyChannelsIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import ( @@ -661,6 +671,7 @@ from .nbc import ( NBCOlympicsIE, NBCOlympicsStreamIE, NBCSportsIE, + NBCSportsStreamIE, NBCSportsVPlayerIE, ) from .ndr import ( @@ -700,12 +711,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, @@ -714,10 +720,7 @@ from .nick import ( NickRuIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import ( - NineCNineMediaStackIE, - NineCNineMediaIE, -) +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -805,6 +808,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( @@ -1010,7 +1014,10 @@ from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE @@ -1039,6 +1046,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) @@ -1142,6 +1150,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 220ada3a6..8a9ed96c2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -56,6 +56,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -208,6 +209,17 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'uploader': 'ESL One Dota 2', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -226,7 +238,7 @@ class FacebookIE(InfoExtractor): return urls def _login(self): - (useremail, password) = self._get_login_info() + useremail, password = self._get_login_info() if useremail is None: return @@ -312,16 +324,18 @@ class FacebookIE(InfoExtractor): if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: server_js_data = self._parse_json( self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - if server_js_data: - video_data = extract_video_data(try_get( - server_js_data, lambda x: x['jsmods']['instances'], - list) or []) + video_data = extract_from_jsmods_instances(server_js_data) if not video_data: if not fatal_if_no_video: @@ -333,8 +347,33 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - else: - raise ExtractorError('Cannot parse data') + + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') formats = [] for f in video_data: @@ -380,7 +419,8 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 448647d72..435561147 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -46,7 +46,7 @@ class FC2IE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: return False diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -68,21 +76,41 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,51 +118,10 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index c02cd03de..6fc6b0da0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) +class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): + IE_NAME = 'sport.francetvinfo.fr' + _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', + 'info_dict': { + 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', + 'ext': 'mp4', + 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', + 'timestamp': 1523639962, + 'upload_date': '20180413', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') + return self._make_url_result(video_id, 'Sport-web') + + class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 107f658ba..07d01caec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -51,7 +51,7 @@ class FunimationIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f71d9092e..8806dc48a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -91,7 +91,7 @@ class GDCVaultIE(InfoExtractor): ] def _login(self, webpage_url, display_id): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') return None diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 73980601c..aa04905ed 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,6 +107,11 @@ from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE +from .indavideo import IndavideoEmbedIE +from .apa import APAIE +from .foxnews import FoxNewsIE class GenericIE(InfoExtractor): @@ -1390,17 +1395,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -1471,21 +1465,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Ooyala embed - { - 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', - 'info_dict': { - 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', - 'ext': 'mp4', - 'description': 'Index/Match versus VLOOKUP.', - 'title': 'This is what separates the Excel masters from the wannabes', - 'duration': 191.933, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - } - }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', @@ -2013,6 +1992,63 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # CloudflareStream embed + 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'add_ie': [CloudflareStreamIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 'home', + 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', + }, + 'playlist_count': 2, + }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3025,6 +3061,31 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) + if cloudflarestream_urls: + return self.playlist_from_matches( + cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + + peertube_urls = PeerTubeIE._extract_urls(webpage, url) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, video_title, ie=APAIE.ie_key()) + + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc7b2661c..c2140c362 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -1,15 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import hashlib +import json import random import re -import math from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, - compat_chr, - compat_ord, ) from ..utils import ( ExtractorError, @@ -22,12 +23,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' - - _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' - _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - - _RESIGN_EXPIRATION = 86400 - + _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', @@ -70,287 +66,51 @@ class GloboIE(InfoExtractor): 'only_matching': True, }] - class MD5(object): - HEX_FORMAT_LOWERCASE = 0 - HEX_FORMAT_UPPERCASE = 1 - BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' - BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' - PADDING = '=0xFF01DD' - hexcase = 0 - b64pad = '' + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return - def __init__(self): - pass - - class JSArray(list): - def __getitem__(self, y): - try: - return list.__getitem__(self, y) - except IndexError: - return 0 - - def __setitem__(self, i, y): - try: - return list.__setitem__(self, i, y) - except IndexError: - self.extend([0] * (i - len(self) + 1)) - self[-1] = y - - @classmethod - def hex_md5(cls, param1): - return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) - - @classmethod - def b64_md5(cls, param1, param2=None): - return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) - - @classmethod - def any_md5(cls, param1, param2): - return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) - - @classmethod - def rstr_md5(cls, param1): - return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) - - @classmethod - def rstr2hex(cls, param1): - _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' - _loc_3 = '' - for _loc_5 in range(0, len(param1)): - _loc_4 = compat_ord(param1[_loc_5]) - _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] - return _loc_3 - - @classmethod - def rstr2b64(cls, param1): - _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - _loc_3 = '' - _loc_4 = len(param1) - for _loc_5 in range(0, _loc_4, 3): - _loc_6_1 = compat_ord(param1[_loc_5]) << 16 - _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 - _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 - _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 - for _loc_7 in range(0, 4): - if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: - _loc_3 += cls.b64pad - else: - _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] - return _loc_3 - - @staticmethod - def rstr2any(param1, param2): - _loc_3 = len(param2) - _loc_4 = [] - _loc_9 = [0] * ((len(param1) >> 2) + 1) - for _loc_5 in range(0, len(_loc_9)): - _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) - - while len(_loc_9) > 0: - _loc_8 = [] - _loc_7 = 0 - for _loc_5 in range(0, len(_loc_9)): - _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] - _loc_6 = math.floor(_loc_7 / _loc_3) - _loc_7 -= _loc_6 * _loc_3 - if len(_loc_8) > 0 or _loc_6 > 0: - _loc_8[len(_loc_8)] = _loc_6 - - _loc_4[len(_loc_4)] = _loc_7 - _loc_9 = _loc_8 - - _loc_10 = '' - _loc_5 = len(_loc_4) - 1 - while _loc_5 >= 0: - _loc_10 += param2[_loc_4[_loc_5]] - _loc_5 -= 1 - - return _loc_10 - - @classmethod - def str2rstr_utf8(cls, param1, param2=None): - _loc_3 = '' - _loc_4 = -1 - if not param2: - param2 = cls.PADDING - param1 = param1 + param2[1:9] - while True: - _loc_4 += 1 - if _loc_4 >= len(param1): - break - _loc_5 = compat_ord(param1[_loc_4]) - _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 - if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: - _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) - _loc_4 += 1 - if _loc_5 <= 127: - _loc_3 += compat_chr(_loc_5) - continue - if _loc_5 <= 2047: - _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) - continue - if _loc_5 <= 65535: - _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( - 128 | _loc_5 & 63) - continue - if _loc_5 <= 2097151: - _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( - 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) - return _loc_3 - - @staticmethod - def rstr2binl(param1): - _loc_2 = [0] * ((len(param1) >> 2) + 1) - for _loc_3 in range(0, len(_loc_2)): - _loc_2[_loc_3] = 0 - for _loc_3 in range(0, len(param1) * 8, 8): - _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 - return _loc_2 - - @staticmethod - def binl2rstr(param1): - _loc_2 = '' - for _loc_3 in range(0, len(param1) * 32, 8): - _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) - return _loc_2 - - @classmethod - def binl_md5(cls, param1, param2): - param1 = cls.JSArray(param1) - param1[param2 >> 5] |= 128 << param2 % 32 - param1[(param2 + 64 >> 9 << 4) + 14] = param2 - _loc_3 = 1732584193 - _loc_4 = -271733879 - _loc_5 = -1732584194 - _loc_6 = 271733878 - for _loc_7 in range(0, len(param1), 16): - _loc_8 = _loc_3 - _loc_9 = _loc_4 - _loc_10 = _loc_5 - _loc_11 = _loc_6 - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) - _loc_3 = cls.safe_add(_loc_3, _loc_8) - _loc_4 = cls.safe_add(_loc_4, _loc_9) - _loc_5 = cls.safe_add(_loc_5, _loc_10) - _loc_6 = cls.safe_add(_loc_6, _loc_11) - return [_loc_3, _loc_4, _loc_5, _loc_6] - - @classmethod - def md5_cmn(cls, param1, param2, param3, param4, param5, param6): - return cls.safe_add( - cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) - - @classmethod - def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) - - @classmethod - def safe_add(cls, param1, param2): - _loc_3 = (param1 & 65535) + (param2 & 65535) - _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) - return cls.lshift(_loc_4, 16) | _loc_3 & 65535 - - @classmethod - def bit_rol(cls, param1, param2): - return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) - - @staticmethod - def lshift(value, count): - r = (0xFFFFFFFF & value) << count - return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + try: + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read(), None) + raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) + raise def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + 'http://api.globovideos.com/videos/%s/playlist' % video_id, + video_id)['videos'][0] title = video['title'] formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id or resource_id.endswith('manifest'): + resource_url = resource.get('url') + if not resource_id or not resource_url: continue security = self._download_json( - self._SECURITY_URL_TEMPLATE % (video_id, resource_id), - video_id, 'Downloading security hash for %s' % resource_id) + 'http://security.video.globo.com/videos/%s/hash' % video_id, + video_id, 'Downloading security hash for %s' % resource_id, query={ + 'player': 'flash', + 'version': '17.0.0.132', + 'resource_id': resource_id, + }) security_hash = security.get('hash') if not security_hash: @@ -361,22 +121,28 @@ class GloboIE(InfoExtractor): continue hash_code = security_hash[:2] - received_time = int(security_hash[2:12]) + received_time = security_hash[2:12] received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + self._RESIGN_EXPIRATION + sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) - signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') + signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 - resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats( + signed_url, resource_id, mpd_id='dash', fatal=False)) + elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): + formats.extend(self._extract_ism_formats( + signed_url, resource_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': signed_url, diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 9c7b1bd37..e781405f2 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -123,7 +123,7 @@ class GoIE(AdobePassIE): 'adobe_requestor_id': requestor_id, }) else: - self._initialize_geo_bypass(['US']) + self._initialize_geo_bypass({'countries': ['US']}) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', video_id, data=urlencode_postdata(data)) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 9b2e1c164..35dde42d0 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, ) @@ -23,6 +25,7 @@ class Go90IE(InfoExtractor): 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', + 'age_limit': 14, } } @@ -33,6 +36,8 @@ class Go90IE(InfoExtractor): video_id, headers={ 'Content-Type': 'application/json; charset=utf-8', }, data=b'{"client":"web","device_type":"pc"}') + if video_data.get('requires_drm'): + raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] episode_number = int_or_none(video_data.get('episode_number')) @@ -123,4 +128,5 @@ class Go90IE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'subtitles': subtitles, + 'age_limit': parse_age_limit(video_data.get('rating')), } diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index eee517071..39fabe8a5 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -17,6 +17,8 @@ class HiDiveIE(InfoExtractor): # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, # so disabling geo bypass completely _GEO_BYPASS = False + _NETRC_MACHINE = 'hidive' + _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', @@ -31,8 +33,26 @@ class HiDiveIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires Authentication', }] + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + webpage = self._download_webpage(self._LOGIN_URL, None) + form = self._search_regex( + r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', + webpage, 'login form') + data = self._hidden_inputs(form) + data.update({ + 'Email': email, + 'Password': password, + }) + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title, key = mobj.group('title', 'key') @@ -43,6 +63,7 @@ class HiDiveIE(InfoExtractor): data=urlencode_postdata({ 'Title': title, 'Key': key, + 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', })) restriction = settings.get('restrictionReason') @@ -79,6 +100,7 @@ class HiDiveIE(InfoExtractor): subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, }) + self._sort_formats(formats) season_number = int_or_none(self._search_regex( r's(\d+)', key, 'season number', default=None)) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 6424d34ac..9ba1aa703 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -66,7 +66,7 @@ class HRTiBaseIE(InfoExtractor): self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 425421968..4bafa54a2 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -7,23 +7,23 @@ from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, + parse_duration, qualities, - remove_end, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2)', - 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', + 'title': 'No. 2 from Ice Age: Continental Drift (2012)', + 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -40,82 +40,67 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', 'only_matching': True, + }, { + 'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = self._html_search_regex( - r'(?s)<span itemprop="description">(.*?)</span>', - webpage, 'description', fatal=False) - player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id - player_page = self._download_webpage( - player_url, video_id, 'Downloading player page') - # the player page contains the info for the default format, we have to - # fetch other pages for the rest of the formats - extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page) - format_pages = [ - self._download_webpage( - f_url, video_id, 'Downloading info for %s format' % f_name) - for f_url, f_name in extra_formats] - format_pages.append(player_page) + webpage = self._download_webpage( + 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, + 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'<title>(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for format_page in format_pages: - json_data = self._search_regex( - r']+class="imdb-player-data"[^>]*?>(.*?)', - format_page, 'json data', flags=re.DOTALL) - info = self._parse_json(json_data, video_id, fatal=False) - if not info: + for encoding in video_metadata.get('encodings', []): + if not encoding or not isinstance(encoding, dict): continue - format_info = info.get('videoPlayerObject', {}).get('video', {}) - if not format_info: + video_url = encoding.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): continue - video_info_list = format_info.get('videoInfoList') - if not video_info_list or not isinstance(video_info_list, list): + ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - for video_info in video_info_list: - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): - continue - if (video_info.get('videoMimeType') == 'application/x-mpegURL' or - determine_ext(video_url) == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + format_id = encoding.get('definition') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'quality': quality(format_id), + }) self._sort_formats(formats) return { 'id': video_id, - 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), + 'title': title, 'formats': formats, - 'description': descr, - 'thumbnail': format_info.get('slate'), + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('slate', {}).get('url'), + 'duration': parse_duration(video_metadata.get('duration')), } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d{9})(?!/videoplayer/vi\d+)' _TEST = { - 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', + 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { - 'id': 'JFs9NWw6XI0', - 'title': 'March 23, 2012 Releases', + 'id': '009921623', + 'title': 'The Bourne Legacy', + 'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.', }, - 'playlist_count': 7, + 'playlist_count': 8, } def _real_extract(self, url): @@ -123,9 +108,13 @@ class ImdbListIE(InfoExtractor): webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] + for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)] list_title = self._html_search_regex( - r'

(.*?)

', webpage, 'list title') + r']+class="[^"]*header[^"]*"[^>]*>(.*?)', + webpage, 'list title') + list_description = self._html_search_regex( + r']+class="[^"]*list-description[^"]*"[^>]*>

(.*?)

', + webpage, 'list description') - return self.playlist_result(entries, list_id, list_title) + return self.playlist_result(entries, list_id, list_title, list_description) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 67c24a51c..2901960a5 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -21,7 +20,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,7 +28,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', @@ -37,8 +36,6 @@ class ImgurIE(InfoExtractor): 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - 'description': 'Imgur: The most awesome images on the Internet.' - } }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', @@ -50,8 +47,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - compat_urlparse.urljoin(url, video_id), video_id) + gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) + webpage = self._download_webpage(gifv_url, video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -107,7 +104,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 241ec83c4..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -21,6 +21,21 @@ class IncIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # div with id=kaltura_player_1_kqs38cgm + 'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html', + 'info_dict': { + 'id': '1_kqs38cgm', + 'ext': 'mp4', + 'title': 'Branson: "In the end, you have to say, Screw it. Just do it."', + 'description': 'md5:21b832d034f9af5191ca5959da5e9cb6', + 'timestamp': 1364403232, + 'upload_date': '20130327', + 'uploader_id': 'incdigital@inc.com', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'only_matching': True, @@ -31,10 +46,13 @@ class IncIE(InfoExtractor): webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( - r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, + 'partner id', default='1034971') - kaltura_id = self._parse_json(self._search_regex( - r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + kaltura_id = self._search_regex( + r'id=(["\'])kaltura_player_(?P.+?)\1', webpage, 'kaltura id', + default=None, group='id') or self._parse_json(self._search_regex( + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] return self.url_result( diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 11cf3c609..2b5b2b5b0 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,11 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + update_url_query, ) @@ -13,7 +17,7 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P[\da-f]+)' _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', - 'md5': 'f79b009c66194acacd40712a6778acfa', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', 'ext': 'mp4', @@ -36,6 +40,20 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\'](?P(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) @@ -45,7 +63,14 @@ class IndavideoEmbedIE(InfoExtractor): title = video['title'] - video_urls = video.get('video_files', []) + video_urls = [] + + video_files = video.get('video_files') + if isinstance(video_files, list): + video_urls.extend(video_files) + elif isinstance(video_files, dict): + video_urls.extend(video_files.values()) + video_file = video.get('video_file') if video: video_urls.append(video_file) @@ -58,11 +83,23 @@ class IndavideoEmbedIE(InfoExtractor): if flv_url not in video_urls: video_urls.append(flv_url) - formats = [{ - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)), - } for video_url in video_urls] + filesh = video.get('filesh') + + formats = [] + for video_url in video_urls: + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if filesh: + if not height: + continue + token = filesh.get(compat_str(height)) + if token is None: + continue + video_url = update_url_query(video_url, {'token': token}) + formats.append({ + 'url': video_url, + 'height': height, + }) self._sort_formats(formats) timestamp = video.get('date') @@ -89,55 +126,3 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': tags, 'formats': formats, } - - -class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' - _TESTS = [{ - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'display_id': 'Vicces_cica_1', - 'ext': 'mp4', - 'title': 'Vicces cica', - 'description': 'Játszik a tablettel. :D', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'timestamp': 1390821212, - 'upload_date': '20140127', - 'duration': 7, - 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], - }, - }, { - 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', - 'only_matching': True, - }, { - 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', - 'only_matching': True, - }, { - 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', - 'only_matching': True, - }, { - 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', - 'only_matching': True, - }, { - 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - embed_url = self._search_regex( - r']+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') - - return { - '_type': 'url_transparent', - 'ie_key': 'IndavideoEmbed', - 'url': embed_url, - 'display_id': display_id, - } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index fdfa7de9e..4b081bd46 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -239,7 +239,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 18a7d7f8c..d05a7b68d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -7,19 +7,22 @@ import json import re from .common import InfoExtractor +from .brightcove import BrightcoveNewIE from ..compat import ( compat_str, compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, + int_or_none, + merge_dicts, + parse_duration, + smuggle_url, xpath_with_ns, xpath_element, xpath_text, - int_or_none, - parse_duration, - ExtractorError, - determine_ext, ) @@ -41,6 +44,14 @@ class ITVIE(InfoExtractor): # unavailable via data-playlist-url 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', 'only_matching': True, + }, { + # InvalidVodcrid + 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', + 'only_matching': True, + }, { + # ContentUnavailable + 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', + 'only_matching': True, }] def _real_extract(self, url): @@ -119,63 +130,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code != 'InvalidEntity': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -250,4 +263,49 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) + + +class ITVBTCCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'info_dict': { + 'id': 'btcc-2018-all-the-action-from-brands-hatch', + 'title': 'BTCC 2018: All the action from Brands Hatch', + }, + 'playlist_mincount': 9, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + }), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) + for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + + title = self._og_search_title(webpage, fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index b1d72177d..f8fca6c8f 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, float_or_none, @@ -57,12 +58,33 @@ class IzleseneIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.izlesene.com/video/%s' % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + + video = self._parse_json( + self._search_regex( + r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'), + video_id) + + title = video.get('videoTitle') or self._og_search_title(webpage) + + formats = [] + for stream in video['media']['level']: + source_url = stream.get('source') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(url, 'mp4') + quality = stream.get('value') + height = int_or_none(quality) + formats.append({ + 'format_id': '%sp' % quality if quality else 'sd', + 'url': compat_urllib_parse_unquote(source_url), + 'ext': ext, + 'height': height, + }) + self._sort_formats(formats) - title = self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) - thumbnail = self._proto_relative_url( + thumbnail = video.get('posterURL') or self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( @@ -71,41 +93,15 @@ class IzleseneIE(InfoExtractor): timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) - duration = float_or_none(self._html_search_regex( - r'"videoduration"\s*:\s*"([^"]+)"', - webpage, 'duration', fatal=False), scale=1000) + duration = float_or_none(video.get('duration') or self._html_search_regex( + r'videoduration["\']?\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'duration', fatal=False, group='value'), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) comment_count = self._html_search_regex( r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - content_url = self._html_search_meta( - 'contentURL', webpage, 'content URL', fatal=False) - ext = determine_ext(content_url, 'mp4') - - # Might be empty for some videos. - streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') - - formats = [] - if streams: - for stream in streams.split('|'): - quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() - formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - else: - stream_url = self._search_regex( - r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') - formats.append({ - 'format_id': 'sd', - 'url': compat_urllib_parse_unquote(stream_url), - 'ext': ext, - }) - return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..d9f8dbfd2 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ class JojIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r']+\bsrc=["\'](?P(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index ffe10154b..8dd1ce0d0 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -130,7 +130,7 @@ class LeIE(InfoExtractor): media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, - 'splatid': 101, + 'splatid': 105, 'format': 1, 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 2803d7e8d..729d8de50 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -282,7 +282,9 @@ class LimelightMediaIE(LimelightBaseIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) pc, mobile, metadata = self._extract( video_id, 'getPlaylistByMediaId', diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py new file mode 100644 index 000000000..def960a0c --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + # Downloading for some hosts (e.g. dajto, doma) fails with 500 + # although everything seems to be OK, so considering 500 + # status code to be expected. + url, playlist_id, expected_status=500) + + entries = [ + self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) + for video_id in orderedSet(re.findall( + r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)', + webpage))] + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9760eafd5..9f2b60dcc 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -42,6 +42,22 @@ class MediasetIE(InfoExtractor): 'categories': ['reality'], }, 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', + 'md5': '1276f966ac423d16ba255ce867de073e', + 'info_dict': { + 'id': '846685', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565, + 'creator': 'mediaset', + 'upload_date': '20180525', + 'series': 'Matrix', + 'categories': ['infotainment'], + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -70,16 +86,33 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video = self._download_json( + 'https://www.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading media info', query={ + 'id': video_id + })['video'] + + title = video['title'] + media_id = video.get('guid') or video_id + video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn.aspx', + 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', video_id, 'Downloading video CDN JSON', query={ - 'streamid': video_id, + 'streamid': media_id, 'format': 'json', })['videoList'] formats = [] for format_url in video_list: - if '.ism' in format_url: + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'ism' or '.ism' in format_url: formats.extend(self._extract_ism_formats( format_url, video_id, ism_id='mss', fatal=False)) else: @@ -89,30 +122,23 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - mediainfo = self._download_json( - 'http://plr.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading video info JSON', query={ - 'id': video_id, - })['video'] - - title = mediainfo['title'] - creator = try_get( - mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + video, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - mediainfo, lambda x: x['brand-info']['category'], compat_str) + video, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': mediainfo.get('short-description'), - 'thumbnail': mediainfo.get('thumbnail'), - 'duration': parse_duration(mediainfo.get('duration')), + 'description': video.get('short-description'), + 'thumbnail': video.get('thumbnail'), + 'duration': parse_duration(video.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(mediainfo.get('production-date')), - 'webpage_url': mediainfo.get('url'), - 'series': mediainfo.get('brand-value'), + 'upload_date': unified_strdate(video.get('production-date')), + 'webpage_url': video.get('url'), + 'series': video.get('brand-value'), + 'season': video.get('season'), 'categories': categories, 'formats': formats, } diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py index 959a10589..636731195 100644 --- a/youtube_dl/extractor/minoto.py +++ b/youtube_dl/extractor/minoto.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_codecs, +) class MinotoIE(InfoExtractor): @@ -26,7 +29,7 @@ class MinotoIE(InfoExtractor): formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) else: fmt_profile = fmt.get('profile') or {} - f = { + formats.append({ 'format_id': fmt_profile.get('name-short'), 'format_note': fmt_profile.get('name'), 'url': fmt_url, @@ -35,16 +38,8 @@ class MinotoIE(InfoExtractor): 'filesize': int_or_none(fmt.get('filesize')), 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), - } - codecs = fmt.get('codecs') - if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - f.update({ - 'vcodec': codecs[0], - 'acodec': codecs[1], - }) - formats.append(f) + 'codecs': parse_codecs(fmt.get('codecs')), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a56b7690f..b7bccb504 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -179,6 +179,10 @@ class MixcloudIE(InfoExtractor): formats.append({ 'format_id': 'http', 'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 675ff6873..b907f6b49 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,96 +1,90 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +from .nhl import NHLBaseIE -class MLBIE(InfoExtractor): +class MLBIE(NHLBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*mlb\.com/ + (?:[\da-z_-]+\.)*(?Pmlb)\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| + (?:[^/]+/)*c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) - (?Pn?\d+)| - (?:[^/]+/)*(?P[^/]+) + (?P\d+) ) ''' + _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { - 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, - 'timestamp': 1405980600, - 'upload_date': '20140721', + 'timestamp': 1405995000, + 'upload_date': '20140722', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', - 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', + 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663', + 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, - 'timestamp': 1405105800, + 'timestamp': 1405120200, 'upload_date': '20140711', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', - 'md5': '0e6e73d509321e142409b695eadd541f', + 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115', + 'md5': '99bb9176531adc600b90880fb8be9328', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, - 'timestamp': 1405399936, + 'timestamp': 1405414336, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', - 'md5': 'b8fd237347b844365d74ea61d4245967', + 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915', + 'md5': 'da8b57a12b060e7663ee1eebd6f330ec', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, - 'timestamp': 1405390722, + 'timestamp': 1405405122, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'aafaf5b0186fee8f32f20508092f8111', + 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', + 'md5': 'e09e37b552351fddbf4d9e699c924d68', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, + 'timestamp': 1429139220, 'upload_date': '20150415', } }, @@ -111,7 +105,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783', 'only_matching': True, }, { @@ -120,58 +114,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', 'only_matching': True, } ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if not video_id: - video_path = mobj.group('path') - webpage = self._download_webpage(url, video_path) - video_id = self._search_regex( - [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') - - detail = self._download_xml( - 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' - % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) - - title = detail.find('./headline').text - description = detail.find('./big-blurb').text - duration = parse_duration(detail.find('./duration').text) - timestamp = parse_iso8601(detail.attrib['date'][:-5]) - - thumbnails = [{ - 'url': thumbnail.text, - } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] - - formats = [] - for media_url in detail.findall('./url'): - playback_scenario = media_url.attrib['playback_scenario'] - fmt = { - 'url': media_url.text, - 'format_id': playback_scenario, - } - m = re.search(r'(?P\d+)K_(?P\d+)X(?P\d+)', playback_scenario) - if m: - fmt.update({ - 'vbr': int(m.group('vbr')) * 1000, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py deleted file mode 100644 index b208820fe..000000000 --- a/youtube_dl/extractor/moniker.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os.path -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - remove_start, - sanitized_Request, - urlencode_postdata, -) - - -class MonikerIE(InfoExtractor): - IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P[a-zA-Z0-9_-]+)' - - _TESTS = [{ - 'url': 'http://allmyvideos.net/jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://vidspot.net/l2ngsmhs8ci5', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'l2ngsmhs8ci5', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', - 'only_matching': True, - }, { - 'url': 'http://vidspot.net/2/v-ywDf99', - 'md5': '5f8254ce12df30479428b0152fb8e7ba', - 'info_dict': { - 'id': 'ywDf99', - 'ext': 'mp4', - 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', - 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', - }, - }, { - 'url': 'http://allmyvideos.net/v/v-HXZm5t', - 'only_matching': True, - }] - - def _real_extract(self, url): - orig_video_id = self._match_id(url) - video_id = remove_start(orig_video_id, 'embed-') - url = url.replace(orig_video_id, video_id) - assert re.match(self._VALID_URL, url) is not None - orig_webpage = self._download_webpage(url, video_id) - - if '>File Not Found<' in orig_webpage: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - error = self._search_regex( - r'class="err">([^<]+)<', orig_webpage, 'error', default=None) - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - builtin_url = self._search_regex( - r']+src=(["\'])(?P.+?/builtin-.+?)\1', - orig_webpage, 'builtin URL', default=None, group='url') - - if builtin_url: - req = sanitized_Request(builtin_url) - req.add_header('Referer', url) - webpage = self._download_webpage(req, video_id, 'Downloading builtin page') - title = self._og_search_title(orig_webpage).strip() - description = self._og_search_description(orig_webpage).strip() - else: - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = dict(fields) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - description = None - - # Could be several links with different quality - links = re.findall(r'"file" : "?(.+?)",', webpage) - # Assume the links are ordered in quality - formats = [{ - 'url': l, - 'quality': i, - } for i, l in enumerate(links)] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index e24396e79..bed5645f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor): title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - video_url = self._html_search_regex( - r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') + video_url = (self._html_search_regex( + (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'video URL', default=None, group='url') or + 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'Views\s+([^<]+)<', @@ -120,7 +123,7 @@ class MotherlessIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' _TESTS = [{ 'url': 'http://motherless.com/g/movie_scenes', 'info_dict': { diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/mychannels.py similarity index 59% rename from youtube_dl/extractor/makerschannel.py rename to youtube_dl/extractor/mychannels.py index f5d00e61d..b1ffe7848 100644 --- a/youtube_dl/extractor/makerschannel.py +++ b/youtube_dl/extractor/mychannels.py @@ -6,17 +6,17 @@ import re from .common import InfoExtractor -class MakersChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?Pvideo|production)_id=(?P[0-9]+)' +class MyChannelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?Pvideo|production)_id=(?P[0-9]+)' _TEST = { - 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', - 'md5': '624a512c6969236b5967bf9286345ad1', + 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', + 'md5': 'b8993daad4262dd68d89d651c0c52c45', 'info_dict': { - 'id': '849', + 'id': 'wUUDZZep6vQD', 'ext': 'mp4', - 'title': 'Landing a bus on a plane is an epic win', - 'uploader': 'ZoomIn', - 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', + 'title': 'Miss Holland joins VOTE LEAVE', + 'description': 'Miss Holland | #13 Not a potato', + 'uploader': 'Miss Holland', } } @@ -27,12 +27,12 @@ class MakersChannelIE(InfoExtractor): def extract_data_val(attr, fatal=False): return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') return { '_type': 'url_transparent', 'url': 'minoto:%s' % minoto_id, - 'id': extract_data_val('video-id', True), + 'id': url_id, 'title': extract_data_val('title', True), 'description': extract_data_val('description'), 'thumbnail': extract_data_val('image'), diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 9dc8f9ebc..c843f8649 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,7 +1,8 @@ from __future__ import unicode_literals -import re import base64 +import json +import re from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -9,6 +10,7 @@ from .adobepass import AdobePassIE from ..utils import ( find_xpath_attr, smuggle_url, + try_get, unescapeHTML, update_url_query, int_or_none, @@ -78,10 +80,14 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + permalink - video_data = self._download_json( + response = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, - })['data'][0]['attributes'] + 'fields[videos]': 'description,entitlement,episodeNumber,guid,keywords,seasonNumber,title,vChipRating', + 'fields[shows]': 'shortTitle', + 'include': 'show.shortTitle', + }) + video_data = response['data'][0]['attributes'] query = { 'mbr': 'true', 'manifest': 'm3u', @@ -103,10 +109,11 @@ class NBCIE(AdobePassIE): 'title': title, 'url': theplatform_url, 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), + 'tags': video_data.get('keywords'), 'season_number': int_or_none(video_data.get('seasonNumber')), 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), + 'episode': title, + 'series': try_get(response, lambda x: x['included'][0]['attributes']['shortTitle']), 'ie_key': 'ThePlatform', } @@ -169,6 +176,65 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') +class NBCSportsStreamIE(AdobePassIE): + _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P\d+)' + _TEST = { + 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', + 'info_dict': { + 'id': '206559', + 'ext': 'mp4', + 'title': 'Amgen Tour of California Women\'s Recap', + 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires Adobe Pass Authentication', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + live_source = self._download_json( + 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + video_id) + video_source = live_source['videoSources'][0] + title = video_source['title'] + source_url = None + for k in ('source', 'msl4source', 'iossource', 'hlsv4'): + sk = k + 'Url' + source_url = video_source.get(sk) or video_source.get(sk + 'Alt') + if source_url: + break + else: + source_url = video_source['ottStreamUrl'] + is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' + resource = self._get_mvpd_resource('nbcsports', title, video_id, '') + token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) + tokenized_url = self._download_json( + 'https://token.playmakerservices.com/cdn', + video_id, data=json.dumps({ + 'requestorId': 'nbcsports', + 'pid': video_id, + 'application': 'NBCSports', + 'version': 'v1', + 'platform': 'desktop', + 'cdn': 'akamai', + 'url': video_source['sourceUrl'], + 'token': base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + class CSNNEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 5e46a75c0..82d526c22 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -29,14 +29,13 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '828cea195be04e66057b846288295ba1', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', 'info_dict': { 'id': '128907', 'ext': 'mp4', 'title': 'Stiftung Warentest', 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'release_year': 2013, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2509, @@ -62,6 +61,7 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { # does not work via arc 'url': 'nexx:741:1269984', @@ -71,12 +71,26 @@ class NexxIE(InfoExtractor): 'ext': 'mp4', 'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 607, 'timestamp': 1518614955, 'upload_date': '20180214', }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -141,6 +155,139 @@ class NexxIE(InfoExtractor): self._handle_error(result) return result['result'] + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' + + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False + + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') @@ -220,72 +367,15 @@ class NexxIE(InfoExtractor): general = video['general'] title = general['title'] - stream_data = video['streamdata'] - language = general.get('language_raw') or '' + cdn = video['streamdata']['cdnType'] - # TODO: reverse more cdns - - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False self._sort_formats(formats) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r']+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' + data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 256a24d86..5e34d776b 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,7 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.nl + (?:www\.)?nickjr\.[a-z]{2} ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) ''' @@ -98,6 +98,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index df7f528be..dbe871f16 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -163,7 +163,7 @@ class NiconicoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: return True diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 8961309fd..65754c5e7 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( parse_iso8601, float_or_none, @@ -13,38 +12,11 @@ from ..utils import ( ) -class NineCNineMediaBaseIE(InfoExtractor): - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - -class NineCNineMediaStackIE(NineCNineMediaBaseIE): - IE_NAME = '9c9media:stack' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' - - def _real_extract(self, url): - destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() - stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' - stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) - - formats = [] - formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', stack_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', stack_id, - f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - return { - 'id': stack_id, - 'formats': formats, - } - - -class NineCNineMediaIE(NineCNineMediaBaseIE): +class NineCNineMediaIE(InfoExtractor): IE_NAME = '9c9media' + _GEO_COUNTRIES = ['CA'] _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' def _real_extract(self, url): destination_code, content_id = re.match(self._VALID_URL, url).groups() @@ -58,13 +30,26 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json(content_package_url, content_id) + content_package = self._download_json( + content_package_url, content_id, query={ + '$include': '[HasClosedCaptions]', + }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + if content_package.get('Constraints', {}).get('Security', {}).get('Type'): raise ExtractorError('This video is DRM protected.', expected=True) - stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] - multistacks = len(stacks) > 1 + manifest_base_url = content_package_url + 'manifest.' + formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) thumbnails = [] for image in content.get('Images', []): @@ -85,10 +70,12 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): continue container.append(e_name) - description = content.get('Desc') or content.get('ShortDesc') season = content.get('Season', {}) - base_info = { - 'description': description, + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), @@ -97,26 +84,19 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): 'series': content.get('Media', {}).get('Name'), 'tags': tags, 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, } - entries = [] - for stack in stacks: - stack_id = compat_str(stack['Id']) - entry = { - '_type': 'url_transparent', - 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), - 'id': stack_id, - 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, - 'duration': float_or_none(stack.get('Duration')), - 'ie_key': 'NineCNineMediaStack', + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] } - entry.update(base_info) - entries.append(entry) - return { - '_type': 'multi_video', - 'id': content_id, - 'title': title, - 'description': description, - 'entries': entries, - } + return info diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index a9f9b10c4..58b371ed7 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -65,7 +65,7 @@ class NocoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ff2153387..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE): https?:// (?:www\.)? (?: - npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) @@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) @@ -270,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', @@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE): class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P[^/]+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', @@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor): } } + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..7157e2390 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -16,12 +16,22 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _api_host = None + def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), - video_id, 'Downloading mediaelement JSON') + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id @@ -191,7 +201,7 @@ class NRKIE(NRKBaseIE): ) (?P[^?#&]+) ''' - _API_HOST = 'v8-psapi.nrk.no' + _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -237,8 +247,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' - + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '4e9ca6629f09e588ed240fb11619922a', diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d0bdd60b8..d264fe206 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -301,6 +301,16 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'only_matching': True, + }, { + 'url': 'https://oload.win/f/kUEfGclsU9o', + 'only_matching': True, + }, { + 'url': 'https://oload.download/f/kUEfGclsU9o', + 'only_matching': True, + }, { + # Its title has not got its extension but url has it + 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -362,8 +372,7 @@ class OpenloadIE(InfoExtractor): 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, - # Seems all videos have extensions in their titles - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, 'http_headers': headers, } diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 8ed3c6347..56a2a1083 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -42,7 +42,7 @@ class PacktPubIE(PacktPubBaseIE): _TOKEN = None def _real_initialize(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index d4b1d34ca..9eb027679 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -53,7 +53,7 @@ class PatreonIE(InfoExtractor): # needed. Keeping this commented for when this inevitably changes. ''' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f11d5da52..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -360,6 +361,50 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -422,6 +467,8 @@ class PBSIE(InfoExtractor): r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r']+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r']+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -456,7 +503,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -466,13 +514,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'[^:]+):| + https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P[^/]+)/videos/watch/(?P%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._download_json( + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = file_.get('fileUrl') + if not file_url or not isinstance(file_url, compat_str): + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index aacc5d4bb..a207ca9cb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -94,7 +94,7 @@ class PluralsightIE(PluralsightBaseIE): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -140,10 +140,10 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to log in') - def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): captions_post = { 'a': author, - 'cn': clip_id, + 'cn': clip_idx, 'lc': lang, 'm': name, } @@ -195,13 +195,13 @@ class PluralsightIE(PluralsightBaseIE): author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] - clip_id = qs.get('clip', [None])[0] + clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course_name,)): + if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_id) + display_id = '%s-%s' % (name, clip_idx) course = self._download_course(course_name, url, display_id) @@ -217,7 +217,7 @@ class PluralsightIE(PluralsightBaseIE): clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_id: + if compat_str(clip_index) == clip_idx: clip = clip_ break @@ -225,6 +225,7 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to resolve clip') title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] QUALITIES = { 'low': {'width': 640, 'height': 480}, @@ -277,7 +278,7 @@ class PluralsightIE(PluralsightBaseIE): clip_post = { 'author': author, 'includeCaptions': False, - 'clipIndex': int(clip_id), + 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', 'moduleName': name, @@ -330,10 +331,10 @@ class PluralsightIE(PluralsightBaseIE): # TODO: other languages? subtitles = self.extract_subtitles( - author, clip_id, 'en', name, duration, display_id) + author, clip_idx, 'en', name, duration, display_id) return { - 'id': clip.get('clipName') or clip['name'], + 'id': clip_id, 'title': title, 'duration': duration, 'creator': author, diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..ae7413fb5 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -53,7 +53,8 @@ class RBMARadioIE(InfoExtractor): 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', - } for abr in (96, 128, 256)] + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index bf200ea4d..8c016a77d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -19,7 +19,7 @@ class RDSIE(InfoExtractor): 'info_dict': { 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', 'timestamp': 1430397346, diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 8372925be..7b0aa6232 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -86,6 +86,10 @@ class RedditRIE(InfoExtractor): # youtube 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8b703800e..857434540 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -50,7 +50,7 @@ class RoosterTeethIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' formats = [] - for key, format_id in self._QUALITIES: - format_url = data.get(key + 'Url') - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) - thumbnails = [] - for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): - if thumbnail_id != 'default': - thumbnails.append({ - 'url': self._IMAGE_HOST + thumbnail_url, - 'id': thumbnail_id, - }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 'description': data.get('description') or data.get('subtitle'), - 'thumbnails': thumbnails, - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), - 'uploader': data.get('channel'), - 'tags': data.get('tags'), + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index cc6698f88..30e2a38b4 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -27,7 +27,7 @@ class SafariBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) + ) + ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, }] + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, video_id) - reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura reference id', group='id') - partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura widget id', group='id') - ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura uiconf id', group='id') + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') query = { 'wid': '_%s' % partner_id, @@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| techbus\.safaribooksonline\.com ) - /(?P<id>[^/]+)/?(?:[#?]|$) + /(?P<id>[^/]+) ''' _TESTS = [{ @@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'http://techbus.safaribooksonline.com/9780134426365', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + def _real_extract(self, url): course_id = self._match_id(url) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 8fc66732a..07b766b4a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -64,7 +64,7 @@ class SinaIE(InfoExtractor): # The video id is in the redirected url self.to_screen('Getting video id') request = HEADRequest(url) - (_, urlh) = self._download_webpage_handle(request, 'NA', False) + _, urlh = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) else: pseudo_id = mobj.group('pseudo_id') diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 69951e387..a363221bc 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,29 +19,33 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450', - 'md5': '42310bffe4ba3982db112b9cd3467328', + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { - 'id': '11638450', + 'id': '12041051', 'ext': 'mp4', - 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6', - 'description': 'md5:308853f6a5f9e2d55a30fc0654de415f', - 'duration': 39, - 'series': 'Le meilleur pâtissier', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, - 'params': { - 'skip_download': True, - }, - } + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + }.get(domain, ('6play', 'm6web')) data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id, - video_id, query={ + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 'csa': 5, 'with': 'clips', }) @@ -65,7 +69,14 @@ class SixPlayIE(InfoExtractor): subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) + if not urlh: + continue + asset_url = urlh.geturl() asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 46332e5c2..81c81c8d5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -181,7 +181,6 @@ class SoundcloudIE(InfoExtractor): thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') - ext = 'mp3' result = { 'id': track_id, 'uploader': info.get('user', {}).get('username'), @@ -215,8 +214,11 @@ class SoundcloudIE(InfoExtractor): track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): - abr = int_or_none(self._search_regex( - r'_(\d+)_url', key, 'audio bitrate', default=None)) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) if key.startswith('http'): stream_formats = [{ 'format_id': key, @@ -234,13 +236,14 @@ class SoundcloudIE(InfoExtractor): }] elif key.startswith('hls'): stream_formats = self._extract_m3u8_formats( - stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + stream_url, track_id, ext, entry_protocol='m3u8_native', m3u8_id=key, fatal=False) else: continue - for f in stream_formats: - f['abr'] = abr + if abr: + for f in stream_formats: + f['abr'] = abr formats.extend(stream_formats) @@ -250,7 +253,7 @@ class SoundcloudIE(InfoExtractor): formats.append({ 'format_id': 'fallback', 'url': update_url_query(info['stream_url'], query), - 'ext': ext, + 'ext': 'mp3', }) for f in formats: diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index fc995e8c1..4df7f4ddc 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -11,9 +11,9 @@ from .nexx import ( from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( - extract_attributes, - unified_strdate, - get_element_by_attribute, + parse_duration, + strip_or_none, + unified_timestamp, ) @@ -21,35 +21,38 @@ class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': '2c2754212136f35fb4b19767d242f66e', + 'md5': 'b57399839d055fccfeb9a0455c439868', 'info_dict': { - 'id': '1259285', + 'id': '563747', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, 'upload_date': '20130311', + 'timestamp': 1362994320, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', 'info_dict': { - 'id': '1309159', + 'id': '580988', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, 'upload_date': '20131115', + 'timestamp': 1384546642, }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', + 'md5': '97b91083a672d72976faa8433430afb9', 'info_dict': { - 'id': '1519126', + 'id': '601883', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', 'upload_date': '20140904', + 'timestamp': 1409834160, } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -62,59 +65,28 @@ class SpiegelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - nexx_id = self._search_regex( - r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) - if nexx_id: - domain_id = NexxIE._extract_domain_id(webpage) or '748' - return self.url_result( - 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), - video_id=nexx_id) - - video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) - - title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) - description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') - - base_url = self._search_regex( - [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], - webpage, 'server URL', group='url') - - xml_url = base_url + video_id + '.xml' - idoc = self._download_xml(xml_url, video_id) - - formats = [] - for n in list(idoc): - if n.tag.startswith('type') and n.tag != 'type6': - format_id = n.tag.rpartition('type')[2] - video_url = base_url + n.find('./filename').text - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - }) - duration = float(idoc[0].findall('./duration')[0].text) - - self._check_formats(formats, video_id) - self._sort_formats(formats) + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' return { + '_type': 'url_transparent', 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), 'title': title, - 'description': description.strip() if description else None, - 'duration': duration, - 'upload_date': unified_strdate(video_data.get('data-video-date')), - 'formats': formats, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), } diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index a7b1b3b5f..e76522b45 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -1,55 +1,46 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' +class BellatorIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' _TESTS = [{ - 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', - 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', 'ext': 'mp4', - 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', - 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', - 'timestamp': 1388120400, - 'upload_date': '20131227', + 'title': 'Douglas Lima vs. Paul Daley - Round 1', + 'description': 'md5:805a8dd29310fd611d32baba2f767885', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { - 'url': 'http://www.spike.com/full-episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-209', - 'md5': 'b25c6f16418aefb9ad5a6cae2559321f', + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', 'info_dict': { 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', 'ext': 'mp4', 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1', 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', }, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m/', - 'only_matching': True, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', - 'only_matching': True, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] - _FEED_URL = 'http://www.spike.com/feeds/mrss/' - _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' - _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - - def _extract_mgid(self, webpage): - mgid = super(SpikeIE, self)._extract_mgid(webpage) - if mgid is None: - url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id') - video_type, episode_id = url_parts.split('/', 1) - mgid = 'mgid:arc:{0}:spike.com:{1}'.format(video_type, episode_id) - return mgid diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f71eab8b2..0901c3163 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -12,6 +12,8 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + orderedSet, + strip_or_none, try_get, urljoin, compat_str, @@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/kanaler/svt1', 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, }] + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) webpage = self._download_webpage(url, video_id) @@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) - def adjust_title(info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - adjust_title(info_dict) + self._adjust_title(info_dict) return info_dict - video_id = self._search_regex( + svt_id = self._search_regex( r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id', default=None) + webpage, 'video id') - if video_id: - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - info_dict['title'] = re.sub( - r'\s*\|\s*.+?$', '', - info_dict.get('episode') or self._og_search_title(webpage)) - adjust_title(info_dict) - return info_dict + return self._extract_by_video_id(svt_id, webpage) class SVTSeriesIE(SVTPlayBaseIE): @@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE): return self.playlist_result( entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index edc31729d..784f8ed66 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) from ..utils import ( float_or_none, int_or_none, @@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE): def _real_extract(self, url): site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(self._search_regex( + drupal_settings = self._parse_json(self._search_regex( r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', - webpage, 'drupal setting'), display_id)['turner_playlist'][0] + webpage, 'drupal setting'), display_id) + video_data = drupal_settings['turner_playlist'][0] media_id = video_data['mediaID'] title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.vgtf.net/token/token_spe', - m3u8_url, media_id, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = float_or_none(chapter.get('start')) - duration = float_or_none(chapter.get('duration')) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - }) - self._sort_formats(formats) + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) thumbnails = [] for image_id, image in video_data.get('images', {}).items(): @@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE): }) thumbnails.append(i) - return { + info.update({ 'id': media_id, 'title': title, 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': duration, + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), 'timestamp': int_or_none(video_data.get('created')), 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), - 'cahpters': chapters, 'thumbnails': thumbnails, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9056c8cbc..73469cc5d 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,35 +1,34 @@ # coding: utf-8 from __future__ import unicode_literals -import binascii -import re import json -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, -) +from .turner import TurnerBaseIE from ..utils import ( - ExtractorError, - qualities, determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, + qualities, ) -class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' +class TeamcocoIE(TurnerBaseIE): + _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -40,7 +39,8 @@ class TeamcocoIE(InfoExtractor): 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -49,6 +49,8 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -63,110 +65,125 @@ class TeamcocoIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', + }, { + 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', + 'only_matching': True, } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) + + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'http://teamcoco.com/graphql/', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }))['data'][find_object] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + display_id = self._match_id(url) - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) + response = self._graphql_call('''{ + %s(slug: "%s") { + ... on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + file { + url + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') + record = response['record'] + video_id = record['id'] - data = None - - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = compat_b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): - continue - - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) - - formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction - continue - else: - if filed['url'].startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) - else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) - - formats.append({ - 'url': filed['url'], - 'ext': 'mp4', - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - - self._sort_formats(formats) - - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 0c6f70784..a586f30ad 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -32,7 +32,7 @@ class TennisTVIE(InfoExtractor): _NETRC_MACHINE = 'tennistv' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e595c4a69..903f47380 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -19,6 +19,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 36f6c1673..a51fa6515 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -36,7 +36,7 @@ class TubiTvIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 786143525..edbb0aa69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,11 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata +) class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -97,6 +104,45 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ + 'user[email]': username, + 'user[password]': password + }) + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) + if login_errors: + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') + def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') @@ -105,11 +151,19 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) + redirect_url = compat_str(urlh.geturl()) + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain sensitive media. ' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url', default=None) if iframe_url is None: - return self.url_result(urlh.geturl(), 'Generic') + return self.url_result(redirect_url, 'Generic') iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 7e51de89e..c7a5f5a63 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -62,7 +62,7 @@ class TuneInBaseIE(InfoExtractor): return { 'id': content_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index e73b64aeb..2b7b0d6e1 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -9,6 +9,7 @@ from ..utils import ( xpath_text, int_or_none, determine_ext, + float_or_none, parse_duration, xpath_attr, update_url_query, @@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: query = { 'path': secure_path, - 'videoId': content_id, } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id if ap_data.get('auth_required'): query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) auth = self._download_xml( @@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE): 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'is_live': is_live, } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index cfcce020a..51923e44a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_iso8601, - try_get, - determine_ext, ) @@ -78,42 +77,25 @@ class TV4IE(InfoExtractor): title = info['title'] - subtitles = {} - formats = [] - # http formats are linked with unresolvable host - for kind in ('hls3', ''): - data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, - video_id, 'Downloading sources JSON', query={ - 'protocol': kind, - 'videoFormat': 'MP4+WEBVTT', - }) - items = try_get(data, lambda x: x['playback']['items']['item']) - if not items: - continue - if isinstance(items, dict): - items = [items] - for item in items: - manifest_url = item.get('url') - if not isinstance(manifest_url, compat_str): - continue - ext = determine_ext(manifest_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_akamai_formats( - manifest_url, video_id, { - 'hls': 'tv4play-i.akamaihd.net', - })) - elif ext == 'webvtt': - subtitles = self._merge_subtitles( - subtitles, { - 'sv': [{ - 'url': manifest_url, - 'ext': 'vtt', - }]}) + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_ism_formats( + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): self.raise_geo_restricted(countries=self._GEO_COUNTRIES) @@ -124,7 +106,7 @@ class TV4IE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, - 'subtitles': subtitles, + # 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py new file mode 100644 index 000000000..2b2630b91 --- /dev/null +++ b/youtube_dl/extractor/tvnet.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', + 'info_dict': { + 'id': '129999', + 'ext': 'mp4', + 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'<title>([^<]+)<', webpage, 'title') + title = re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = stream.get('url') + if (stream_url in stream_urls or not stream_url or + not isinstance(stream_url, compat_str)): + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 808571ece..60937616f 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor): video_id = compat_str(info['id']) title = info['title'] - mpd_url = info['manifest']['dashclear'] - if not mpd_url: + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: if info.get('isDrm'): raise ExtractorError( 'Video %s is DRM protected' % video_id, expected=True) if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + raise self.raise_geo_restricted() if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - - mpd_url = update_url_query(mpd_url, {'filter': ''}) - formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') @@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ (?P<show_id>[^/]+)/ (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) ''' @@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE): }] def _real_extract(self, url): - display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) info = self._call_api( 'movies/' + display_id, display_id, query={ 'fields': ','.join(self._VIDEO_FIELDS), + 'station': mobj.group(1), }) return self._extract_video(info, display_id) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 84597b55e..e09b5f804 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -227,14 +227,16 @@ class TVPlayIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, 'geo country', default=None) if geo_country: - self._initialize_geo_bypass([geo_country.upper()]) + self._initialize_geo_bypass({'countries': [geo_country.upper()]}) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4c11fd3c3..e01f11331 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -8,6 +8,7 @@ import random from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -16,11 +17,14 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + float_or_none, int_or_none, - js_to_json, orderedSet, parse_duration, parse_iso8601, + qualities, + try_get, + unified_timestamp, update_url_query, urlencode_postdata, urljoin, @@ -45,10 +49,11 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _call_api(self, path, item_id, note): + def _call_api(self, path, item_id, *args, **kwargs): + kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, note, - headers={'Client-ID': self._CLIENT_ID}) + '%s/%s' % (self._API_BASE, path), item_id, + *args, **compat_kwargs(kwargs)) self._handle_error(response) return response @@ -56,7 +61,7 @@ class TwitchBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -622,21 +627,23 @@ class TwitchStreamIE(TwitchBaseIE): } -class TwitchClipsIE(InfoExtractor): +class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': 'AggressiveCobraPoooound', + 'id': '42850523', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', 'creator': 'EA', 'uploader': 'stereotype_', - 'uploader_id': 'stereotype_', + 'uploader_id': '43566419', }, }, { # multiple formats @@ -647,34 +654,63 @@ class TwitchClipsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + status = self._download_json( + 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, + video_id) - clip = self._parse_json( - self._search_regex( - r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), - video_id, transform_source=js_to_json) + formats = [] - title = clip.get('title') or clip.get('channel_title') or self._og_search_title(webpage) - - formats = [{ - 'url': option['source'], - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - } for option in clip.get('quality_options', []) if option.get('source')] - - if not formats: - formats = [{ - 'url': clip['clip_video_url'], - }] + for option in status['quality_options']: + if not isinstance(option, dict): + continue + source = option.get('source') + if not source or not isinstance(source, compat_str): + continue + formats.append({ + 'url': source, + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frame_rate')), + }) self._sort_formats(formats) - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), - 'uploader': clip.get('curator_login'), - 'uploader_id': clip.get('curator_display_name'), + info = { 'formats': formats, } + + clip = self._call_api( + 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ + 'Accept': 'application/vnd.twitchtv.v5+json', + }) + + if clip: + quality_key = qualities(('tiny', 'small', 'medium')) + thumbnails = [] + thumbnails_dict = clip.get('thumbnails') + if isinstance(thumbnails_dict, dict): + for thumbnail_id, thumbnail_url in thumbnails_dict.items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + + info.update({ + 'id': clip.get('tracking_id') or video_id, + 'title': clip.get('title') or video_id, + 'duration': float_or_none(clip.get('duration')), + 'views': int_or_none(clip.get('views')), + 'timestamp': unified_timestamp(clip.get('created_at')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + }) + else: + info.update({ + 'title': video_id, + 'id': video_id, + }) + + return info diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index d7e425041..de41065d6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', + 'thumbnail': r're:^https?://.*$', }, }, { @@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE): }, ] + _API_BASE = 'https://api.twitter.com/1.1' + def _parse_media_info(self, media_info, video_id): formats = [] for media_variant in media_info.get('variants', []): @@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE): main_script, 'bearer token') # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), video_id, 'Downloading API data', headers={ 'Authorization': 'Bearer ' + bearer_token, @@ -223,15 +225,49 @@ class TwitterCardIE(TwitterBaseIE): formats.extend(self._extract_mobile_formats(username, video_id)) if formats: + title = self._search_regex(r'<title>([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration break + if not formats: + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + '%s/guest/activate.json' % self._API_BASE, video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) + config = self._download_json( + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), + video_id, headers=headers) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - return { 'id': video_id, 'title': title, @@ -375,6 +411,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bf1134e3f..a7196997e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, js_to_json, sanitized_Request, + try_get, unescapeHTML, urlencode_postdata, ) @@ -105,7 +106,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', }) def _handle_error(self, response): @@ -150,7 +151,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -303,9 +304,25 @@ class UdemyIE(InfoExtractor): 'url': src, }) - download_urls = asset.get('download_urls') - if isinstance(download_urls, dict): - extract_formats(download_urls.get('Video')) + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) + + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = cc.get('url') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) view_html = lecture.get('view_html') if view_html: diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index ab823814b..f3eaee6b3 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_duration, parse_iso8601, + urlencode_postdata, ) class UFCTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _NETRC_MACHINE = 'ufctv' _TEST = { 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', 'info_dict': { @@ -26,6 +29,21 @@ class UFCTVIE(InfoExtractor): } } + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + def _real_extract(self, url): display_id = self._match_id(url) video_data = self._download_json(url, display_id, query={ diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 80a643dfe..31eee0ba7 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -75,7 +75,7 @@ class VesselIE(InfoExtractor): 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index c21a09c01..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, @@ -178,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -229,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..d70283479 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -54,7 +54,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 1f29c273f..c43d1a1e8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,24 +1,27 @@ from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, clean_html, determine_ext, int_or_none, js_to_json, + parse_age_limit, parse_duration, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -60,8 +63,10 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): formats = [] has_bitrate = False - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: file_ = source.get('file') if not file_: continue @@ -70,7 +75,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): format_id = source.get('label') or ext if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], @@ -85,6 +91,13 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'tbr': bitrate, 'height': height, }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') self._sort_formats(formats, field_preference) @@ -109,10 +122,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'display_id': 'lost_for_life', 'ext': 'mp4', 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -125,7 +141,9 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', } }, { # Film is not playable in your area. @@ -138,9 +156,6 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.winnersview.com/videos/the-good-son', 'only_matching': True, - }, { - 'url': 'http://www.kesari.tv/news/video/1461919076414', - 'only_matching': True, }, { # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', @@ -156,45 +171,96 @@ class ViewLiftIE(ViewLiftBaseIE): raise ExtractorError( 'Film %s is not available.' % display_id, expected=True) - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)
(.+?)
', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)
', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ad2a2a4b7..546de95d8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,7 +88,7 @@ class VikiBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a026526b2..3baa2d075 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,7 +37,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) @@ -989,10 +989,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P[0-9]+)/likes/?(?:$|[?#]|sort:)' + _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P[^/]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' - _TEST = { + _TESTS = [{ 'url': 'https://vimeo.com/user755559/likes/', 'playlist_mincount': 293, 'info_dict': { @@ -1000,7 +1000,10 @@ class VimeoLikesIE(InfoExtractor): 'description': 'See all the videos urza likes', 'title': 'Videos urza likes', }, - } + }, { + 'url': 'https://vimeo.com/stormlapse/likes', + 'only_matching': True, + }] def _real_extract(self, url): user_id = self._match_id(url) @@ -1009,7 +1012,7 @@ class VimeoLikesIE(InfoExtractor): self._search_regex( r'''(?x)
  • .*?
  • \s* - ''', webpage, 'page count'), + ''', webpage, 'page count', default=1), 'page count', fatal=True) PAGE_SIZE = 12 title = self._html_search_regex( @@ -1017,7 +1020,7 @@ class VimeoLikesIE(InfoExtractor): description = self._html_search_meta('description', webpage) def _get_page(idx): - page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( + page_url = 'https://vimeo.com/%s/likes/page:%d/sort:date' % ( user_id, idx + 1) webpage = self._download_webpage( page_url, user_id, @@ -1037,7 +1040,7 @@ class VimeoLikesIE(InfoExtractor): return { '_type': 'playlist', - 'id': 'user%s_likes' % user_id, + 'id': '%s_likes' % user_id, 'title': title, 'description': description, 'entries': pl, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b50d4f170..29002b35f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -32,7 +32,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index b382338fa..be0bcba15 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -69,7 +69,7 @@ class WatchBoxIE(InfoExtractor): source = self._parse_json( self._search_regex( - r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', + r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', default='{}'), video_id, transform_source=js_to_json, fatal=False) or {} diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index c022fb33e..3dab9145b 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -36,7 +36,8 @@ class WimpIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youtube_id = self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 04aeb91af..89c8b7f8d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -37,6 +37,7 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, + qualities, remove_quotes, remove_start, smuggle_url, @@ -84,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: @@ -509,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -527,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -596,7 +601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', @@ -637,7 +642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'TJ Kirk', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', @@ -667,10 +672,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -732,7 +737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, @@ -759,7 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -768,8 +773,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -884,7 +890,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', @@ -892,7 +898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -949,7 +957,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'duration': 4060, 'upload_date': '20151119', - 'uploader': 'Bernie 2016', + 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', @@ -984,6 +992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # YouTube Red video with episode data @@ -992,7 +1001,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'description': 'md5:25b78d2f64ae81719f5c96319889b736', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1025,7 +1034,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', 'license': 'Standard YouTube License', - 'view_count': int, }, 'params': { 'skip_download': True, @@ -1537,7 +1545,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map'): + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) @@ -1693,125 +1701,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = m_episode.group('series') - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) + def _extract_filesize(media_url): + return int_or_none(self._search_regex( + r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() @@ -1838,6 +1730,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } + q = qualities(['small', 'medium', 'hd720']) formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) @@ -1917,13 +1810,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + + quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + more_fields = { - 'filesize': int_or_none(url_data.get('clen', [None])[0]), + 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], + 'format_note': quality, + 'quality': q(quality), } for key, value in more_fields.items(): if value: @@ -1969,11 +1868,140 @@ class YoutubeIE(YoutubeBaseInfoExtractor): a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - unavailable_message = extract_unavailable_message() - if unavailable_message: - raise ExtractorError(unavailable_message, expected=True) + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + # uploader + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + + m_music = re.search( + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + + chapters = self._extract_chapters(description_original, video_duration) + # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd_fatal = True @@ -1990,6 +2018,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for df in self._extract_mpd_formats( mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df @@ -2037,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': video_uploader_url, 'upload_date': upload_date, 'license': video_license, - 'creator': video_creator, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, + 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -2062,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': series, 'season_number': season_number, 'episode_number': episode_number, + 'track': track, + 'artist': artist, } diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 773073d85..b5a3a0716 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -24,7 +24,7 @@ class ZattooBaseIE(InfoExtractor): _power_guide_hash = None def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: self.raise_login_required( 'A valid %s account is needed to access this media.' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3e4ac03a2..e7d8e8910 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -203,7 +203,7 @@ def parseOpts(overrideArguments=None): network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', - help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental ' + help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' 'SOCKS proxy, specify a proper scheme. For example ' 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' 'for direct connection') @@ -232,7 +232,7 @@ def parseOpts(overrideArguments=None): '--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.') + 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') geo.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', @@ -240,15 +240,19 @@ def parseOpts(overrideArguments=None): geo.add_option( '--geo-bypass', action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--no-geo-bypass', action='store_false', dest='geo_bypass', default=True, - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--geo-bypass-country', metavar='CODE', dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') + geo.add_option( + '--geo-bypass-ip-block', metavar='IP_BLOCK', + dest='geo_bypass_ip_block', default=None, + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -498,7 +502,7 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected file size (experimental)') + help='Set file xattribute ytdl.filesize with expected file size') downloader.add_option( '--hls-prefer-native', dest='hls_prefer_native', action='store_true', default=None, @@ -837,11 +841,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3ea1afcf3..757b496a1 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -77,7 +77,7 @@ class FFmpegPostProcessor(PostProcessor): def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + prefer_ffmpeg = True self.basename = None self.probe_basename = None @@ -85,7 +85,7 @@ class FFmpegPostProcessor(PostProcessor): self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -117,19 +117,19 @@ class FFmpegPostProcessor(PostProcessor): (p, get_exe_version(p, args=['-version'])) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b460393bf..6a3199fb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1228,7 +1228,7 @@ def unified_timestamp(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): @@ -2272,7 +2272,10 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - return TV_PARENTAL_GUIDELINES.get(s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) + if m: + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] + return None def strip_jsonp(code): @@ -2664,6 +2667,7 @@ def dfxp2srt(dfxp_data): ] _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', 'tts': 'http://www.w3.org/ns/ttml#styling', }) @@ -2755,7 +2759,9 @@ def dfxp2srt(dfxp_data): repeat = False while True: for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue parent_style_id = style.get('style') if parent_style_id: if parent_style_id not in styles: @@ -3534,10 +3540,13 @@ class GeoUtils(object): } @classmethod - def random_ipv4(cls, code): - block = cls._country_ip_map.get(code.upper()) - if not block: - return None + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block addr, preflen = block.split('/') addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] addr_max = addr_min | (0xffffffff >> int(preflen)) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04896efc8..8fbafd6a1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.01' +__version__ = '2018.06.25'