diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 058eb4321..7607e0e03 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.10.05*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.10.05** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.11.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.11.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.10.05 +[debug] youtube-dl version 2018.11.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 333acee80..bbcb78808 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -296,5 +296,26 @@ title = self._search_regex( ### Use safe conversion functions -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` diff --git a/ChangeLog b/ChangeLog index 86cf489b1..fa5de8b04 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,61 @@ +version 2018.11.07 + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + +version 2018.11.03 + +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adopt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* [theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cbnc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + +version 2018.10.29 + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor (#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + version 2018.10.05 Extractors diff --git a/README.md b/README.md index fdd115c9b..35c3de512 100644 --- a/README.md +++ b/README.md @@ -1168,7 +1168,28 @@ title = self._search_regex( ### Use safe conversion functions -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` # EMBEDDING YOUTUBE-DL diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f167a6ddc..24c3254c3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -84,8 +84,6 @@ - **awaan:season** - **awaan:video** - **AZMedien**: AZ Medien videos - - **AZMedienPlaylist**: AZ Medien playlists - - **AZMedienShowPlaylist**: AZ Medien show playlists - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** @@ -178,6 +176,7 @@ - **Clyp** - **cmt.com** - **CNBC** + - **CNBCVideo** - **CNN** - **CNNArticle** - **CNNBlogs** @@ -251,6 +250,7 @@ - **EchoMsk** - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson + - **ehftv** - **eHow** - **EinsUndEinsTV** - **Einthusan** @@ -445,6 +445,8 @@ - **limelight:channel** - **limelight:channel_list** - **LineTV** + - **linkedin:learning** + - **linkedin:learning:course** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -818,7 +820,7 @@ - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - - **SportBoxEmbed** + - **SportBox** - **SportDeutschland** - **SpringboardPlatform** - **Sprout** @@ -909,7 +911,6 @@ - **TV2** - **tv2.hu** - **TV2Article** - - **TV3** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -931,6 +932,7 @@ - **TVPlayer** - **TVPlayHome** - **Tweakers** + - **TwitCasting** - **twitch:chapter** - **twitch:clips** - **twitch:profile** diff --git a/test/helper.py b/test/helper.py index dfee217a9..aa9a1c9b2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -7,6 +7,7 @@ import json import os.path import re import types +import ssl import sys import youtube_dl.extractor @@ -244,3 +245,12 @@ def expect_warnings(ydl, warnings_re): real_warning(w) ydl.report_warning = _report_warning + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4833396a5..06be72616 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -9,11 +9,30 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value -from youtube_dl.compat import compat_etree_fromstring +from test.helper import FakeYDL, expect_dict, expect_value, http_server_port +from youtube_dl.compat import compat_etree_fromstring, compat_http_server from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError +import threading + + +TEAPOT_RESPONSE_STATUS = 418 +TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" + + +class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == '/teapot': + self.send_response(TEAPOT_RESPONSE_STATUS) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + self.wfile.write(TEAPOT_RESPONSE_BODY.encode()) + else: + assert False class TestIE(InfoExtractor): @@ -743,6 +762,25 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) + def test_response_with_expected_status_returns_content(self): + # Checks for mitigations against the effects of + # that affect Python 3.4.1+, which + # manifest as `_download_webpage`, `_download_xml`, `_download_json`, + # or the underlying `_download_webpage_handle` returning no content + # when a response matches `expected_status`. + + httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), InfoExtractorTestRequestHandler) + port = http_server_port(httpd) + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = True + server_thread.start() + + (content, urlh) = self.ie._download_webpage_handle( + 'http://127.0.0.1:%d/teapot' % port, None, + expected_status=TEAPOT_RESPONSE_STATUS) + self.assertEqual(content, TEAPOT_RESPONSE_BODY) + if __name__ == '__main__': unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 5cf2bf1a5..750472281 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,26 +9,16 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import try_rm +from test.helper import http_server_port, try_rm from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD from youtube_dl.utils import encodeFilename -import ssl import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - TEST_SIZE = 10 * 1024 diff --git a/test/test_http.py b/test/test_http.py index 409fec9c8..3ee0a5dda 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,6 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import http_server_port from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -16,15 +17,6 @@ import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 6eb8bbb6e..883dcee7a 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + merge_dicts, mimetype2ext, url_or_none, ) @@ -12,59 +13,83 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', - 'age_limit': 0, + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, }, - # 'skip': 'Extremely unreliable', - } + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work - webpage = self._download_webpage( - 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) - title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) - file_list = self._parse_json( + options = self._parse_json( self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, - 'file list'), + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), video_id) + player = options['plugins']['sabaPlayerPlugin'] + formats = [] - for item in file_list[0]: - file_url = url_or_none(item.get('file')) - if not file_url: - continue - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': label or ext, - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', default=None)), - }) - self._sort_formats(formats) + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) - thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) + info = self._search_json_ld(webpage, video_id, default={}) - return { + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': self._family_friendly_search(webpage), + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index 68f26e2ca..a57a5f114 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -1,213 +1,90 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - get_element_by_class, - get_element_by_id, - strip_or_none, - urljoin, -) -class AZMedienBaseIE(InfoExtractor): - def _kaltura_video(self, partner_id, entry_id): - return self.url_result( - 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), - video_id=entry_id) - - -class AZMedienIE(AZMedienBaseIE): +class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// (?:www\.)? - (?: + (?P telezueri\.ch| telebaern\.tv| telem1\.ch )/ - [0-9]+-show-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - (?: - /[0-9]+-segment-(?:[^/\#]+\#)?| - \# - )| - \# + [^/]+/ + (?P + [^/]+-(?P\d+) ) - (?P[^\#]+) + (?: + \#video= + (?P + [_0-9a-z]+ + ) + )? ''' _TESTS = [{ - # URL with 'segment' - 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { - 'id': '1_2444peh4', + 'id': '1_anruz3wy', 'ext': 'mp4', - 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', - 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', - 'uploader_id': 'TeleZ?ri', - 'upload_date': '20161218', - 'timestamp': 1482084490, + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'description': 'md5:dd9f96751ec9c35e409a698a328402f3', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, }, 'params': { 'skip_download': True, }, }, { - # URL with 'segment' and fragment: - 'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger', - 'only_matching': True - }, { - # URL with 'episode' and fragment: - 'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz', - 'only_matching': True - }, { - # URL with 'show' and fragment: - 'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch', + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - partner_id = self._search_regex( - r']+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)', - webpage, 'kaltura partner id') - entry_id = self._html_search_regex( - r']+data-id=(["\'])(?P(?:(?!\1).)+)\1[^>]+data-slug=["\']%s' - % re.escape(video_id), webpage, 'kaltura entry id', group='id') - - return self._kaltura_video(partner_id, entry_id) - - -class AZMedienPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?P[0-9]+- - (?: - show| - topic| - themen - )-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - )? - )$ - ''' - - _TESTS = [{ - # URL with 'episode' - 'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'info_dict': { - 'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'title': 'News - Donnerstag, 15. Dezember 2016', - }, - 'playlist_count': 9, - }, { - # URL with 'themen' - 'url': 'http://www.telem1.ch/258-themen-tele-m1-classics', - 'info_dict': { - 'id': '258-themen-tele-m1-classics', - 'title': 'Tele M1 Classics', - }, - 'playlist_mincount': 15, - }, { - # URL with 'topic', contains nested playlists - 'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen', - 'only_matching': True, - }, { - # URL with 'show' only - 'url': 'http://www.telezueri.ch/86-show-talktaeglich', - 'only_matching': True - }] + _PARTNER_ID = '1719221' def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + entry_id = mobj.group('kaltura_id') - entries = [] + if not entry_id: + webpage = self._download_webpage(url, video_id) + api_path = self._search_regex( + r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']', + webpage, 'api path') + api_url = 'https://www.%s%s' % (mobj.group('host'), api_path) + payload = { + 'query': '''query VideoContext($articleId: ID!) { + article: node(id: $articleId) { + ... on Article { + mainAssetRelation { + asset { + ... on VideoAsset { + kalturaId + } + } + } + } + } + }''', + 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, + } + json_data = self._download_json( + api_url, video_id, headers={ + 'Content-Type': 'application/json', + }, + data=json.dumps(payload).encode()) + entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id', default=None) - - if partner_id: - entries = [ - self._kaltura_video(partner_id, m.group('id')) - for m in re.finditer( - r'data-id=(["\'])(?P(?:(?!\1).)+)\1', webpage)] - - if not entries: - entries = [ - self.url_result(m.group('url'), ie=AZMedienIE.ie_key()) - for m in re.finditer( - r']+data-real=(["\'])(?Phttp.+?)\1', webpage)] - - if not entries: - entries = [ - # May contain nested playlists (e.g. [1]) thus no explicit - # ie_key - # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen) - self.url_result(urljoin(url, m.group('url'))) - for m in re.finditer( - r']+name=[^>]+href=(["\'])(?P/.+?)\1', webpage)] - - title = self._search_regex( - r'episodeShareTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'title', - default=strip_or_none(get_element_by_id( - 'video-title', webpage)), group='title') - - return self.playlist_result(entries, show_id, title) - - -class AZMedienShowPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien show playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?: - all-episodes| - alle-episoden - )/ - (?P<id>[^/?#&]+) - ''' - - _TEST = { - 'url': 'http://www.telezueri.ch/all-episodes/astrotalk', - 'info_dict': { - 'id': 'astrotalk', - 'title': 'TeleZüri: AstroTalk - alle episoden', - 'description': 'md5:4c0f7e7d741d906004266e295ceb4a26', - }, - 'playlist_mincount': 13, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - episodes = get_element_by_class('search-mobile-box', webpage) - entries = [self.url_result( - urljoin(url, m.group('url'))) for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)] - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index ab651d1c8..f2ca7a337 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,19 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none - - -_translation_table = { - 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', - 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', - 'y': 'l', 'z': 'i', - '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', -} - - -def _decode(s): - return ''.join(_translation_table.get(c, c) for c in s) +from ..utils import ( + int_or_none, + url_or_none, +) class CliphunterIE(InfoExtractor): @@ -60,14 +51,14 @@ class CliphunterIE(InfoExtractor): formats = [] for format_id, f in gexo_files.items(): - video_url = f.get('url') + video_url = url_or_none(f.get('url')) if not video_url: continue fmt = f.get('fmt') height = f.get('h') format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id formats.append({ - 'url': _decode(video_url), + 'url': video_url, 'format_id': format_id, 'width': int_or_none(f.get('w')), 'height': int_or_none(height), diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index d354d9f95..6889b0f40 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor from ..utils import smuggle_url @@ -34,3 +35,32 @@ class CNBCIE(InfoExtractor): {'force_smil_url': True}), 'id': video_id, } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2dbf81e6e..e5f8136fc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -69,6 +69,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -605,6 +606,11 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err return err.fp if errnote is False: @@ -1213,10 +1219,10 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ - 'url': e.get('contentUrl'), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 045be0ab5..4a68d092b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re import json +import xml.etree.ElementTree as etree import zlib from hashlib import sha1 @@ -398,7 +399,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'Downloading subtitles for ' + sub_name, data={ 'subtitle_script_id': sub_id, }) - if sub_doc is None: + if not isinstance(sub_doc, etree.Element): continue sid = sub_doc.get('id') iv = xpath_text(sub_doc, 'iv', 'subtitle iv') @@ -515,7 +516,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_quality': stream_quality, 'current_page': url, }) - if streamdata is not None: + if isinstance(streamdata, etree.Element): stream_info = streamdata.find('./{default}preload/stream_info') if stream_info is not None: stream_infos.append(stream_info) @@ -526,7 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_format': stream_format, 'video_encode_quality': stream_quality, }) - if stream_info is not None: + if isinstance(stream_info, etree.Element): stream_infos.append(stream_info) for stream_info in stream_infos: video_encode_id = xpath_text(stream_info, './video_encode_id') @@ -598,10 +599,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text series = self._html_search_regex( r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', webpage, 'series', fatal=False) - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number')) + season = episode = episode_number = duration = thumbnail = None + + if isinstance(metadata, etree.Element): + season = xpath_text(metadata, 'series_title') + episode = xpath_text(metadata, 'episode_title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + duration = float_or_none(media_metadata.get('duration'), 1000) + thumbnail = xpath_text(metadata, 'episode_image_url') + + if not episode: + episode = media_metadata.get('title') + if not episode_number: + episode_number = int_or_none(media_metadata.get('episode_number')) + if not thumbnail: + thumbnail = media_metadata.get('thumbnail', {}).get('url') season_number = int_or_none(self._search_regex( r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -611,8 +624,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'duration': float_or_none(media_metadata.get('duration'), 1000), - 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'), + 'duration': duration, + 'thumbnail': thumbnail, 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index af3978035..4f75a2a30 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -49,6 +49,9 @@ class DailyMailIE(InfoExtractor): 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) video_sources = self._download_json(sources_url, video_id) + body = video_sources.get('body') + if body: + video_sources = body formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 17b576df3..b2b00c86f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -88,11 +88,7 @@ from .awaan import ( AWAANLiveIE, AWAANSeasonIE, ) -from .azmedien import ( - AZMedienIE, - AZMedienPlaylistIE, - AZMedienShowPlaylistIE, -) +from .azmedien import AZMedienIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE @@ -209,7 +205,10 @@ from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE -from .cnbc import CNBCIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) from .cnn import ( CNNIE, CNNBlogsIE, @@ -540,6 +539,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + EHFTVIE, ITTFIE, ) from .lci import LCIIE @@ -569,6 +569,10 @@ from .limelight import ( LimelightChannelListIE, ) from .line import LineTVIE +from .linkedin import ( + LinkedInLearningIE, + LinkedInLearningCourseIE, +) from .litv import LiTVIE from .liveleak import ( LiveLeakIE, @@ -1043,7 +1047,7 @@ from .spike import ( ) from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE @@ -1189,6 +1193,7 @@ from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import TwitCastingIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1473,3 +1478,4 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zype import ZypeIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 97cfe0fc3..74954049d 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2a48667f0..59cf03faf 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE @@ -114,6 +114,7 @@ from .apa import APAIE from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE +from .zype import ZypeIE class GenericIE(InfoExtractor): @@ -2070,6 +2071,20 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Zype embed + 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'add_ie': [ZypeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', @@ -2636,9 +2651,9 @@ class GenericIE(InfoExtractor): return self.url_result(tvc_url, 'TVC') # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) + sportbox_urls = SportBoxIE._extract_urls(webpage) if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) @@ -3129,6 +3144,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + zype_urls = ZypeIE._extract_urls(webpage) + if zype_urls: + return self.playlist_from_matches( + zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index cb51cef2d..86c014b07 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -15,7 +15,7 @@ from ..utils import ( class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' - _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] @@ -65,7 +65,11 @@ class IviIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', - } + }, + { + 'url': 'https://www.ivi.tv/watch/33560/', + 'only_matching': True, + }, ] # Sorted by quality diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index c7f813370..fa217365a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -32,7 +33,8 @@ class Laola1TvEmbedIE(InfoExtractor): def _extract_token_url(self, stream_access_url, video_id, data): return self._download_json( - stream_access_url, video_id, headers={ + self._proto_relative_url(stream_access_url, 'https:'), video_id, + headers={ 'Content-Type': 'application/json', }, data=json.dumps(data).encode())['data']['stream-access'][0] @@ -119,9 +121,59 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(Laola1TvEmbedIE): +class Laola1TvBaseIE(Laola1TvEmbedIE): + def _extract_video(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if 'Dieser Livestream ist bereits beendet.' in webpage: + raise ExtractorError('This live stream has already finished.', expected=True) + + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, + transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, + } + + +class Laola1TvIE(Laola1TvBaseIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -169,52 +221,30 @@ class Laola1TvIE(Laola1TvEmbedIE): }] def _real_extract(self, url): - display_id = self._match_id(url) + return self._extract_video(url) - webpage = self._download_webpage(url, display_id) - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) +class EHFTVIE(Laola1TvBaseIE): + IE_NAME = 'ehftv' + _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, js_to_json) + _TESTS = [{ + 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', + 'info_dict': { + 'id': '1166761', + 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', + 'ext': 'mp4', + 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', + 'is_live': False, + 'categories': ['Handball'], + }, + 'params': { + 'skip_download': True, + }, + }] - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } + def _real_extract(self, url): + return self._extract_video(url) class ITTFIE(InfoExtractor): diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py new file mode 100644 index 000000000..259fc4c5e --- /dev/null +++ b/youtube_dl/extractor/linkedin.py @@ -0,0 +1,175 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + urlencode_postdata, +) + + +class LinkedInLearningBaseIE(InfoExtractor): + _NETRC_MACHINE = 'linkedin' + + def _call_api(self, course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_video_id(self, urn, course_slug, video_slug): + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + return '%s/%s' % (course_slug, video_slug) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + 'https://www.linkedin.com/uas/login?trk=learning', + None, 'Downloading login page') + action_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url') + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + +class LinkedInLearningIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', + 'md5': 'a1d74422ff0d5e66a792deb996693167', + 'info_dict': { + 'id': '90426', + 'ext': 'mp4', + 'title': 'Welcome', + 'timestamp': 1430396150.82, + 'upload_date': '20150430', + }, + } + + def _real_extract(self, url): + course_slug, video_slug = re.match(self._VALID_URL, url).groups() + + video_data = None + formats = [] + for width, height in ((640, 360), (960, 540), (1280, 720)): + video_data = self._call_api( + course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] + + video_url_data = video_data.get('url') or {} + progressive_url = video_url_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'format_id': 'progressive-%dp' % height, + 'url': progressive_url, + 'height': height, + 'width': width, + 'source_preference': 1, + }) + + title = video_data['title'] + + audio_url = video_data.get('audio', {}).get('progressiveUrl') + if audio_url: + formats.append({ + 'abr': 64, + 'ext': 'm4a', + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + streaming_url = video_url_data.get('streamingUrl') + if streaming_url: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_slug, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) + + return { + 'id': self._get_video_id(video_data.get('urn'), course_slug, video_slug), + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('defaultThumbnail'), + 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), + 'duration': int_or_none(video_data.get('durationInSeconds')), + } + + +class LinkedInLearningCourseIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning:course' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', + 'info_dict': { + 'id': 'programming-foundations-fundamentals', + 'title': 'Programming Foundations: Fundamentals', + 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', + }, + 'playlist_mincount': 61, + } + + @classmethod + def suitable(cls, url): + return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_slug = self._match_id(url) + course_data = self._call_api(course_slug, 'chapters,description,title') + + entries = [] + for chapter in course_data.get('chapters', []): + chapter_title = chapter.get('title') + for video in chapter.get('videos', []): + video_slug = video.get('slug') + if not video_slug: + continue + entries.append({ + '_type': 'url_transparent', + 'id': self._get_video_id(video.get('urn'), course_slug, video_slug), + 'title': video.get('title'), + 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'chapter': chapter_title, + 'ie_key': LinkedInLearningIE.ie_key(), + }) + + return self.playlist_result( + entries, course_slug, + course_data.get('title'), + course_data.get('description')) diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index febef097a..025c5d249 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -31,6 +31,8 @@ class NJPWWorldIE(InfoExtractor): 'skip': 'Requires login', } + _LOGIN_URL = 'https://front.njpwworld.com/auth/login' + def _real_initialize(self): self._login() @@ -40,13 +42,17 @@ class NJPWWorldIE(InfoExtractor): if not username: return True + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://njpwworld.com/', None, note='Setting up session') + webpage, urlh = self._download_webpage_handle( - 'https://njpwworld.com/auth/login', None, + self._LOGIN_URL, None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://njpwworld.com/auth'}) + headers={'Referer': 'https://front.njpwworld.com/auth'}) # /auth/login will return 302 for successful logins - if urlh.geturl() == 'https://njpwworld.com/auth/login': + if urlh.geturl() == self._LOGIN_URL: self.report_warning('unable to login') return False diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index c652603a5..2473536fd 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -317,6 +317,12 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.cc/embed/5NEAbI2BDSk', 'only_matching': True, + }, { + 'url': 'https://oload.icu/f/-_i4y_F_Hs8', + 'only_matching': True, + }, { + 'url': 'https://oload.fun/f/gb6G1H4sHXY', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index c1fb580ca..d432e3449 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -15,6 +15,7 @@ from ..utils import ( strip_jsonp, unescapeHTML, unified_strdate, + url_or_none, ) @@ -68,26 +69,35 @@ class ORFTVthekIE(InfoExtractor): webpage, 'playlist', group='json'), playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - def quality_to_int(s): - m = re.search('([0-9]+)', s) - if m is None: - return -1 - return int(m.group(1)) - entries = [] for sd in data_jsb: video_id, title = sd.get('id'), sd.get('title') if not video_id or not title: continue video_id = compat_str(video_id) - formats = [{ - 'preference': -10 if fd['delivery'] == 'hls' else None, - 'format_id': '%s-%s-%s' % ( - fd['delivery'], fd['quality'], fd['quality_string']), - 'url': fd['src'], - 'protocol': fd['protocol'], - 'quality': quality_to_int(fd['quality']), - } for fd in sd['sources']] + formats = [] + for fd in sd['sources']: + src = url_or_none(fd.get('src')) + if not src: + continue + format_id_list = [] + for key in ('delivery', 'quality', 'quality_string'): + value = fd.get(key) + if value: + format_id_list.append(value) + format_id = '-'.join(format_id_list) + if determine_ext(fd['src']) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + fd['src'], video_id, 'mp4', m3u8_id=format_id)) + elif determine_ext(fd['src']) == 'f4m': + formats.extend(self._extract_f4m_formats( + fd['src'], video_id, f4m_id=format_id)) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) # Check for geoblocking. # There is a property is_geoprotection, but that's always false diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 2366dfb34..8099ef1d6 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re import time from .common import InfoExtractor @@ -15,7 +16,7 @@ from ..utils import ( class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -33,20 +34,14 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - channel_id = self._match_id(url) - stream_page = self._download_webpage(url, channel_id) + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') - if '>This channel does not exist' in stream_page: - raise ExtractorError( - 'Channel %s does not exist' % channel_id, expected=True) + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) - player = self._parse_json( - self._search_regex( - r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, - 'player settings'), - channel_id, transform_source=js_to_json) - - if player.get('online') is False: + if metadata.get('online') is False: raise ExtractorError('Stream is offline', expected=True) cdn_data = self._download_json( @@ -54,20 +49,13 @@ class PicartoIE(InfoExtractor): data=urlencode_postdata({'loadbalancinginfo': channel_id}), note='Downloading load balancing info') - def get_event(key): - return try_get(player, lambda x: x['event'][key], compat_str) or '' - + token = mobj.group('token') or 'public' params = { - 'token': player.get('token') or '', - 'ticket': get_event('ticket'), 'con': int(time.time() * 1000), - 'type': get_event('ticket'), - 'scope': get_event('scope'), + 'token': token, } prefered_edge = cdn_data.get('preferedEdge') - default_tech = player.get('defaultTech') - formats = [] for edge in cdn_data['edges']: @@ -81,8 +69,6 @@ class PicartoIE(InfoExtractor): preference = 0 if edge_id == prefered_edge: preference += 1 - if tech_type == default_tech: - preference += 1 format_id = [] if edge_id: format_id.append(edge_id) @@ -109,7 +95,7 @@ class PicartoIE(InfoExtractor): continue self._sort_formats(formats) - mature = player.get('mature') + mature = metadata.get('adult') if mature is None: age_limit = None else: @@ -117,9 +103,11 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': self._live_title(metadata.get('title') or channel_id), 'is_live': True, - 'thumbnail': player.get('vodThumb'), + 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), + 'channel': channel_id, + 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 9fa8688f8..f530f0083 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -65,7 +65,8 @@ class RuutuIE(InfoExtractor): video_id = self._match_id(url) video_xml = self._download_xml( - 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id) + 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + query={'id': video_id}) formats = [] processed_urls = [] diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 62a6a8337..69a0d01f3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -90,6 +90,15 @@ class ScreencastIE(InfoExtractor): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) + if video_url is None: + video_url = self._html_search_regex( + r'MediaContentUrl["\']\s*:(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video url', default=None, group='url') + + if video_url is None: + video_url = self._html_search_meta( + 'og:video', webpage, default=None) + if video_url is None: raise ExtractorError('Cannot find video') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index b2250afdd..931a0f70e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -5,6 +5,7 @@ from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, + url_or_none, urlencode_postdata, ) @@ -86,9 +87,16 @@ class VivoIE(SharedBaseIE): } def _extract_video_url(self, webpage, video_id, *args): + def decode_url(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = url_or_none(decode_url(self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url'))) + if stream_url: + return stream_url return self._parse_json( self._search_regex( r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'stream', group='url'), - video_id, - transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0] + video_id, transform_source=decode_url)[0] diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 54497c880..b9017fd2a 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -8,20 +8,24 @@ from ..utils import ( determine_ext, int_or_none, js_to_json, + merge_dicts, ) -class SportBoxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' +class SportBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { - 'id': '211355', + 'id': '109158', 'ext': 'mp4', - 'title': '211355', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 292, 'view_count': int, + 'timestamp': 1426237001, + 'upload_date': '20150313', }, 'params': { # m3u8 download @@ -33,12 +37,18 @@ class SportBoxEmbedIE(InfoExtractor): }, { 'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/109158', + 'only_matching': True, + }, { + 'url': 'https://matchtv.ru/vdl/player/media/109158', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', webpage) def _real_extract(self, url): @@ -46,13 +56,14 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - wjplayer_data = self._parse_json( + sources = self._parse_json( self._search_regex( - r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'), + r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n', + webpage, 'sources'), video_id, transform_source=js_to_json) formats = [] - for source in wjplayer_data['sources']: + for source in sources: src = source.get('src') if not src: continue @@ -66,14 +77,23 @@ class SportBoxEmbedIE(InfoExtractor): }) self._sort_formats(formats) + player = self._parse_json( + self._search_regex( + r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage, + 'player options', default='{}'), + video_id, transform_source=js_to_json) + media_id = player['mediaId'] + + info = self._search_json_ld(webpage, media_id, default={}) + view_count = int_or_none(self._search_regex( r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) - return { - 'id': video_id, - 'title': video_id, - 'thumbnail': wjplayer_data.get('poster'), - 'duration': int_or_none(wjplayer_data.get('duration')), + return merge_dicts(info, { + 'id': media_id, + 'title': self._og_search_title(webpage, default=None) or media_id, + 'thumbnail': player.get('poster'), + 'duration': int_or_none(player.get('duration')), 'view_count': view_count, 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index ffef5bf06..181620615 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -39,9 +39,17 @@ class ThePlatformBaseIE(OnceIE): smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') - if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD): - raise ExtractorError(error_element.attrib['abstract'], expected=True) + if error_element is not None: + exception = find_xpath_attr( + error_element, _x('.//smil:param'), 'name', 'exception') + if exception is not None: + if exception.get('value') == 'GeoLocationBlocked': + self.raise_geo_restricted(error_element.attrib['abstract']) + elif error_element.attrib['src'].startswith( + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' + % self._TP_TLD): + raise ExtractorError( + error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py new file mode 100644 index 000000000..05f8aa9ce --- /dev/null +++ b/youtube_dl/extractor/twitcasting.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + + +class TwitCastingIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' + _TEST = { + 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', + 'md5': '745243cad58c4681dc752490f7540d7f', + 'info_dict': { + 'id': '2357609', + 'ext': 'mp4', + 'title': 'Recorded Live #2357609', + 'uploader_id': 'ivetesangalo', + 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + uploader_id = mobj.group('uploader_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + + m3u8_url = self._search_regex( + (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), + webpage, 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader_id': uploader_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index d5d5b4c69..6e318479c 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -130,16 +130,16 @@ class ViewsterIE(InfoExtractor): def concat(suffix, sep='-'): return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video' % entry_id, - video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ - 'mediaType': media_type, - 'language': audio, - 'subtitle': subtitle, - }) - if not media: - continue + medias = self._download_json( + 'https://public-api.viewster.com/movies/%s/videos' % entry_id, + video_id, fatal=False, query={ + 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], + 'language': audio, + 'subtitle': subtitle, + }) + if not medias: + continue + for media in medias: video_url = media.get('Uri') if not video_url: continue diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 78203ef84..3f49f3889 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,6 +41,7 @@ from ..utils import ( remove_quotes, remove_start, smuggle_url, + str_or_none, str_to_int, try_get, unescapeHTML, @@ -501,6 +502,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -583,6 +585,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -1189,7 +1192,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('), + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1538,6 +1542,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + player_response = {} + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -1580,6 +1586,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True sts = ytplayer_config.get('sts') + if not player_response: + pl_response = str_or_none(args.get('player_response')) + if pl_response: + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1608,6 +1620,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + if isinstance(pl_response, dict): + player_response = pl_response add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) @@ -1653,9 +1669,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} + # title if 'title' in video_info: video_title = video_info['title'][0] + elif 'title' in player_response: + video_title = video_details['title'] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' @@ -1718,6 +1739,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if view_count is None: view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1898,7 +1921,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) if video_uploader: video_uploader = compat_urllib_parse_unquote_plus(video_uploader) else: @@ -2011,12 +2036,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): like_count = _extract_count('like') dislike_count = _extract_count('dislike') + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) if not video_duration: video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) @@ -2131,7 +2163,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ + (?: + youtube\.com| + invidio\.us + ) + / (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= @@ -2244,6 +2280,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -2282,6 +2319,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index bbe0aecb6..896276301 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -22,7 +22,7 @@ class ZattooPlatformBaseIE(InfoExtractor): _power_guide_hash = None def _host_url(self): - return 'https://%s' % self._HOST + return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) def _login(self): username, password = self._get_login_info() @@ -286,6 +286,7 @@ class ZattooLiveIE(ZattooBaseIE): class NetPlusIE(ZattooIE): _NETRC_MACHINE = 'netplus' _HOST = 'netplus.tv' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -300,7 +301,7 @@ class MNetTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.tvplus.m-net.de/watch/abc/123-abc', + 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', 'only_matching': True, }] @@ -311,7 +312,7 @@ class WalyTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.player.waly.tv/watch/abc/123-abc', + 'url': 'https://player.waly.tv/watch/abc/123-abc', 'only_matching': True, }] @@ -319,6 +320,7 @@ class WalyTVIE(ZattooIE): class BBVTVIE(ZattooIE): _NETRC_MACHINE = 'bbvtv' _HOST = 'bbv-tv.net' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -330,6 +332,7 @@ class BBVTVIE(ZattooIE): class VTXTVIE(ZattooIE): _NETRC_MACHINE = 'vtxtv' _HOST = 'vtxtv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -341,6 +344,7 @@ class VTXTVIE(ZattooIE): class MyVisionTVIE(ZattooIE): _NETRC_MACHINE = 'myvisiontv' _HOST = 'myvisiontv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -355,7 +359,7 @@ class GlattvisionTVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.iptv.glattvision.ch/watch/abc/123-abc', + 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', 'only_matching': True, }] @@ -363,6 +367,7 @@ class GlattvisionTVIE(ZattooIE): class SAKTVIE(ZattooIE): _NETRC_MACHINE = 'saktv' _HOST = 'saktv.ch' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -377,7 +382,7 @@ class EWETVIE(ZattooIE): _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.tvonline.ewe.de/watch/abc/123-abc', + 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', 'only_matching': True, }] @@ -385,6 +390,7 @@ class EWETVIE(ZattooIE): class QuantumTVIE(ZattooIE): _NETRC_MACHINE = 'quantumtv' _HOST = 'quantum-tv.com' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ @@ -395,11 +401,11 @@ class QuantumTVIE(ZattooIE): class OsnatelTVIE(ZattooIE): _NETRC_MACHINE = 'osnateltv' - _HOST = 'onlinetv.osnatel.de' + _HOST = 'tvonline.osnatel.de' _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ - 'url': 'https://www.onlinetv.osnatel.de/watch/abc/123-abc', + 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', 'only_matching': True, }] @@ -407,6 +413,7 @@ class OsnatelTVIE(ZattooIE): class EinsUndEinsTVIE(ZattooIE): _NETRC_MACHINE = '1und1tv' _HOST = '1und1.tv' + _API_HOST = 'www.%s' % _HOST _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) _TESTS = [{ diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py new file mode 100644 index 000000000..3b16e703b --- /dev/null +++ b/youtube_dl/extractor/zype.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ZypeIE(InfoExtractor): + _VALID_URL = r'https?://player\.zype\.com/embed/(?P<id>[\da-fA-F]+)\.js\?.*?api_key=[^&]+' + _TEST = { + 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', + 'md5': 'eaee31d474c76a955bdaba02a505c595', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + } + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r'video_title\s*[:=]\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'title', group='value') + + m3u8_url = self._search_regex( + r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage, + 'm3u8 url', group='url') + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'poster\s*[:=]\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'thumbnail', + default=False, group='url') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7d3f25019..7f32ad36c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.10.05' +__version__ = '2018.11.07'