diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c4d4e534e..1cfb54bfd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.26 +[debug] youtube-dl version 2018.06.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index fbf7cecb2..f064a0d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ youtube-dl.zsh *.iml tmp/ +venv/ diff --git a/ChangeLog b/ChangeLog index 280390ea0..062000594 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,85 @@ +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + +version 2018.06.11 + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + +version 2018.06.04 + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + +version 2018.06.02 + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + +version 2018.05.30 + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + version 2018.05.26 Core diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py index e6c3abc8d..61487f925 100755 --- a/devscripts/gh-pages/update-copyright.py +++ b/devscripts/gh-pages/update-copyright.py @@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() - newc = re.sub(r'(?PCopyright © 2006-)(?P[0-9]{4})', 'Copyright © 2006-' + year, content) + newc = re.sub(r'(?PCopyright © 2011-)(?P[0-9]{4})', 'Copyright © 2011-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b60f2ff23..705279ac1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -128,6 +128,8 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** + - **CamTube** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr @@ -552,9 +554,6 @@ - **nfl.com** - **NhkVod** - **nhl.com** - - **nhl.com:news**: NHL news - - **nhl.com:videocenter** - - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** - **nickelodeon:br** @@ -792,6 +791,7 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** + - **sport.francetvinfo.fr** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** @@ -893,6 +893,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - **TVNowList** diff --git a/setup.cfg b/setup.cfg index 5208f7ae2..af9a554c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv ignore = E402,E501,E731,E741 diff --git a/test/test_utils.py b/test/test_utils.py index f2b51131c..e63af0166 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -361,6 +361,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') + self.assertEqual(determine_ext('foobar', None), None) def test_find_xpath_attr(self): testxml = ''' diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a22875f69..5b1e96013 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -217,10 +217,11 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - if ctx.tmpfilename != '-': + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: ctx.stream.close() ctx.stream = None - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 512f04684..4ac323bf6 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -105,22 +105,22 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZY9247A021S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Gaston's Visit", + 'title': "Gaston's Birthday", 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:18db170ad71cf161e006a4c688e33155', - 'upload_date': '20180318', + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', 'uploader_id': 'abc4kids', - 'timestamp': 1521400959, + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_params = self._parse_json(self._search_regex( - r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) - title = video_params.get('title') or video_params['seriesTitle'] - stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - house_number = video_params.get('episodeHouseNumber') - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( int(time.time()), house_number) sig = hmac.new( - 'android.content.res.Resources'.encode('utf-8'), + b'android.content.res.Resources', path.encode('utf-8'), hashlib.sha256).hexdigest() token = self._download_webpage( 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) @@ -169,18 +168,26 @@ class ABCIViewIE(InfoExtractor): 'ext': 'vtt', }] + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + return { 'id': video_id, - 'title': unescapeHTML(title), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), - 'episode': self._html_search_meta('episode_title', webpage, default=None), + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 041c61aff..1eb99c39a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import binascii import json import os +import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -12,9 +15,12 @@ from ..compat import ( ) from ..utils import ( bytes_to_intlist, + bytes_to_long, ExtractorError, float_or_none, intlist_to_bytes, + long_to_bytes, + pkcs1pad, srt_subtitles_timecode, strip_or_none, urljoin, @@ -35,6 +41,7 @@ class ADNIE(InfoExtractor): } } _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -42,16 +49,14 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False, headers={ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - }) + video_id, fatal=False) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -112,11 +117,24 @@ class ADNIE(InfoExtractor): error = None if not links: links_url = player_config.get('linksurl') or options['videoUrl'] - links_data = self._download_json(urljoin( - self._BASE_URL, links_url), video_id) + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, headers={ + 'Authorization': 'Bearer ' + authorization, + }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') + sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index aa6925623..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -5,13 +5,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, ) class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { @@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor): 'duration': 74022, 'view_count': int, } - } - # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) - _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex([ - r'class="amtv-embed"[^>]+id="([^"]+)"', - r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', ], webpage, 'raw payload') - _, stage_mode, video_id, lang = raw_payload.split('-') + _, stage_mode, video_id, _ = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): - request = sanitized_Request( - 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), - headers={'X-Auth-Token': self._AUTH_TOKEN}) - json_data = self._download_json(request, video_id)['results'] + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] formats = [] - stream_url_hls = json_data.get('stream_url_hls') + stream_url_hls = video_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - stream_url_hds = json_data.get('stream_url_hds') + stream_url_hds = video_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) - for video_version in json_data.get('video_versions'): + for video_version in video_data.get('video_versions', []): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue @@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor): return { 'id': video_id, - 'title': json_data['title'], - 'description': json_data.get('subtitle'), - 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), - 'timestamp': parse_iso8601(json_data.get('publication_date')), - 'duration': int_or_none(json_data.get('duration')), - 'view_count': int_or_none(json_data.get('view_count')), + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), 'formats': formats, } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8b20c03d6..30a63a24e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..utils import ( float_or_none, get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, try_get, @@ -772,6 +773,17 @@ class BBCIE(BBCCoUkIE): # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } }] @classmethod @@ -994,6 +1006,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 000000000..c7d40f849 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ce8e3d346..43f95c739 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -17,6 +17,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + orderedSet, parse_duration, parse_iso8601, parse_age_limit, @@ -136,9 +137,15 @@ class CBCIE(InfoExtractor): entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r']+src="[^"]+?mediaId=(\d+)"', + r']+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) entries.extend([ self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) + for media_id in orderedSet(media_ids)]) return self.playlist_result( entries, display_id, strip_or_none(title), self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index e3eba4be9..e2b828d8a 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url, video_id, headers=self.geo_verification_headers()) m3u8_urls = [] diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index fc014f8b5..f4a616455 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -19,8 +19,8 @@ from ..utils import ( class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' - _TEST = { + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' + _TESTS = [{ # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { @@ -45,7 +45,10 @@ class CrackleIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index de27fffd4..9a74906cb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,12 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import hashlib import itertools +import json +import random +import re +import string from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, @@ -64,7 +68,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -167,6 +170,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + if metadata.get('error', {}).get('type') == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(compat_struct_pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) + self._check_error(metadata) formats = [] @@ -302,8 +316,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P\d+)|(?:[^/]+/)*videos/(?P[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 8e0374320..fe47f6dce 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -97,6 +97,75 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') @@ -113,72 +182,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' + host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f829c72c..d4583b8e4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -147,6 +147,7 @@ from .camdemy import ( CamdemyFolderIE ) from .cammodels import CamModelsIE +from .camtube import CamTubeIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE @@ -381,6 +382,7 @@ from .francetv import ( FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVInfoSportIE, FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, @@ -705,12 +707,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, @@ -1142,6 +1139,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0971ce356..8a9ed96c2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -56,6 +56,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -208,6 +209,17 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'uploader': 'ESL One Dota 2', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -312,16 +324,18 @@ class FacebookIE(InfoExtractor): if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: server_js_data = self._parse_json( self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - if server_js_data: - video_data = extract_video_data(try_get( - server_js_data, lambda x: x['jsmods']['instances'], - list) or []) + video_data = extract_from_jsmods_instances(server_js_data) if not video_data: if not fatal_if_no_video: @@ -333,8 +347,33 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - else: - raise ExtractorError('Cannot parse data') + + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') formats = [] for f in video_data: @@ -380,7 +419,8 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index c02cd03de..6fc6b0da0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) +class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): + IE_NAME = 'sport.francetvinfo.fr' + _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', + 'info_dict': { + 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', + 'ext': 'mp4', + 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', + 'timestamp': 1523639962, + 'upload_date': '20180413', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') + return self._make_url_result(video_id, 'Sport-web') + + class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 241ec83c4..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -21,6 +21,21 @@ class IncIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # div with id=kaltura_player_1_kqs38cgm + 'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html', + 'info_dict': { + 'id': '1_kqs38cgm', + 'ext': 'mp4', + 'title': 'Branson: "In the end, you have to say, Screw it. Just do it."', + 'description': 'md5:21b832d034f9af5191ca5959da5e9cb6', + 'timestamp': 1364403232, + 'upload_date': '20130327', + 'uploader_id': 'incdigital@inc.com', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'only_matching': True, @@ -31,10 +46,13 @@ class IncIE(InfoExtractor): webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( - r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, + 'partner id', default='1034971') - kaltura_id = self._parse_json(self._search_regex( - r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + kaltura_id = self._search_regex( + r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id', + default=None, group='id') or self._parse_json(self._search_regex( + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] return self.url_result( diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 675ff6873..b907f6b49 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,96 +1,90 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +from .nhl import NHLBaseIE -class MLBIE(InfoExtractor): +class MLBIE(NHLBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*mlb\.com/ + (?:[\da-z_-]+\.)*(?P<site>mlb)\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| + (?:[^/]+/)*c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) - (?P<id>n?\d+)| - (?:[^/]+/)*(?P<path>[^/]+) + (?P<id>\d+) ) ''' + _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { - 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, - 'timestamp': 1405980600, - 'upload_date': '20140721', + 'timestamp': 1405995000, + 'upload_date': '20140722', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', - 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', + 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663', + 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, - 'timestamp': 1405105800, + 'timestamp': 1405120200, 'upload_date': '20140711', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', - 'md5': '0e6e73d509321e142409b695eadd541f', + 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115', + 'md5': '99bb9176531adc600b90880fb8be9328', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, - 'timestamp': 1405399936, + 'timestamp': 1405414336, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', - 'md5': 'b8fd237347b844365d74ea61d4245967', + 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915', + 'md5': 'da8b57a12b060e7663ee1eebd6f330ec', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, - 'timestamp': 1405390722, + 'timestamp': 1405405122, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'aafaf5b0186fee8f32f20508092f8111', + 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', + 'md5': 'e09e37b552351fddbf4d9e699c924d68', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, + 'timestamp': 1429139220, 'upload_date': '20150415', } }, @@ -111,7 +105,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783', 'only_matching': True, }, { @@ -120,58 +114,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', 'only_matching': True, } ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if not video_id: - video_path = mobj.group('path') - webpage = self._download_webpage(url, video_path) - video_id = self._search_regex( - [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') - - detail = self._download_xml( - 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' - % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) - - title = detail.find('./headline').text - description = detail.find('./big-blurb').text - duration = parse_duration(detail.find('./duration').text) - timestamp = parse_iso8601(detail.attrib['date'][:-5]) - - thumbnails = [{ - 'url': thumbnail.text, - } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] - - formats = [] - for media_url in detail.findall('./url'): - playback_scenario = media_url.attrib['playback_scenario'] - fmt = { - 'url': media_url.text, - 'format_id': playback_scenario, - } - m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario) - if m: - fmt.update({ - 'vbr': int(m.group('vbr')) * 1000, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 5e46a75c0..82d526c22 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -29,14 +29,13 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '828cea195be04e66057b846288295ba1', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', 'info_dict': { 'id': '128907', 'ext': 'mp4', 'title': 'Stiftung Warentest', 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'release_year': 2013, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2509, @@ -62,6 +61,7 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { # does not work via arc 'url': 'nexx:741:1269984', @@ -71,12 +71,26 @@ class NexxIE(InfoExtractor): 'ext': 'mp4', 'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 607, 'timestamp': 1518614955, 'upload_date': '20180214', }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -141,6 +155,139 @@ class NexxIE(InfoExtractor): self._handle_error(result) return result['result'] + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' + + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False + + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') @@ -220,72 +367,15 @@ class NexxIE(InfoExtractor): general = video['general'] title = general['title'] - stream_data = video['streamdata'] - language = general.get('language_raw') or '' + cdn = video['streamdata']['cdnType'] - # TODO: reverse more cdns - - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False self._sort_formats(formats) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?P<tc_url>rtmp://[^/]+/(?P<app>[a-z0-9/]+))/(?P<play_path>mp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r'<iframe[^>]+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)</td>', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' + data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 875665d43..65754c5e7 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( parse_iso8601, float_or_none, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ff2153387..cb8319f0d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE): https?:// (?:www\.)? (?: - npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) @@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) @@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE): class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', @@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor): } } + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..7157e2390 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -16,12 +16,22 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _api_host = None + def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), - video_id, 'Downloading mediaelement JSON') + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id @@ -191,7 +201,7 @@ class NRKIE(NRKBaseIE): ) (?P<id>[^?#&]+) ''' - _API_HOST = 'v8-psapi.nrk.no' + _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -237,8 +247,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' - + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '4e9ca6629f09e588ed240fb11619922a', diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d0bdd60b8..d264fe206 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -301,6 +301,16 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'only_matching': True, + }, { + 'url': 'https://oload.win/f/kUEfGclsU9o', + 'only_matching': True, + }, { + 'url': 'https://oload.download/f/kUEfGclsU9o', + 'only_matching': True, + }, { + # Its title has not got its extension but url has it + 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -362,8 +372,7 @@ class OpenloadIE(InfoExtractor): 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, - # Seems all videos have extensions in their titles - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, 'http_headers': headers, } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a28ee17ca..8d6f2dd3d 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -360,6 +360,21 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -422,6 +437,7 @@ class PBSIE(InfoExtractor): r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ ] media_id = self._search_regex( diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..ae7413fb5 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -53,7 +53,8 @@ class RBMARadioIE(InfoExtractor): 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', - } for abr in (96, 128, 256)] + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 8a5d48fc2..30e2a38b4 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) + ) + ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, }] + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, video_id) - reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura reference id', group='id') - partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura widget id', group='id') - ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura uiconf id', group='id') + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') query = { 'wid': '_%s' % partner_id, @@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| techbus\.safaribooksonline\.com ) - /(?P<id>[^/]+)/?(?:[#?]|$) + /(?P<id>[^/]+) ''' _TESTS = [{ @@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'http://techbus.safaribooksonline.com/9780134426365', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + def _real_extract(self, url): course_id = self._match_id(url) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index edc31729d..784f8ed66 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) from ..utils import ( float_or_none, int_or_none, @@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE): def _real_extract(self, url): site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(self._search_regex( + drupal_settings = self._parse_json(self._search_regex( r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', - webpage, 'drupal setting'), display_id)['turner_playlist'][0] + webpage, 'drupal setting'), display_id) + video_data = drupal_settings['turner_playlist'][0] media_id = video_data['mediaID'] title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.vgtf.net/token/token_spe', - m3u8_url, media_id, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = float_or_none(chapter.get('start')) - duration = float_or_none(chapter.get('duration')) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - }) - self._sort_formats(formats) + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) thumbnails = [] for image_id, image in video_data.get('images', {}).items(): @@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE): }) thumbnails.append(i) - return { + info.update({ 'id': media_id, 'title': title, 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': duration, + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), 'timestamp': int_or_none(video_data.get('created')), 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), - 'cahpters': chapters, 'thumbnails': thumbnails, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 63fd4fe1c..73469cc5d 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( determine_ext, ExtractorError, @@ -15,7 +15,7 @@ from ..utils import ( ) -class TeamcocoIE(InfoExtractor): +class TeamcocoIE(TurnerBaseIE): _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' _TESTS = [ { @@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor): name } duration + turnerMediaId + turnerMediaAuthToken } } ... on NotFoundSlug { @@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} - - formats = [] - get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): - if not isinstance(src, dict): - continue - src_url = src.get('src') - if not src_url: - continue - ext = determine_ext(src_url, mimetype2ext(src.get('type'))) - if format_id == 'hls' or ext == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if src_url.startswith('/'): - src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - else: - if src_url.startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - tbr = int_or_none(self._search_regex( - r'(\d+)k\.mp4', src_url, 'tbr', default=None)) - - formats.append({ - 'url': src_url, - 'ext': ext, - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) - self._sort_formats(formats) - - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, 'title': record['title'], 'thumbnail': record.get('thumb', {}).get('preview'), 'description': record.get('teaser'), 'duration': parse_duration(record.get('duration')), 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index e73b64aeb..2b7b0d6e1 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -9,6 +9,7 @@ from ..utils import ( xpath_text, int_or_none, determine_ext, + float_or_none, parse_duration, xpath_attr, update_url_query, @@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: query = { 'path': secure_path, - 'videoId': content_id, } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id if ap_data.get('auth_required'): query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) auth = self._download_xml( @@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE): 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'is_live': is_live, } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index cfcce020a..51923e44a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_iso8601, - try_get, - determine_ext, ) @@ -78,42 +77,25 @@ class TV4IE(InfoExtractor): title = info['title'] - subtitles = {} - formats = [] - # http formats are linked with unresolvable host - for kind in ('hls3', ''): - data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, - video_id, 'Downloading sources JSON', query={ - 'protocol': kind, - 'videoFormat': 'MP4+WEBVTT', - }) - items = try_get(data, lambda x: x['playback']['items']['item']) - if not items: - continue - if isinstance(items, dict): - items = [items] - for item in items: - manifest_url = item.get('url') - if not isinstance(manifest_url, compat_str): - continue - ext = determine_ext(manifest_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_akamai_formats( - manifest_url, video_id, { - 'hls': 'tv4play-i.akamaihd.net', - })) - elif ext == 'webvtt': - subtitles = self._merge_subtitles( - subtitles, { - 'sv': [{ - 'url': manifest_url, - 'ext': 'vtt', - }]}) + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_ism_formats( + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): self.raise_geo_restricted(countries=self._GEO_COUNTRIES) @@ -124,7 +106,7 @@ class TV4IE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, - 'subtitles': subtitles, + # 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py new file mode 100644 index 000000000..2b2630b91 --- /dev/null +++ b/youtube_dl/extractor/tvnet.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', + 'info_dict': { + 'id': '129999', + 'ext': 'mp4', + 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'<title>([^<]+)<', webpage, 'title') + title = re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = stream.get('url') + if (stream_url in stream_urls or not stream_url or + not isinstance(stream_url, compat_str)): + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index d7e425041..de41065d6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', + 'thumbnail': r're:^https?://.*$', }, }, { @@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE): }, ] + _API_BASE = 'https://api.twitter.com/1.1' + def _parse_media_info(self, media_info, video_id): formats = [] for media_variant in media_info.get('variants', []): @@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE): main_script, 'bearer token') # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), video_id, 'Downloading API data', headers={ 'Authorization': 'Bearer ' + bearer_token, @@ -223,15 +225,49 @@ class TwitterCardIE(TwitterBaseIE): formats.extend(self._extract_mobile_formats(username, video_id)) if formats: + title = self._search_regex(r'<title>([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration break + if not formats: + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + '%s/guest/activate.json' % self._API_BASE, video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) + config = self._download_json( + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), + video_id, headers=headers) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - return { 'id': video_id, 'title': title, @@ -375,6 +411,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index c022fb33e..3dab9145b 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -36,7 +36,8 @@ class WimpIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youtube_id = self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 379559825..89c8b7f8d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -510,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -528,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -597,7 +601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', @@ -638,7 +642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'TJ Kirk', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', @@ -668,10 +672,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -733,7 +737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, @@ -760,7 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -769,8 +773,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -885,7 +890,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', @@ -893,7 +898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -950,7 +957,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'duration': 4060, 'upload_date': '20151119', - 'uploader': 'Bernie 2016', + 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', @@ -985,6 +992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # YouTube Red video with episode data @@ -993,7 +1001,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'description': 'md5:25b78d2f64ae81719f5c96319889b736', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1026,7 +1034,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', 'license': 'Standard YouTube License', - 'view_count': int, }, 'params': { 'skip_download': True, @@ -1694,128 +1701,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = m_episode.group('series') - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) - def _extract_filesize(media_url): return int_or_none(self._search_regex( r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) @@ -1990,6 +1875,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + # uploader + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + + m_music = re.search( + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + + chapters = self._extract_chapters(description_original, video_duration) + # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd_fatal = True @@ -2055,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': video_uploader_url, 'upload_date': upload_date, 'license': video_license, - 'creator': video_creator, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, + 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -2080,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': series, 'season_number': season_number, 'episode_number': episode_number, + 'track': track, + 'artist': artist, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 63f24c0b6..6a3199fb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1228,7 +1228,7 @@ def unified_timestamp(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2253da927..1533dceb4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.26' +__version__ = '2018.06.14'