diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 423a08e4d..71a500f04 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.30.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.30.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.30.1 +[debug] youtube-dl version 2019.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.travis.yml b/.travis.yml index 79287ccf6..82e81d078 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ python: - "3.6" - "pypy" - "pypy3" -sudo: false env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download diff --git a/ChangeLog b/ChangeLog index 4872cd9fc..018a30641 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,72 @@ +version 2019.03.01 + +Core ++ [downloader/external] Add support for rate limit and retries for wget +* [downloader/external] Fix infinite retries for curl (#19303) + +Extractors +* [npo] Fix extraction (#20084) +* [francetv:site] Extend video id regex (#20029, #20071) ++ [periscope] Extract width and height (#20015) +* [servus] Fix extraction (#19297) +* [bbccouk] Make subtitles non fatal (#19651) +* [metacafe] Fix family filter bypass (#19287) + + +version 2019.02.18 + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ [twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) 
++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + +version 2019.02.08 + +Core +* [utils] Improve JSON-LD regular expression (#18058) +* [YoutubeDL] Fallback to ie_key of matching extractor while making + download archive id when no explicit ie_key is provided (#19022) + +Extractors ++ [malltv] Add support for mall.tv (#18058, #17856) ++ [spankbang:playlist] Add support for playlists (#19145) +* [spankbang] Extend URL regular expression +* [trutv] Fix extraction (#17336) +* [toutv] Fix authentication (#16398, #18700) +* [pornhub] Fix tags and categories extraction (#13720, #19135) +* [pornhd] Fix formats extraction ++ [pornhd] Extract like count (#19123, #19125) +* [radiocanada] Switch to the new media requests (#19115) ++ [teachable] Add support for courses.workitdaily.com (#18871) +- [vporn] Remove extractor (#16276) ++ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) ++ [drtuber] Extract duration (#19078) +* [soundcloud] Fix paged playlists extraction, add support for albums and update client id +* [soundcloud] Update client id +* [drtv] Improve preference (#19079) ++ [openload] Add support for openload.pw and oload.pw (#18930) ++ [openload] Add support for oload.info (#19073) +* [crackle] Authorize media detail request (#16931) + + version 2019.01.30.1 Core diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2918520c3..d8a8d7ede 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -458,6 +458,7 @@ - **LineTV** - **linkedin:learning** - **linkedin:learning:course** + - **LinuxAcademy** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -476,6 +477,7 @@ - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - **MakerTV** + - **MallTV** - **mangomolo:live** - **mangomolo:video** - **ManyVids** @@ -827,6 +829,7 @@ - **southpark.nl** - **southparkstudios.dk** - **SpankBang** + - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de @@ -913,6 +916,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **TruNews** - **TruTV** - **Tube8** - **TubiTv** @@ -1057,7 +1061,6 @@ - **Voot** - **VoxMedia** - **VoxMediaVolume** - - **Vporn** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 75fa0bbb7..f0aa8466b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -61,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -69,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') + self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph') self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 6a8243590..f959798de 100644 --- a/test/test_YoutubeDLCookieJar.py +++ 
b/test/test_YoutubeDLCookieJar.py @@ -29,6 +29,16 @@ class TestYoutubeDLCookieJar(unittest.TestCase): tf.close() os.remove(tf.name) + def test_strip_httponly_prefix(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + + def assert_cookie_has_value(key): + self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') + + assert_cookie_has_value('HTTPONLY_COOKIE') + assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/cookies/httponly_cookies.txt b/test/testdata/cookies/httponly_cookies.txt new file mode 100644 index 000000000..c46541d6b --- /dev/null +++ b/test/testdata/cookies/httponly_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE +www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 80ed8d7e5..bc9fc270c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,6 +82,7 @@ from .utils import ( sanitize_url, sanitized_Request, std_headers, + str_or_none, subtitles_filename, UnavailableVideoError, url_basename, @@ -2060,15 +2061,24 @@ class YoutubeDL(object): self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist - if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(url): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2076,7 +2086,7 @@ class YoutubeDL(object): return False vid_id = self._make_archive_id(info_dict) - if vid_id is None: + if not vid_id: return False # Incomplete video information try: diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 958d00aac..22e6093b3 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -121,7 +121,11 @@ class CurlFD(ExternalFD): cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') cmd += self._option('--limit-rate', 'ratelimit') - cmd += self._option('--retry', 'retries') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') @@ -160,6 +164,12 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', 
'%s: %s' % (key, val)] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry cmd += self._option('--bind-address', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index eac9a5a46..13340ec64 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re +import xml from .common import InfoExtractor from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( parse_iso8601, try_get, unescapeHTML, + url_or_none, urlencode_postdata, urljoin, ) @@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor): def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, xml.etree.ElementTree.Element): + continue lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') subtitles[lang] = [ { diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4d6b051fe..3746671d3 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor): }] }] - _APP_KEY = '84956560bc028eb7' - _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' def _report_error(self, result): if 'message' in result: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c4ea2882f..c3b0586a0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1058,7 +1058,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 5e2cbe41d..fd1e7afad 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -56,22 +56,11 @@ class CrunchyrollBaseIE(InfoExtractor): if username is None: return - self._download_webpage( - 'https://www.crunchyroll.com/?a=formhandler', - None, 'Logging in', 'Wrong login info', - data=urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'next_url': 'https://www.crunchyroll.com/acct/membership', - 'name': username, - 'password': password, - })) - - ''' login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') def is_logged(webpage): - return 'Redirecting' in webpage + return 'href="/logout"' in webpage # Already logged in if is_logged(login_page): @@ -110,7 +99,6 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - ''' def _real_initialize(self): 
self._login() diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 5c41c8022..2baea585b 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -4,7 +4,9 @@ import re from .common import InfoExtractor from ..utils import ( + int_or_none, NO_DEFAULT, + parse_duration, str_to_int, ) @@ -65,6 +67,9 @@ class DrTuberIE(InfoExtractor): }) self._sort_formats(formats) + duration = int_or_none(video_data.get('duration')) or parse_duration( + video_data.get('duration_format')) + title = self._html_search_regex( (r'<h1[^>]+class=["\']title[^>]+>([^<]+)', r'<title>([^<]+)\s*@\s+DrTuber', @@ -103,4 +108,5 @@ class DrTuberIE(InfoExtractor): 'comment_count': comment_count, 'categories': categories, 'age_limit': self._rta_search(webpage), + 'duration': duration, } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index c5f211128..0c7e350f0 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -171,10 +171,13 @@ class DRTVIE(InfoExtractor): continue target = link.get('Target') format_id = target or '' - preference = None - if asset_target in ('SpokenSubtitles', 'SignLanguage'): + if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): preference = -1 format_id += '-%s' % asset_target + elif asset_target == 'Default': + preference = 1 + else: + preference = None if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 127c69b2e..8cc9bd165 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -29,7 +29,8 @@ class ESPNIE(OnceIE): (?: .*?\?.*?\bid=| /_/id/ - ) + )| + [^/]+/video/ ) )| (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ @@ -94,6 +95,9 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', 'only_matching': True, + }, { + 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b40be42e6..923dfe7f4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -593,6 +593,7 @@ from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, ) +from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE from .liveleak import ( LiveLeakIE, @@ -619,6 +620,7 @@ from .mailru import ( MailRuMusicSearchIE, ) from .makertv import MakerTVIE +from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, @@ -1058,7 +1060,10 @@ from .southpark import ( SouthParkEsIE, SouthParkNlIE ) -from .spankbang import SpankBangIE +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE @@ -1167,6 +1172,7 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE from .tubitv import TubiTvIE @@ -1212,7 +1218,7 @@ from .tvnow import ( from .tvp import ( TVPEmbedIE, TVPIE, - TVPSeriesIE, + TVPWebsiteIE, ) from .tvplay import ( TVPlayIE, @@ -1362,7 +1368,6 @@ from .voxmedia import ( 
VoxMediaVolumeIE, VoxMediaIE, ) -from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE from .vrv import ( diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 2ffe83a78..3c4ef08a8 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): catalogue = None video_id = self._search_regex( - r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + r'(?:data-main-video\s*=|videoId\s*:)\s*(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', default=None, group='id') if not video_id: diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 0eb54db3f..a5ba03efa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -27,6 +27,10 @@ class ImgurIE(InfoExtractor): }, { 'url': 'https://i.imgur.com/crGpqCV.mp4', 'only_matching': True, + }, { + # no title + 'url': 'https://i.imgur.com/jxBXAMC.gifv', + 'only_matching': True, }] def _real_extract(self, url): @@ -87,7 +91,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': self._og_search_title(webpage), + 'title': self._og_search_title(webpage, default=video_id), } diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index f7311f483..2cf444258 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_duration, + strip_or_none, unified_strdate, ) @@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor): 'id': '6385796', 'ext': 'mp3', 'title': "Champion Minded - Developing a Growth Mindset", - 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + # description fetched using another request: + # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 + # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, @@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor): }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - url = m.group('mainurl') + url, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - podcast_title = self._search_regex( - r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None) - if podcast_title: - podcast_title = podcast_title.strip() - episode_title = self._search_regex( - r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title') - if episode_title: - episode_title = episode_title.strip() + data = self._parse_json(self._search_regex( + r'var\s+playlistItem\s*=\s*({.+?});', + webpage, 'JSON data block'), video_id) + + episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage) + if not episode_title: + self._search_regex( + [r'data-title="([^"]+)"', r'<title>(.+?)'], + webpage, 'episode title') + episode_title = episode_title.strip() + + podcast_title = strip_or_none(clean_html(self._search_regex( + r'
<h3>([^<]+)</h3>', webpage, 'podcast title',
+            default=None) or get_element_by_class('podcast-title', webpage)))
         title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
+        formats = []
+        for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
+            f_url = data.get(k)
+            if not f_url:
+                continue
+            formats.append({
+                'url': f_url,
+                'format_id': format_id,
+            })
+
         description = self._html_search_regex(
             r'<p\s+id="info_text_body">(.+?)</p>', webpage,
             'description', default=None)
@@ -61,27 +79,15 @@
             # Strip non-breaking and normal spaces
             description = description.replace('\u00A0', ' ').strip()
         release_date = unified_strdate(self._search_regex(
-            r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
-
-        data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
-        data = json.loads(data_json)
-
-        formats = [{
-            'url': data['media_url'],
-            'format_id': 'main',
-        }, {
-            'url': data['media_url_libsyn'],
-            'format_id': 'libsyn',
-        }]
-        thumbnail = data.get('thumbnail_url')
-        duration = parse_duration(data.get('duration'))
+            r'<div class="release_date">
Released: ([^<]+)<', + webpage, 'release date', default=None) or data.get('release_date')) return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': data.get('thumbnail_url'), 'upload_date': release_date, - 'duration': duration, + 'duration': parse_duration(data.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py index 259fc4c5e..5a86b0064 100644 --- a/youtube_dl/extractor/linkedin.py +++ b/youtube_dl/extractor/linkedin.py @@ -34,12 +34,15 @@ class LinkedInLearningBaseIE(InfoExtractor): 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, }, query=query)['elements'][0] - def _get_video_id(self, urn, course_slug, video_slug): + def _get_urn_id(self, video_data): + urn = video_data.get('urn') if urn: mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) if mobj: return mobj.group(1) - return '%s/%s' % (course_slug, video_slug) + + def _get_video_id(self, video_data, course_slug, video_slug): + return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) def _real_initialize(self): email, password = self._get_login_info() @@ -123,7 +126,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) return { - 'id': self._get_video_id(video_data.get('urn'), course_slug, video_slug), + 'id': self._get_video_id(video_data, course_slug, video_slug), 'title': title, 'formats': formats, 'thumbnail': video_data.get('defaultThumbnail'), @@ -154,18 +157,21 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE): course_data = self._call_api(course_slug, 'chapters,description,title') entries = [] - for chapter in course_data.get('chapters', []): + for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): chapter_title = chapter.get('title') + chapter_id = self._get_urn_id(chapter) for video in chapter.get('videos', []): video_slug = video.get('slug') if not video_slug: continue entries.append({ '_type': 'url_transparent', - 'id': self._get_video_id(video.get('urn'), course_slug, video_slug), + 'id': self._get_video_id(video, course_slug, video_slug), 'title': video.get('title'), 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, 'ie_key': LinkedInLearningIE.ie_key(), }) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py new file mode 100644 index 000000000..a78c6556e --- /dev/null +++ b/youtube_dl/extractor/linuxacademy.py @@ -0,0 +1,174 @@ +from __future__ import unicode_literals + +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, + urlencode_postdata, + urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?linuxacademy\.com/cp/ + (?: + courses/lesson/course/(?P\d+)/lesson/(?P\d+)| + modules/view/id/(?P\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'info_dict': { + 'id': '1498-2', + 'ext': 'mp4', + 'title': "Introduction to the Practitioner's Brief", + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 
'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', + 'only_matching': True, + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/154', + 'info_dict': { + 'id': '154', + 'title': 'AWS Certified Cloud Practitioner', + 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + }, + 'playlist_count': 41, + 'skip': 'Requires Linux Academy account credentials', + }] + + _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' + _ORIGIN_URL = 'https://linuxacademy.com' + _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' + _NETRC_MACHINE = 'linuxacademy' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def random_string(): + return ''.join([ + random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') + for _ in range(32)]) + + webpage, urlh = self._download_webpage_handle( + self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ + 'client_id': self._CLIENT_ID, + 'response_type': 'token id_token', + 'redirect_uri': self._ORIGIN_URL, + 'scope': 'openid email user_impersonation profile', + 'audience': self._ORIGIN_URL, + 'state': random_string(), + 'nonce': random_string(), + }) + + login_data = self._parse_json( + self._search_regex( + r'atob\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'login info', group='value'), None, + transform_source=lambda x: compat_b64decode(x).decode('utf-8') + )['extraParams'] + + login_data.update({ + 'client_id': self._CLIENT_ID, + 'redirect_uri': self._ORIGIN_URL, + 'tenant': 'lacausers', + 'connection': 'Username-Password-Authentication', + 'username': username, + 'password': password, + 'sso': 'true', + }) + + login_state_url = compat_str(urlh.geturl()) + + try: + login_page = self._download_webpage( + 'https://login.linuxacademy.com/usernamepassword/login', None, + 'Downloading login page', data=json.dumps(login_data).encode(), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read(), None) + message = error.get('description') or error['code'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + raise + + callback_page, urlh = self._download_webpage_handle( + 'https://login.linuxacademy.com/login/callback', None, + 'Downloading callback page', + data=urlencode_postdata(self._hidden_inputs(login_page)), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + + access_token = self._search_regex( + r'access_token=([^=&]+)', compat_str(urlh.geturl()), + 'access token') + + self._download_webpage( + 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' + % access_token, None, 'Downloading token validation page') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') + item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) + + webpage = self._download_webpage(url, item_id) + + # course path + if course_id: + entries = [ + self.url_result( + urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) + for lesson_url in orderedSet(re.findall( + r']+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', + webpage))] + title = 
unescapeHTML(self._html_search_regex( + (r'class=["\']course-title["\'][^>]*>(?P[^<]+)', + r'var\s+title\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', default=None, group='value')) + description = unescapeHTML(self._html_search_regex( + r'var\s+description\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'description', default=None, group='value')) + return self.playlist_result(entries, course_id, title, description) + + # single video path + info = self._extract_jwplayer_data( + webpage, item_id, require_title=False, m3u8_id='hls',) + title = self._search_regex( + (r'>Lecture\s*:\s*(?P[^<]+)', + r'lessonName\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + info.update({ + 'id': item_id, + 'title': title, + }) + return info diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py new file mode 100644 index 000000000..e13c2e11a --- /dev/null +++ b/youtube_dl/extractor/malltv.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import merge_dicts + + +class MallTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'md5': '1c4a37f080e1f3023103a7b43458e518', + 'info_dict': { + 'id': 't0zzt0', + 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'ext': 'mp4', + 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', + 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'duration': 216, + 'timestamp': 1538870400, + 'upload_date': '20181007', + 'view_count': int, + } + }, { + 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + url, display_id, headers=self.geo_verification_headers()) + + SOURCE_RE = r'(]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P[\da-z]+)/index)\b' + video_id = self._search_regex( + SOURCE_RE, webpage, 'video id', group='id') + + media = self._parse_html5_media_entries( + url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, + m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts(media, info, { + 'id': video_id, + 'display_id': display_id, + 'title': self._og_search_title(webpage, default=None) or display_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + }) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 28f59f63c..9e92416d1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_urllib_parse, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, @@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor): headers = { # Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False}) + 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) } # AnyClip videos require the flashversion cookie so that we 
get the link diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index 974de3c3e..b40770d07 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bc_url = BrightcoveNewIE._extract_url(self, webpage) + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') data = self._parse_json( self._search_regex( @@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, 'title': title, 'description': description, 'series': series, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 5a427c396..857845d35 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -12,11 +12,16 @@ from ..utils import ( ExtractorError, fix_xml_ampersands, int_or_none, + merge_dicts, orderedSet, parse_duration, qualities, + str_or_none, strip_jsonp, unified_strdate, + unified_timestamp, + url_or_none, + urlencode_postdata, ) @@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._get_info(video_id) + try: + return self._get_info(url, video_id) + except ExtractorError: + return self._get_old_info(video_id) - def _get_info(self, video_id): + def _get_info(self, url, video_id): + token = self._download_json( + 'https://www.npostart.nl/api/token', video_id, + 'Downloading token', headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + })['token'] + + player = self._download_json( + 'https://www.npostart.nl/player/%s' % video_id, video_id, + 'Downloading player JSON', data=urlencode_postdata({ + 'autoplay': 0, + 'share': 1, + 'pageUrl': url, + 'hasAdConsent': 0, + '_token': token, + })) + + player_token = player['token'] + + format_urls = set() + formats = [] + for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): + streams = self._download_json( + 'https://start-player.npo.nl/video/%s/streams' % video_id, + video_id, 'Downloading %s profile JSON' % profile, fatal=False, + query={ + 'profile': profile, + 'quality': 'npo', + 'tokenId': player_token, + 'streamType': 'broadcast', + }) + if not streams: + continue + stream = streams.get('stream') + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('src')) + if not stream_url or stream_url in format_urls: + continue + format_urls.add(stream_url) + if stream.get('protection') is not None: + continue + stream_type = stream.get('type') + stream_ext = determine_ext(stream_url) + if stream_type == 'application/dash+xml' or stream_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif '.ism/Manifest' in stream_url: + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': stream_url, + }) + + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + embed_url = url_or_none(player.get('embedUrl')) + if 
embed_url: + webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed page', fatal=False) + if webpage: + video = self._parse_json( + self._search_regex( + r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', + default='{}'), video_id) + if video: + title = video.get('episodeTitle') + subtitles = {} + subtitles_list = video.get('subtitles') + if isinstance(subtitles_list, list): + for cc in subtitles_list: + cc_url = url_or_none(cc.get('src')) + if not cc_url: + continue + lang = str_or_none(cc.get('language')) or 'nl' + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + return merge_dicts({ + 'title': title, + 'description': video.get('description'), + 'thumbnail': url_or_none( + video.get('still_image_url') or video.get('orig_image_url')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('broadcastDate')), + 'creator': video.get('channel'), + 'series': video.get('title'), + 'episode': title, + 'episode_number': int_or_none(video.get('episodeNumber')), + 'subtitles': subtitles, + }, info) + + return info + + def _get_old_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/%s' % video_id, video_id, @@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE): # JSON else: video_url = stream_info.get('url') - if not video_url or video_url in urls: + if not video_url or 'vodnotavailable.' in video_url or video_url in urls: continue urls.add(video_url) if determine_ext(video_url) == 'm3u8': diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a2ae25272..c1dcbb7eb 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -249,7 +249,7 @@ class OpenloadIE(InfoExtractor): (?:www\.)? (?: openload\.(?:co|io|link|pw)| - oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw) + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live) ) )/ (?:f|embed)/ @@ -346,6 +346,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.pw/f/WyKgK8s94N0', 'only_matching': True, + }, { + 'url': 'https://oload.live/f/-Z58UZ-GR4M', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 8afe541ec..b337a56c0 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + int_or_none, parse_iso8601, unescapeHTML, ) @@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + width = int_or_none(broadcast.get('width')) + height = int_or_none(broadcast.get('height')) + + def add_width_and_height(f): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + video_urls = set() formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): @@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) if format_id != 'rtmp': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, token, 'mp4', entry_protocol='m3u8_native' if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False)) + m3u8_id=format_id, fatal=False) + if len(m3u8_formats) == 1: + add_width_and_height(m3u8_formats[0]) + 
formats.extend(m3u8_formats) continue - formats.append({ + rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', - }) + } + add_width_and_height(rtmp_format) + formats.append(rtmp_format) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index b52879c7a..27d65d4b9 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -4,9 +4,11 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, + urljoin, ) @@ -14,7 +16,7 @@ class PornHdIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P\d+)(?:/(?P.+))?' _TESTS = [{ 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', - 'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', 'info_dict': { 'id': '9864', 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', @@ -23,6 +25,7 @@ class PornHdIE(InfoExtractor): 'description': 'md5:3748420395e03e31ac96857a8f125b2b', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, + 'like_count': int, 'age_limit': 18, } }, { @@ -37,6 +40,7 @@ class PornHdIE(InfoExtractor): 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, + 'like_count': int, 'age_limit': 18, }, 'skip': 'Not available anymore', @@ -65,12 +69,14 @@ class PornHdIE(InfoExtractor): formats = [] for format_id, video_url in sources.items(): + video_url = urljoin(url, video_url) if not video_url: continue height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_id, 'height', default=None)) formats.append({ 'url': video_url, + 'ext': determine_ext(video_url, 'mp4'), 'format_id': format_id, 'height': height, }) @@ -85,6 +91,11 @@ class PornHdIE(InfoExtractor): r"poster'?\s*:\s*([\"'])(?P(?:(?!\1).)+)\1", webpage, 'thumbnail', fatal=False, group='url') + like_count = int_or_none(self._search_regex( + (r'(\d+)\s*]+>(?: |\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + return { 'id': video_id, 'display_id': display_id, @@ -92,6 +103,7 @@ class PornHdIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, + 'like_count': like_count, 'formats': formats, 'age_limit': 18, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index be93d5d48..641083da7 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -16,7 +16,6 @@ from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, int_or_none, - js_to_json, orderedSet, remove_quotes, str_to_int, @@ -303,14 +302,12 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - page_params = self._parse_json(self._search_regex( - r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', - webpage, 'page parameters', group='data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) - tags = categories = None - if page_params: - tags = page_params.get('tags', '').split(',') - categories = page_params.get('categories', '').split(',') + def extract_list(meta_key): + div = self._search_regex( + r'(?s)]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)
' + % meta_key, webpage, meta_key, default=None) + if div: + return re.findall(r']+\bhref=[^>]+>([^<]+)', div) return { 'id': video_id, @@ -325,8 +322,8 @@ class PornHubIE(PornHubBaseIE): 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, - 'tags': tags, - 'categories': categories, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 302f67d96..dd95f99f2 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -4,16 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( - xpath_text, - find_xpath_attr, determine_ext, + ExtractorError, int_or_none, unified_strdate, - xpath_element, - ExtractorError, - determine_protocol, - unsmuggle_url, ) @@ -61,107 +57,67 @@ class RadioCanadaIE(InfoExtractor): 'only_matching': True, } ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - app_code, video_id = re.match(self._VALID_URL, url).groups() - - metadata = self._download_xml( - 'http://api.radio-canada.ca/metaMedia/v1/index.ashx', - video_id, note='Downloading metadata XML', query={ + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ 'appCode': app_code, 'idMedia': video_id, }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] def get_meta(name): - el = find_xpath_attr(metadata, './/Meta', 'name', name) - return el.text if el is not None else None + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text # protectionType does not necessarily mean the video is DRM protected (see # https://github.com/rg3/youtube-dl/pull/18609). 
if get_meta('protectionType'): self.report_warning('This video is probably DRM protected.') - device_types = ['ipad'] - if not smuggled_data: - device_types.append('flash') - device_types.append('android') - - formats = [] - error = None - # TODO: extract f4m formats - # f4m formats can be extracted using flashhd device_type but they produce unplayable file - for device_type in device_types: - validation_url = 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx' - query = { - 'appCode': app_code, - 'idMedia': video_id, - 'connectionType': 'broadband', - 'multibitrate': 'true', - 'deviceType': device_type, - } - if smuggled_data: - validation_url = 'https://services.radio-canada.ca/media/validation/v2/' - query.update(smuggled_data) - else: - query.update({ - # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction - 'paysJ391wsHjbOJwvCs26toz': 'CA', - 'bypasslock': 'NZt5K62gRqfc', - }) - v_data = self._download_xml(validation_url, video_id, note='Downloading %s XML' % device_type, query=query, fatal=False) - v_url = xpath_text(v_data, 'url') - if not v_url: - continue - if v_url == 'null': - error = xpath_text(v_data, 'message') - continue - ext = determine_ext(v_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - v_url, video_id, f4m_id='hds', fatal=False)) - else: - ext = determine_ext(v_url) - bitrates = xpath_element(v_data, 'bitrates') - for url_e in bitrates.findall('url'): - tbr = int_or_none(url_e.get('bitrate')) - if not tbr: - continue - f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) - protocol = determine_protocol({'url': f_url}) - f = { - 'format_id': '%s-%d' % (protocol, tbr), - 'url': f_url, - 'ext': 'flv' if protocol == 'rtmp' else ext, - 'protocol': protocol, - 'width': int_or_none(url_e.get('width')), - 'height': int_or_none(url_e.get('height')), - 'tbr': tbr, - } - mobj = re.match(r'(?Prtmp://[^/]+/[^/]+)/(?P[^?]+)(?P\?.+)', f_url) - if mobj: - f.update({ - 'url': mobj.group('url') + mobj.group('auth'), - 'play_path': mobj.group('playpath'), - }) - formats.append(f) - if protocol == 'rtsp': - base_url = self._search_regex( - r'rtsp://([^?]+)', f_url, 'base url', default=None) - if base_url: - base_url = 'http://' + base_url - formats.extend(self._extract_m3u8_formats( - base_url + '/playlist.m3u8', video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - base_url + '/manifest.f4m', video_id, - f4m_id='hds', fatal=False)) - if not formats and error: + query = { + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + } + if self._claims: + query['claims'] = self._claims + v_data = self._call_api('validation/v2/', video_id, app_code, query) + v_url = v_data.get('url') + if not v_url: + error = v_data['message'] + if error == "Le contenu sélectionné n'est pas disponible dans votre pays": + raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) + if error == 'Le contenu sélectionné est disponible seulement en premium': + self.raise_login_required(error) raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') self._sort_formats(formats) subtitles = {} @@ -186,11 +142,14 @@ class RadioCanadaIE(InfoExtractor): 'formats': formats, } + def _real_extract(self, url): + return self._extract_info(*re.match(self._VALID_URL, url).groups()) + 
class RadioCanadaAudioVideoIE(InfoExtractor): 'radiocanada:audiovideo' - _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P[0-9]+)' + _TESTS = [{ 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', @@ -203,7 +162,10 @@ class RadioCanadaAudioVideoIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', + 'only_matching': True, + }] def _real_extract(self, url): return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 548a6553b..149153b8f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -288,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor): class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ # var uniquename = "ContentItem-..." # data-id="ContentItem-..." @@ -375,6 +375,9 @@ class RaiIE(RaiBaseIE): # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 10ac8ed1f..8f54d5675 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -21,7 +21,17 @@ from ..utils import ( class RutubeBaseIE(InfoExtractor): - def _extract_video(self, video, video_id=None, require_title=True): + def _download_api_info(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/video/%s/' % video_id, + video_id, 'Downloading video JSON', + 'Unable to download video JSON', query=query) + + @staticmethod + def _extract_info(video, video_id=None, require_title=True): title = video['title'] if require_title else video.get('title') age_limit = video.get('is_adult') @@ -32,7 +42,7 @@ class RutubeBaseIE(InfoExtractor): category = try_get(video, lambda x: x['category']['name']) return { - 'id': video.get('id') or video_id, + 'id': video.get('id') or video_id if video_id else video['id'], 'title': title, 'description': video.get('description'), 'thumbnail': video.get('thumbnail_url'), @@ -47,6 +57,42 @@ class RutubeBaseIE(InfoExtractor): 'is_live': bool_or_none(video.get('is_livestream')), } + def _download_and_extract_info(self, video_id, query=None): + return self._extract_info( + self._download_api_info(video_id, query=query), video_id) + + def _download_api_options(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/play/options/%s/' % video_id, + video_id, 'Downloading options JSON', + 'Unable to download options JSON', + headers=self.geo_verification_headers(), query=query) + + def _extract_formats(self, options, video_id): + formats = [] + for format_id, format_url in options['video_balancer'].items(): + ext = determine_ext(format_url) + if ext == 
'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + return formats + + def _download_and_extract_formats(self, video_id, query=None): + return self._extract_formats( + self._download_api_options(video_id, query=query), video_id) + class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' @@ -55,13 +101,13 @@ class RutubeIE(RutubeBaseIE): _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': '79938ade01294ef7e27574890d0d3769', + 'md5': '1d24f180fac7a02f3900712e5a5764d6', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', - 'duration': 80, + 'duration': 81, 'uploader': 'NTDRussian', 'uploader_id': '29790', 'timestamp': 1381943602, @@ -94,39 +140,12 @@ class RutubeIE(RutubeBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - - video = self._download_json( - 'http://rutube.ru/api/video/%s/?format=json' % video_id, - video_id, 'Downloading video JSON') - - info = self._extract_video(video, video_id) - - options = self._download_json( - 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, - video_id, 'Downloading options JSON', - headers=self.geo_verification_headers()) - - formats = [] - for format_id, format_url in options['video_balancer'].items(): - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - info['formats'] = formats + info = self._download_and_extract_info(video_id) + info['formats'] = self._download_and_extract_formats(video_id) return info -class RutubeEmbedIE(InfoExtractor): +class RutubeEmbedIE(RutubeBaseIE): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' @@ -135,7 +154,7 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'flv', + 'ext': 'mp4', 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', @@ -149,16 +168,26 @@ class RutubeEmbedIE(InfoExtractor): }, { 'url': 'http://rutube.ru/play/embed/8083783', 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, }] def _real_extract(self, url): embed_id = self._match_id(url) - webpage = self._download_webpage(url, embed_id) - - canonical_url = self._html_search_regex( - r'AA-\w+|\d+-\d+)' + _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P[aA]{2}-\w+|\d+-\d+)' _TESTS = [{ 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', - 'md5': '046dee641cda1c4cabe13baef3be2c1c', + 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', - 'title': 'Die Grünen aus Volkssicht', - 'description': 
'md5:052b5da1cb2cd7d562ef1f19be5a5cba', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Die Grünen aus Sicht des Volkes', + 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'thumbnail': r're:^https?://.*\.jpg', } }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).upper() webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + title = self._search_regex( + (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 0c4f865ef..35bc9fa50 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -61,7 +61,8 @@ class SixPlayIE(InfoExtractor): quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] subtitles = {} - for asset in clip_data['assets']: + assets = clip_data.get('assets') or [] + for asset in assets: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 81c81c8d5..15da3496e 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -16,8 +16,10 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - unified_strdate, + try_get, + unified_timestamp, update_url_query, + url_or_none, ) @@ -34,7 +36,7 @@ class SoundcloudIE(InfoExtractor): (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?!stations/track) (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -50,12 +52,17 @@ class SoundcloudIE(InfoExtractor): 'info_dict': { 'id': '62986583', 'ext': 'mp3', - 'upload_date': '20121011', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. 
ExTerrestrial Music', - 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'timestamp': 1349920598, + 'upload_date': '20121011', 'duration': 143, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, } }, # not streamable song @@ -67,9 +74,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', + 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227, + 'duration': 30, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { # rtmp @@ -84,11 +96,16 @@ class SoundcloudIE(InfoExtractor): 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # private link (alt format) @@ -99,11 +116,16 @@ class SoundcloudIE(InfoExtractor): 'id': '123998367', 'ext': 'mp3', 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'uploader': 'jaimeMF', 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # downloadable song @@ -116,9 +138,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Bus Brakes', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', + 'timestamp': 1389232924, 'upload_date': '20140109', 'duration': 17, 'license': 'cc-by-sa', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # private link, downloadable format @@ -131,9 +158,14 @@ class SoundcloudIE(InfoExtractor): 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', 'uploader': 'Ori Uplift Music', + 'timestamp': 1504206263, 'upload_date': '20170831', 'duration': 7449, 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, # no album art, use avatar pic for thumbnail @@ -146,10 +178,15 @@ class SoundcloudIE(InfoExtractor): 'title': 'Sideways (Prod. 
Mad Real)', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'uploader': 'garyvee', + 'timestamp': 1488152409, 'upload_date': '20170226', 'duration': 207, 'thumbnail': r're:https?://.*\.jpg', 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -157,7 +194,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' + _CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK' @staticmethod def _extract_urls(webpage): @@ -175,22 +212,33 @@ class SoundcloudIE(InfoExtractor): def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): track_id = compat_str(info['id']) + title = info['title'] name = full_title or track_id if quiet: self.report_extraction(name) thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') + username = try_get(info, lambda x: x['user']['username'], compat_str) + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + result = { 'id': track_id, - 'uploader': info.get('user', {}).get('username'), - 'upload_date': unified_strdate(info.get('created_at')), - 'title': info['title'], + 'uploader': username, + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, 'description': info.get('description'), 'thumbnail': thumbnail, 'duration': int_or_none(info.get('duration'), 1000), 'webpage_url': info.get('permalink_url'), 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), } formats = [] query = {'client_id': self._CLIENT_ID} @@ -368,7 +416,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): - _API_BASE = 'https://api.soundcloud.com' _API_V2_BASE = 'https://api-v2.soundcloud.com' def _extract_playlist(self, base_url, playlist_id, playlist_title): @@ -389,21 +436,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): next_href, playlist_id, 'Downloading track page %s' % (i + 1)) collection = response['collection'] - if not collection: - break - def resolve_permalink_url(candidates): + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href + + def resolve_entry(candidates): for cand in candidates: - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - entry_id = self._extract_id(cand) - if permalink_url and permalink_url.startswith('http'): - return permalink_url, entry_id + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + video_id=self._extract_id(cand), + video_title=cand.get('title')) for e in collection: - permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url, video_id=entry_id)) + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) next_href = response.get('next_href') if not next_href: @@ -429,46 +485,53 @@ class 
SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): (?:(?:www|m)\.)?soundcloud\.com/ (?P<user>[^/]+) (?:/ - (?P<rsrc>tracks|sets|reposts|likes|spotlight) + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) )? /?(?:[?#].*)?$ ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-akashic-chronicler', + 'url': 'https://soundcloud.com/soft-cell-official', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (All)', + 'id': '207965082', + 'title': 'Soft Cell (All)', }, - 'playlist_mincount': 74, + 'playlist_mincount': 28, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', + 'url': 'https://soundcloud.com/soft-cell-official/tracks', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Tracks)', + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', }, - 'playlist_mincount': 37, + 'playlist_mincount': 27, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'url': 'https://soundcloud.com/soft-cell-official/albums', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Playlists)', + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Playlists)', }, 'playlist_mincount': 2, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'url': 'https://soundcloud.com/jcv246/reposts', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Reposts)', + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', }, - 'playlist_mincount': 7, + 'playlist_mincount': 6, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'url': 'https://soundcloud.com/clalberg/likes', 'info_dict': { - 'id': '114582580', - 'title': 'The Akashic Chronicler (Likes)', + 'id': '11817582', + 'title': 'clalberg (Likes)', }, - 'playlist_mincount': 321, + 'playlist_mincount': 5, }, { 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { @@ -479,10 +542,11 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): }] _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, + 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } @@ -490,6 +554,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _TITLE_MAP = { 'all': 'All', 'tracks': 'Tracks', + 'albums': 'Albums', 'sets': 'Playlists', 'reposts': 'Reposts', 'likes': 'Likes', diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 67500b69c..f11d728ca 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -5,14 +5,17 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + orderedSet, parse_duration, parse_resolution, str_to_int, + url_or_none, + 
urlencode_postdata,
 )
 
 
 class SpankBangIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video'
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
     _TESTS = [{
         'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
         'md5': '1cc433e1d6aa14bc376535b8679302f7',
@@ -41,29 +44,71 @@ class SpankBangIE(InfoExtractor):
         # 4k
         'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k',
         'only_matching': True,
+    }, {
+        'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.spankbang.com/3vvn/play',
+        'only_matching': True,
+    }, {
+        'url': 'https://spankbang.com/2y3td/embed/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id, headers={
-            'Cookie': 'country=US'
-        })
+        webpage = self._download_webpage(
+            url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
+            video_id, headers={'Cookie': 'country=US'})
 
         if re.search(r'<[^>]+\bid=["\']video_removed', webpage):
             raise ExtractorError(
                 'Video %s is not available' % video_id, expected=True)
 
         formats = []
-        for mobj in re.finditer(
-                r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
-                webpage):
-            format_id, format_url = mobj.group('id', 'url')
+
+        def extract_format(format_id, format_url):
+            f_url = url_or_none(format_url)
+            if not f_url:
+                return
             f = parse_resolution(format_id)
             f.update({
-                'url': format_url,
+                'url': f_url,
                 'format_id': format_id,
             })
             formats.append(f)
+
+        STREAM_URL_PREFIX = 'stream_url_'
+
+        for mobj in re.finditer(
+                r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
+                % STREAM_URL_PREFIX, webpage):
+            extract_format(*mobj.group('id', 'url'))
+
+        if not formats:
+            stream_key = self._search_regex(
+                r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'stream key', group='value')
+
+            sb_csrf_session = self._get_cookies(
+                'https://spankbang.com')['sb_csrf_session'].value
+
+            stream = self._download_json(
+                'https://spankbang.com/api/videos/stream', video_id,
+                'Downloading stream JSON', data=urlencode_postdata({
+                    'id': stream_key,
+                    'data': 0,
+                    'sb_csrf_session': sb_csrf_session,
+                }), headers={
+                    'Referer': url,
+                    'X-CSRFToken': sb_csrf_session,
+                })
+
+            for format_id, format_url in stream.items():
+                if format_id.startswith(STREAM_URL_PREFIX):
+                    extract_format(
+                        format_id[len(STREAM_URL_PREFIX):], format_url)
+
         self._sort_formats(formats)
 
         title = self._html_search_regex(
@@ -94,3 +139,33 @@ class SpankBangIE(InfoExtractor):
             'formats': formats,
             'age_limit': age_limit,
         }
+
+
+class SpankBangPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
+    _TEST = {
+        'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
+        'info_dict': {
+            'id': 'ug0k',
+            'title': 'Big Ass Titties',
+        },
+        'playlist_mincount': 50,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
+
+        entries = [self.url_result(
+            'https://spankbang.com/%s/video' % video_id,
+            ie=SpankBangIE.ie_key(), video_id=video_id)
+            for video_id in orderedSet(re.findall(
+                r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))]
+
+        title = self._html_search_regex(
+            r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
+            fatal=False)
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git 
a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 47ac95ee8..c1a9deafe 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -27,6 +27,7 @@ class TeachableBaseIE(InfoExtractor): 'market.saleshacker.com': 'saleshacker', 'learnability.org': 'learnability', 'edurila.com': 'edurila', + 'courses.workitdaily.com': 'workitdaily', } _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 2e7876cc5..f1ab91cf2 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -3,22 +3,19 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .radiocanada import RadioCanadaIE from ..utils import ( - int_or_none, - js_to_json, - urlencode_postdata, extract_attributes, - smuggle_url, + int_or_none, + merge_dicts, + urlencode_postdata, ) -class TouTvIE(InfoExtractor): +class TouTvIE(RadioCanadaIE): _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' - _access_token = None - _claims = None _TESTS = [{ 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', @@ -46,18 +43,14 @@ class TouTvIE(InfoExtractor): email, password = self._get_login_info() if email is None: return - state = 'http://ici.tou.tv/' - webpage = self._download_webpage(state, None, 'Downloading homepage') - toutvlogin = self._parse_json(self._search_regex( - r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json) - authorize_url = toutvlogin['host'] + '/auth/oauth/v2/authorize' login_webpage = self._download_webpage( - authorize_url, None, 'Downloading login page', query={ - 'client_id': toutvlogin['clientId'], - 'redirect_uri': 'https://ici.tou.tv/login/loginCallback', + 'https://services.radio-canada.ca/auth/oauth/v2/authorize', + None, 'Downloading login page', query={ + 'client_id': '4dd36440-09d5-4468-8923-b6d91174ad36', + 'redirect_uri': 'https://ici.tou.tv/logincallback', 'response_type': 'token', - 'scope': 'media-drmt openid profile email id.write media-validation.read.privileged', - 'state': state, + 'scope': 'id.write media-validation.read', + 'state': '/', }) def extract_form_url_and_data(wp, default_form_url, form_spec_re=''): @@ -86,12 +79,7 @@ class TouTvIE(InfoExtractor): self._access_token = self._search_regex( r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', urlh.geturl(), 'access token') - self._claims = self._download_json( - 'https://services.radio-canada.ca/media/validation/v2/getClaims', - None, 'Extracting Claims', query={ - 'token': self._access_token, - 'access_token': self._access_token, - })['claims'] + self._claims = self._call_api('validation/v2/getClaims')['claims'] def _real_extract(self, url): path = self._match_id(url) @@ -102,19 +90,10 @@ class TouTvIE(InfoExtractor): self.report_warning('This video is probably DRM protected.', path) video_id = metadata['IdMedia'] details = metadata['Details'] - title = details['OriginalTitle'] - video_url = 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id) - if self._access_token and self._claims: - video_url = smuggle_url(video_url, { - 'access_token': self._access_token, - 'claims': self._claims, - }) - return { - '_type': 'url_transparent', - 'url': video_url, + return merge_dicts({ 'id': video_id, - 'title': title, + 'title': details.get('OriginalTitle'), 'thumbnail': details.get('ImageUrl'), 
'duration': int_or_none(details.get('LengthInSeconds')), - } + }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id)) diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py new file mode 100644 index 000000000..b0c7caabf --- /dev/null +++ b/youtube_dl/extractor/trunews.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, + int_or_none, + unified_timestamp, + update_url_query, + url_or_none, +) + + +class TruNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'md5': 'a19c024c3906ff954fac9b96ce66bb08', + 'info_dict': { + 'id': '5c5a21e65d3c196e1c0020cc', + 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'ext': 'mp4', + 'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?", + 'description': 'md5:c583b72147cc92cf21f56a31aff7a670', + 'duration': 3685, + 'timestamp': 1549411440, + 'upload_date': '20190206', + }, + 'add_ie': ['Zype'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://api.zype.com/videos', display_id, query={ + 'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', + 'per_page': 1, + 'active': 'true', + 'friendly_title': display_id, + })['response'][0] + + zype_id = video['_id'] + + thumbnails = [] + thumbnails_list = video.get('thumbnails') + if isinstance(thumbnails_list, list): + for thumbnail in thumbnails_list: + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + '_type': 'url_transparent', + 'url': update_url_query( + 'https://player.zype.com/embed/%s.js' % zype_id, + {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), + 'ie_key': 'Zype', + 'id': zype_id, + 'display_id': display_id, + 'title': video.get('title'), + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('published_at')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': int_or_none(video.get('request_count')), + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py index 3a5782525..ce892c8c5 100644 --- a/youtube_dl/extractor/trutv.py +++ b/youtube_dl/extractor/trutv.py @@ -4,44 +4,72 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..utils import ( + int_or_none, + parse_iso8601, +) class TruTVIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P<path>/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P<id>\d+))' + _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))' _TEST = { - 'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html', - 'md5': '2cdc844f317579fed1a7251b087ff417', + 'url': 
'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html', 'info_dict': { - 'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets', + 'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1', 'ext': 'mp4', - 'title': 'You Won\'t Believe These Sports Bets', - 'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.', - 'upload_date': '20130305', - } + 'title': 'Sunlight-Activated Flower', + 'description': "A customer is stunned when he sees Michael's sunlight-activated flower.", + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - auth_required = False - if path: - data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path + series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups() + + if video_id: + path = 'episode' + display_id = video_id else: - webpage = self._download_webpage(url, video_id) - video_id = self._search_regex( - r"TTV\.TVE\.episodeId\s*=\s*'([^']+)';", - webpage, 'video id', default=video_id) - auth_required = self._search_regex( - r'TTV\.TVE\.authRequired\s*=\s*(true|false);', - webpage, 'auth required', default='false') == 'true' - data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id - return self._extract_cvp_info( - data_src, path, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big', - 'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do', - }, - }, { + path = 'series/clip' + display_id = clip_slug + + data = self._download_json( + 'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id), + display_id) + video_data = data['episode'] if video_id else data['info'] + media_id = video_data['mediaId'] + title = video_data['title'].strip() + + info = self._extract_ngtv_info( + media_id, {}, { 'url': url, 'site_name': 'truTV', - 'auth_required': auth_required, + 'auth_required': video_data.get('isAuthRequired'), }) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('srcUrl') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video_data.get('publicationDate')), + 'series': video_data.get('showTitle'), + 'season_number': int_or_none(video_data.get('seasonNum')), + 'episode_number': int_or_none(video_data.get('episodeNum')), + }) + return info diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 3954f0b93..accff75b5 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,14 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..utils import ( - determine_ext, clean_html, - get_element_by_attribute, + determine_ext, ExtractorError, + get_element_by_attribute, + orderedSet, ) @@ -19,12 +21,12 @@ class TVPIE(InfoExtractor): _TESTS = [{ 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', - 'title': 'Czas honoru, I seria – odc. 
13', - 'description': 'md5:381afa5bca72655fe94b05cfe82bf53d', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:437f48b93558370b031740546b696e24', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -45,6 +47,7 @@ class TVPIE(InfoExtractor): 'title': 'Wiadomości, 28.09.2017, 19:30', 'description': 'Wydanie główne codziennego serwisu informacyjnego.' }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -75,8 +78,10 @@ class TVPIE(InfoExtractor): return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } @@ -87,6 +92,15 @@ class TVPEmbedIE(InfoExtractor): _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' _TESTS = [{ + 'url': 'tvp:194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + }, + }, { + # not available 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { @@ -94,6 +108,7 @@ class TVPEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Panorama, 07.12.2015, 15:40', }, + 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { 'url': 'tvp:22670268', 'only_matching': True, @@ -105,10 +120,13 @@ class TVPEmbedIE(InfoExtractor): webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - error_massage = get_element_by_attribute('class', 'msg error', webpage) - if error_massage: + error = self._html_search_regex( + r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', + webpage, 'error', default=None) or clean_html( + get_element_by_attribute('class', 'msg error', webpage)) + if error: raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error_massage)), expected=True) + self.IE_NAME, clean_html(error)), expected=True) title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', @@ -180,48 +198,55 @@ class TVPEmbedIE(InfoExtractor): } -class TVPSeriesIE(InfoExtractor): +class TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', + # series + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 'info_dict': { - 'title': 'Ogniem i mieczem', - 'id': '4278026', + 'id': '38678312', }, - 'playlist_count': 4, + 'playlist_count': 115, }, { - 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', + # film + 'url': 'https://vod.tvp.pl/website/gloria,35139666', 'info_dict': { - 'title': 'Boso przez świat', - 'id': '9329207', + 'id': '36637049', + 'ext': 'mp4', + 'title': 'Gloria, Gloria', }, - 'playlist_count': 86, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'only_matching': True, }] + def _entries(self, display_id, 
playlist_id): + url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) + for page_num in itertools.count(1): + page = self._download_webpage( + url, display_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + + video_ids = orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, + page)) + + if not video_ids: + break + + for video_id in video_ids: + yield self.url_result( + 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), + video_id=video_id) + def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id, tries=5) - - title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series') - playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._download_webpage( - 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5, - note='Downloading playlist') - - videos_paths = re.findall( - '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) - entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key()) - for v_path in videos_paths] - - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': title, - 'entries': entries, - } + mobj = re.match(self._VALID_URL, url) + display_id, playlist_id = mobj.group('display_id', 'id') + return self.playlist_result( + self._entries(display_id, playlist_id), playlist_id) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 8f1ff3b76..d82d48f94 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -493,10 +493,9 @@ class TVPlayHomeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_id = self._search_regex( - r'data-asset-id\s*=\s*["\'](\d{5,7})\b', webpage, 'video id', - default=None) + r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') - if video_id: + if len(video_id) < 8: return self.url_result( 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) @@ -537,8 +536,9 @@ class TVPlayHomeIE(InfoExtractor): r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', default=None)) episode = self._search_regex( - r'(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'episode', - default=None, group='value') + (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'episode', default=None, group='value') episode_number = int_or_none(self._search_regex( r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', default=None)) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 401615683..8c87f6dd3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -136,7 +136,12 @@ class TwitchBaseIE(InfoExtractor): source = next(f for f in formats if f['format_id'] == 'Source') source['preference'] = 10 except StopIteration: - pass # No Source stream present + for f in formats: + if '/chunked/' in f['url']: + f.update({ + 'source_preference': 10, + 'format_note': 'Source', + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 105826e9b..ae8de9897 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -29,7 +29,7 @@ class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'''(?x) https?:// - www\.udemy\.com/ + (?:[^/]+\.)?udemy\.com/ (?: 
[^#]+\#/lecture/| lecture/view/?\?lectureId=| @@ -64,6 +64,9 @@ class UdemyIE(InfoExtractor): # only outputs rendition 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -123,10 +126,22 @@ class UdemyIE(InfoExtractor): def _download_webpage_handle(self, *args, **kwargs): headers = kwargs.get('headers', {}).copy() - headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' kwargs['headers'] = headers - return super(UdemyIE, self)._download_webpage_handle( + ret = super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies.', expected=True) + return ret def _download_json(self, url_or_request, *args, **kwargs): headers = { @@ -403,8 +418,14 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)' - _TESTS = [] + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] @classmethod def suitable(cls, url): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6215b3258..6f32ea6f1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -502,7 +502,11 @@ class VimeoIE(VimeoBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') orig_url = url - if mobj.group('pro') or mobj.group('player'): + if mobj.group('pro'): + # some videos require portfolio_id to be present in player url + # https://github.com/rg3/youtube-dl/issues/20070 + url = self._extract_url(url, self._download_webpage(url, video_id)) + elif mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py deleted file mode 100644 index 858ac9e71..000000000 --- a/youtube_dl/extractor/vporn.py +++ /dev/null @@ -1,123 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - str_to_int, - urljoin, -) - - -class VpornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', - 'md5': 'facf37c1b86546fa0208058546842c55', - 'info_dict': { - 'id': '497944', - 'display_id': 'violet-on-her-th-birthday', - 'ext': 'mp4', - 'title': 'Violet on her 19th 
birthday', - 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'kileyGrope', - 'categories': ['Masturbation', 'Teen'], - 'duration': 393, - 'age_limit': 18, - 'view_count': int, - }, - 'skip': 'video removed', - }, - { - 'url': 'http://www.vporn.com/female/hana-shower/523564/', - 'md5': 'ced35a4656198a1664cf2cda1575a25f', - 'info_dict': { - 'id': '523564', - 'display_id': 'hana-shower', - 'ext': 'mp4', - 'title': 'Hana Shower', - 'description': 'Hana showers at the bathroom.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Hmmmmm', - 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'], - 'duration': 588, - 'age_limit': 18, - 'view_count': int, - } - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!' - if errmsg in webpage: - raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) - - title = self._html_search_regex( - r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() - description = self._html_search_regex( - r'class="(?:descr|description_txt)">(.*?)</div>', - webpage, 'description', fatal=False) - thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', - default=None)) - - uploader = self._html_search_regex( - r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - - categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage) - - duration = parse_duration(self._search_regex( - r'Runtime:\s*</span>\s*(\d+ min \d+ sec)', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'class="views">([\d,\.]+) [Vv]iews<', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r"'Comments \(([\d,\.]+)\)'", - webpage, 'comment count', default=None)) - - formats = [] - - for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage): - video_url = video[1] - fmt = { - 'url': video_url, - 'format_id': video[0], - } - m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url) - if m: - fmt.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'vbr': int(m.group('vbr')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'categories': categories, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py index e4ec77889..c631ac1fa 100644 --- a/youtube_dl/extractor/vshare.py +++ b/youtube_dl/extractor/vshare.py @@ -48,7 +48,7 @@ class VShareIE(InfoExtractor): webpage = self._download_webpage( 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, - video_id) + video_id, headers={'Referer': url}) title = self._html_search_regex( r'<title>([^<]+)', webpage, 'title') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c8bf98b58..457e2acea 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py
@@ -352,6 +352,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             (?:www\.)?yourepeat\.com/|
                             tube\.majestyc\.net/|
                             (?:www\.)?invidio\.us/|
+                            (?:www\.)?invidious\.snopyta\.org/|
+                            (?:www\.)?invidious\.kabi\.tk/|
+                            (?:www\.)?vid\.wxzm\.sx/|
                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                          (?:                                                  # the various things that can precede the ID:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index d0cb65814..a71eda85d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -184,7 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
 ])
 
 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
-JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 
 
 def preferredencoding():
@@ -1141,6 +1141,8 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 
 
 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
+    _HTTPONLY_PREFIX = '#HttpOnly_'
+
     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
         # Store session cookies with `expires` set to 0 instead of an empty
         # string
@@ -1150,7 +1152,21 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
         compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
 
     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
-        compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
+        """Load cookies from a file."""
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
+
+        cf = io.StringIO()
+        with open(filename) as f:
+            for line in f:
+                if line.startswith(self._HTTPONLY_PREFIX):
+                    line = line[len(self._HTTPONLY_PREFIX):]
+                cf.write(compat_str(line))
+        cf.seek(0)
+        self._really_load(cf, filename, ignore_discard, ignore_expires)
         # Session cookies are denoted by either `expires` field set to
         # an empty string or 0. MozillaCookieJar only recognizes the former
         # (see [1]). So we need force the latter to be recognized as session
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index be3bbdd73..42ba37f15 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2019.01.30.1'
+__version__ = '2019.03.01'
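
A minimal standalone sketch of the "#HttpOnly_" handling that the utils.py hunk above adds to YoutubeDLCookieJar.load(). It is illustrative only, not part of the patch: the cookies.txt path and the load_cookies() helper are made-up names, and Python 3's http.cookiejar stands in for youtube-dl's compat_cookiejar/compat_str layer. Browsers export HttpOnly cookies into Netscape-format cookies.txt files with a "#HttpOnly_" prefix, which MozillaCookieJar would otherwise skip as a comment line; normalizing the lines in an in-memory buffer keeps those cookies while leaving the file on disk untouched. Assumes a standard cookies.txt export (including its "# Netscape HTTP Cookie File" header) exists at the given path.

#!/usr/bin/env python3
import io
from http.cookiejar import MozillaCookieJar

HTTPONLY_PREFIX = '#HttpOnly_'


def load_cookies(path):
    # Normalize the exported file in memory so HttpOnly entries parse as
    # regular cookie lines instead of being dropped as comments.
    buf = io.StringIO()
    with open(path) as f:
        for line in f:
            if line.startswith(HTTPONLY_PREFIX):
                line = line[len(HTTPONLY_PREFIX):]
            buf.write(line)
    buf.seek(0)
    jar = MozillaCookieJar()
    # _really_load() is the parser that MozillaCookieJar.load() calls internally.
    jar._really_load(buf, path, ignore_discard=True, ignore_expires=True)
    return jar


if __name__ == '__main__':
    print('%d cookie(s) loaded' % len(load_cookies('cookies.txt')))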