From c18142da6e0e99a7b4c9ab488ddb285ad1e8dad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 May 2018 22:46:06 +0700 Subject: [PATCH 001/187] [itv] Improve extraction (closes #16253) --- youtube_dl/extractor/itv.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 18a7d7f8c..457b424a2 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -41,6 +41,14 @@ class ITVIE(InfoExtractor): # unavailable via data-playlist-url 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', 'only_matching': True, + }, { + # InvalidVodcrid + 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', + 'only_matching': True, + }, { + # ContentUnavailable + 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', + 'only_matching': True, }] def _real_extract(self, url): @@ -127,7 +135,8 @@ class ITVIE(InfoExtractor): if fault_code == 'InvalidGeoRegion': self.raise_geo_restricted( msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code != 'InvalidEntity': + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): raise ExtractorError( '%s said: %s' % (self.IE_NAME, fault_string), expected=True) info.update({ From a93ce61bd5cbe7779e4eff0f8ab74a8a02211285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 May 2018 01:29:44 +0700 Subject: [PATCH 002/187] [tunein] Use live title for live streams (closes #16347) --- youtube_dl/extractor/tunein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 7e51de89e..c7a5f5a63 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -62,7 +62,7 @@ class TuneInBaseIE(InfoExtractor): return { 'id': content_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': 
formats, 'thumbnail': thumbnail, 'location': location, From 5f95927a62a533b9e616abb5f1481cedeaa16a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 May 2018 07:18:01 +0700 Subject: [PATCH 003/187] Improve geo bypass mechanism * Introduce geo bypass context * Add ability to bypass based on IP blocks in CIDR notation * Introduce --geo-bypass-ip-block --- youtube_dl/YoutubeDL.py | 3 + youtube_dl/__init__.py | 1 + youtube_dl/extractor/anvato.py | 4 +- youtube_dl/extractor/brightcove.py | 5 +- youtube_dl/extractor/common.py | 97 ++++++++++++++++++++++++------ youtube_dl/extractor/dplay.py | 4 +- youtube_dl/extractor/go.py | 2 +- youtube_dl/extractor/limelight.py | 4 +- youtube_dl/extractor/tvplay.py | 6 +- youtube_dl/options.py | 4 ++ youtube_dl/utils.py | 11 ++-- 11 files changed, 113 insertions(+), 28 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ad3598805..f1a359011 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -286,6 +286,9 @@ class YoutubeDL(object): Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking X-Forwarded-For HTTP header (experimental) + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country (experimental) The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. 
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9bb952457..ba435ea42 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -430,6 +430,7 @@ def _real_main(argv=None): 'config_location': opts.config_location, 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, # just for deprecation check 'autonumber': opts.autonumber if opts.autonumber is True else None, 'usetitle': opts.usetitle if opts.usetitle is True else None, diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 7a29cd2c6..f6a78eb5d 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -277,7 +277,9 @@ class AnvatoIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0e4eaef65..ab62e54d6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -669,7 +669,10 @@ class BrightcoveNewIE(AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a9939b0fd..3ef5af13c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -346,6 +346,11 @@ class InfoExtractor(object): geo restriction bypass mechanism right away in order 
to bypass geo restriction, of course, if the mechanism is not disabled. (experimental) + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. (experimental) + NB: both these geo attributes are experimental and may change in future or be completely removed. @@ -358,6 +363,7 @@ class InfoExtractor(object): _x_forwarded_for_ip = None _GEO_BYPASS = True _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None _WORKING = True def __init__(self, downloader=None): @@ -392,12 +398,15 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass(self._GEO_COUNTRIES) + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) if not self._ready: self._real_initialize() self._ready = True - def _initialize_geo_bypass(self, countries): + def _initialize_geo_bypass(self, geo_bypass_context): """ Initialize geo restriction bypass mechanism. @@ -408,28 +417,82 @@ class InfoExtractor(object): HTTP requests. This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES. + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. - You may also manually call it from extractor's code if geo countries + You may also manually call it from extractor's code if geo bypass information is not available beforehand (e.g. obtained during - extraction) or due to some another reason. + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. 
It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + """ if not self._x_forwarded_for_ip: - country_code = self._downloader.params.get('geo_bypass_country', None) - # If there is no explicit country for geo bypass specified and - # the extractor is known to be geo restricted let's fake IP - # as X-Forwarded-For right away. - if (not country_code and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - countries): - country_code = random.choice(countries) - if country_code: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + + # Geo bypass mechanism is explicitly disabled by user + if not self._downloader.params.get('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. 
+ + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self._downloader.params.get('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s as X-Forwarded-For.' + % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self._downloader.params.get('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) if self._downloader.params.get('verbose', False): self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' 
- % (self._x_forwarded_for_ip, country_code.upper())) + % (self._x_forwarded_for_ip, country.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index b73446773..8e0374320 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -102,7 +102,9 @@ class DPlayIE(InfoExtractor): display_id = mobj.group('id') domain = mobj.group('domain') - self._initialize_geo_bypass([mobj.group('country').upper()]) + self._initialize_geo_bypass({ + 'countries': [mobj.group('country').upper()], + }) webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 9c7b1bd37..e781405f2 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -123,7 +123,7 @@ class GoIE(AdobePassIE): 'adobe_requestor_id': requestor_id, }) else: - self._initialize_geo_bypass(['US']) + self._initialize_geo_bypass({'countries': ['US']}) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', video_id, data=urlencode_postdata(data)) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 2803d7e8d..729d8de50 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -282,7 +282,9 @@ class LimelightMediaIE(LimelightBaseIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) pc, mobile, metadata = self._extract( video_id, 'getPlaylistByMediaId', diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 84597b55e..e09b5f804 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -227,14 +227,16 @@ class 
TVPlayIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, 'geo country', default=None) if geo_country: - self._initialize_geo_bypass([geo_country.upper()]) + self._initialize_geo_bypass({'countries': [geo_country.upper()]}) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3e4ac03a2..f3f8f23b6 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -249,6 +249,10 @@ def parseOpts(overrideArguments=None): '--geo-bypass-country', metavar='CODE', dest='geo_bypass_country', default=None, help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') + geo.add_option( + '--geo-bypass-ip-block', metavar='IP_BLOCK', + dest='geo_bypass_ip_block', default=None, + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b460393bf..f9ca63c58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3534,10 +3534,13 @@ class GeoUtils(object): } @classmethod - def random_ipv4(cls, code): - block = cls._country_ip_map.get(code.upper()) - if not block: - return None + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block addr, preflen = block.split('/') addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] addr_max = addr_min | (0xffffffff >> int(preflen)) 
From ea1f5e5dbd6c58d4f0872a65b97611732f4b29bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 May 2018 07:21:24 +0700 Subject: [PATCH 004/187] [itv:btcc] Add extractor (closes #16139) --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/itv.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9fe3f649d..316c8199d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -477,7 +477,10 @@ from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE -from .itv import ITVIE +from .itv import ( + ITVIE, + ITVBTCCIE, +) from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 457b424a2..6a4f8a505 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -7,6 +7,7 @@ import json import re from .common import InfoExtractor +from .brightcove import BrightcoveNewIE from ..compat import ( compat_str, compat_etree_register_namespace, @@ -18,6 +19,7 @@ from ..utils import ( xpath_text, int_or_none, parse_duration, + smuggle_url, ExtractorError, determine_ext, ) @@ -260,3 +262,38 @@ class ITVIE(InfoExtractor): 'subtitles': subtitles, }) return info + + +class ITVBTCCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'info_dict': { + 'id': 'btcc-2018-all-the-action-from-brands-hatch', + 'title': 'BTCC 2018: All the action from Brands Hatch', + }, + 'playlist_mincount': 9, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = 
self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + }), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) + for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + + title = self._og_search_title(webpage, fatal=False) + + return self.playlist_result(entries, playlist_id, title) From 3cc0d0b8299308958bfe8b4c42c739505df27f50 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 2 May 2018 09:32:53 +0100 Subject: [PATCH 005/187] [discovery] extract Affiliate/Anonymous Auth Token from cookies(closes #14954) --- youtube_dl/extractor/discovery.py | 37 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 91449dcd8..3589bd428 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -5,7 +5,10 @@ import re import string from .discoverygo import DiscoveryGoBaseIE -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, try_get, @@ -55,15 +58,27 @@ class DiscoveryIE(DiscoveryGoBaseIE): video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] video_id = video['id'] - access_token = self._download_json( - 'https://www.%s.com/anonymous' % site, display_id, query={ - 'authRel': 'authorization', - 'client_id': try_get( - react_data, lambda x: x['application']['apiClientId'], - compat_str) or '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, - })['access_token'] + access_token 
= None + cookies = self._get_cookies(url) + + # prefer Affiliate Auth Token over Anonymous Auth Token + auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') + if auth_storage_cookie and auth_storage_cookie.value: + auth_storage = self._parse_json(compat_urllib_parse_unquote( + compat_urllib_parse_unquote(auth_storage_cookie.value)), + video_id, fatal=False) or {} + access_token = auth_storage.get('a') or auth_storage.get('access_token') + + if not access_token: + access_token = self._download_json( + 'https://www.%s.com/anonymous' % site, display_id, query={ + 'authRel': 'authorization', + 'client_id': try_get( + react_data, lambda x: x['application']['apiClientId'], + compat_str) or '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, + })['access_token'] try: stream = self._download_json( @@ -72,7 +87,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'Authorization': 'Bearer ' + access_token, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): e_description = self._parse_json( e.cause.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: From a90a6b54ee5ceb6002f4ebd73d62c65cc00484d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 May 2018 20:43:34 +0700 Subject: [PATCH 006/187] [watchbox] Fix extraction (closes #16356) --- youtube_dl/extractor/watchbox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py index b382338fa..be0bcba15 100644 --- a/youtube_dl/extractor/watchbox.py +++ b/youtube_dl/extractor/watchbox.py @@ -69,7 +69,7 @@ class WatchBoxIE(InfoExtractor): source = self._parse_json( self._search_regex( - 
r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', + r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source', default='{}'), video_id, transform_source=js_to_json, fatal=False) or {} From 660a230b2dcc734f018557c7898384ba438e9137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 May 2018 01:21:52 +0700 Subject: [PATCH 007/187] [cloudflarestream] Add support for cloudflare streams (closes #16375) --- youtube_dl/extractor/cloudflarestream.py | 60 ++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 19 ++++++++ 3 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/cloudflarestream.py diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py new file mode 100644 index 000000000..e6d92cca2 --- /dev/null +++ b/youtube_dl/extractor/cloudflarestream.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloudflareStreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:watch\.)?cloudflarestream\.com/| + embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo= + ) + (?P<id>[\da-f]+) + ''' + _TESTS = [{ + 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', + 'only_matching': True, + }, { + 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', + 
webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://cloudflarestream.com/%s/manifest/video.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_mpd_formats( + 'https://cloudflarestream.com/%s/manifest/video.mpd' % video_id, + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 316c8199d..a00e003c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -195,6 +195,7 @@ from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 73980601c..532c995f5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,6 +107,7 @@ from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE class GenericIE(InfoExtractor): @@ -2013,6 +2014,19 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # CloudflareStream embed + 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'add_ie': [CloudflareStreamIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3025,6 +3039,11 @@ 
class GenericIE(InfoExtractor): return self.playlist_from_matches( xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) + if cloudflarestream_urls: + return self.playlist_from_matches( + cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 789b7774a771335c7d0b42c834195bef2e8617c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 May 2018 21:58:55 +0700 Subject: [PATCH 008/187] [businessinsider] Add extractor (closes #16387, closes #16388, closes #16389) --- youtube_dl/extractor/businessinsider.py | 42 +++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 15 --------- 3 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 youtube_dl/extractor/businessinsider.py diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py new file mode 100644 index 000000000..dfcf9bc6b --- /dev/null +++ b/youtube_dl/extractor/businessinsider.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'hZRllCfw', + 'ext': 'mp4', + 'title': "Here's how much radiation you're exposed to in everyday life", + 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', + 'upload_date': '20170709', + 'timestamp': 1499606400, + }, + 'params': { + 'skip_download': True, + }, + }, { + 
'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'only_matching': True, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a00e003c2..f03f98a6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -137,6 +137,7 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 532c995f5..76852f9dc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1472,21 +1472,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Ooyala embed - { - 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', - 'info_dict': { - 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', - 'ext': 'mp4', - 'description': 'Index/Match versus VLOOKUP.', - 'title': 'This is what separates the Excel masters from the wannabes', - 'duration': 191.933, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - } - }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', From 0ce76801e8f6e4d69182c20d9cef4de772555ad7 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 May 2018 22:33:35 +0700 Subject: [PATCH 009/187] [udemy] Extract stream URLs (closes #16372) --- youtube_dl/extractor/udemy.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bf1134e3f..4664e6222 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -105,7 +105,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,data', }) def _handle_error(self, response): @@ -303,9 +303,10 @@ class UdemyIE(InfoExtractor): 'url': src, }) - download_urls = asset.get('download_urls') - if isinstance(download_urls, dict): - extract_formats(download_urls.get('Video')) + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) view_html = lecture.get('view_html') if view_html: From 2fbd86352eaa9df6afeed6698114132aea3cbe81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 May 2018 22:57:01 +0700 Subject: [PATCH 010/187] [udemy] Extract asset captions --- youtube_dl/extractor/udemy.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4664e6222..0a74a9768 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, js_to_json, sanitized_Request, + try_get, unescapeHTML, urlencode_postdata, ) @@ -105,7 +106,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 
'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', }) def _handle_error(self, response): @@ -308,6 +309,21 @@ class UdemyIE(InfoExtractor): if isinstance(urls, dict): extract_formats(urls.get('Video')) + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = cc.get('url') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) + view_html = lecture.get('view_html') if view_html: view_html_urls = set() From 05108a496a0eb6ca5d6f93072e2871dec8958b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 8 May 2018 22:57:52 +0700 Subject: [PATCH 011/187] [YoutubeDL] Ensure ext exists for automatic captions --- youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f1a359011..046e03247 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1482,23 +1482,28 @@ class YoutubeDL(object): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = 
info_dict.get('automatic_captions') subtitles = info_dict.get('subtitles') - if subtitles: - for _, subtitle in subtitles.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: - self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return + info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, - info_dict.get('automatic_captions')) + info_dict['id'], subtitles, automatic_captions) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: From 44277998adae1e17e4d21208e7dd1ad44decc733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 May 2018 00:34:39 +0700 Subject: [PATCH 012/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 916b8edb8..ab6c5dab6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +* Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams (#16347) +* [itv] Improve extraction (#16253) + + version 2018.05.01 Core 
From 9e18bb4c67af7b748ee62247d751c0e705aa791a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 May 2018 00:36:47 +0700 Subject: [PATCH 013/187] release 2018.05.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 3 +++ docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c2bd5d8ae..b2bfa9ec5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.01 +[debug] youtube-dl version 2018.05.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ab6c5dab6..ef6cc3850 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.09 Core * [YoutubeDL] Ensure ext exists for automatic captions diff --git a/README.md b/README.md index 5af0f387b..d9fe2350a 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental) + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with + explicitly provided IP block in CIDR + notation (experimental) ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c5a48002b..88fac6e90 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,6 +122,7 @@ - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - 
**BusinessInsider** - **BuzzFeed** - **BYUtv** - **Camdemy** @@ -163,6 +164,7 @@ - **ClipRs** - **Clipsyndicate** - **CloserToTruth** + - **CloudflareStream** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -373,6 +375,7 @@ - **Ir90Tv** - **ITTF** - **ITV** + - **ITVBTCC** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04896efc8..6f47b1795 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.01' +__version__ = '2018.05.09' From ff8889cd4dfae0ae3758e3d8a496f5724f6dc092 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 10 May 2018 08:19:32 +0100 Subject: [PATCH 014/187] [teamcoco] fix extraction(closes #16374) --- youtube_dl/extractor/teamcoco.py | 175 ++++++++++++++----------------- 1 file changed, 80 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9056c8cbc..f06e5b19a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,35 +1,34 @@ # coding: utf-8 from __future__ import unicode_literals -import binascii -import re import json from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, -) from ..utils import ( - ExtractorError, - qualities, determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, + qualities, ) class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P[0-9]+)?/?(?P.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay 
Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -40,7 +39,8 @@ class TeamcocoIE(InfoExtractor): 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -49,6 +49,8 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -63,110 +65,93 @@ class TeamcocoIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) + + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'http://teamcoco.com/graphql/', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }))['data'][find_object] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + display_id = self._match_id(url) - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) + response = self._graphql_call('''{ + %s(slug: "video/%s") { + ... 
on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') + record = response['record'] + video_id = record['id'] - data = None - - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = compat_b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): - continue - - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. 
This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) + srcs = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id)['src'] formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in srcs.items(): + if not isinstance(src, dict): continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) else: - if filed['url'].startswith('/mp4:protected/'): + if src_url.startswith('/mp4:protected/'): # TODO Correct extraction for these files continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) - else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) formats.append({ - 'url': filed['url'], - 'ext': 'mp4', + 'url': src_url, + 'ext': ext, 'tbr': tbr, 'format_id': 
format_id, 'quality': get_quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), } From 1344d3e169840f6c9d585648c1597da6a2b00ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 May 2018 22:01:13 +0700 Subject: [PATCH 015/187] [nickbr] Relax _VALID_URL (#13230) --- youtube_dl/extractor/nick.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 256a24d86..5e34d776b 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,7 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.nl + (?:www\.)?nickjr\.[a-z]{2} ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) ''' @@ -98,6 +98,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', + 'only_matching': True, }] def _real_extract(self, url): From bc5e4aa57e92201610f9ab79b10a3ae3b316fc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 May 2018 22:22:26 +0700 Subject: [PATCH 016/187] [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) --- youtube_dl/extractor/mixcloud.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/mixcloud.py 
b/youtube_dl/extractor/mixcloud.py index a56b7690f..b7bccb504 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -179,6 +179,10 @@ class MixcloudIE(InfoExtractor): formats.append({ 'format_id': 'http', 'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, }) self._sort_formats(formats) From dbd5c502ead468771d45c7893dd5dd14cf99a276 Mon Sep 17 00:00:00 2001 From: llyyr Date: Thu, 10 May 2018 21:47:23 +0530 Subject: [PATCH 017/187] [redditr] Relax _VALID_URL (closes #16426) --- youtube_dl/extractor/reddit.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 8372925be..7b0aa6232 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -86,6 +86,10 @@ class RedditRIE(InfoExtractor): # youtube 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, }] def _real_extract(self, url): From 49fa7de301019e23e66c01e5007561eefd51ca47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 May 2018 23:20:12 +0700 Subject: [PATCH 018/187] [twitch:clips] Fix extraction (closes #16429) --- youtube_dl/extractor/twitch.py | 102 ++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git 
a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4c11fd3c3..ec96ae506 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -8,6 +8,7 @@ import random from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -16,11 +17,14 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + float_or_none, int_or_none, - js_to_json, orderedSet, parse_duration, parse_iso8601, + qualities, + try_get, + unified_timestamp, update_url_query, urlencode_postdata, urljoin, @@ -45,10 +49,11 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _call_api(self, path, item_id, note): + def _call_api(self, path, item_id, *args, **kwargs): + kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, note, - headers={'Client-ID': self._CLIENT_ID}) + '%s/%s' % (self._API_BASE, path), item_id, + *args, **compat_kwargs(kwargs)) self._handle_error(response) return response @@ -622,21 +627,23 @@ class TwitchStreamIE(TwitchBaseIE): } -class TwitchClipsIE(InfoExtractor): +class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': 'AggressiveCobraPoooound', + 'id': '42850523', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', 'creator': 'EA', 'uploader': 'stereotype_', - 'uploader_id': 'stereotype_', + 'uploader_id': '43566419', }, }, { # multiple formats @@ -647,34 +654,61 @@ class 
TwitchClipsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + status = self._download_json( + 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, + video_id) - clip = self._parse_json( - self._search_regex( - r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), - video_id, transform_source=js_to_json) + formats = [] - title = clip.get('title') or clip.get('channel_title') or self._og_search_title(webpage) + for option in status['quality_options']: + if not isinstance(option, dict): + continue + source = option.get('source') + if not source or not isinstance(source, compat_str): + continue + formats.append({ + 'url': source, + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frame_rate')), + }) - formats = [{ - 'url': option['source'], - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - } for option in clip.get('quality_options', []) if option.get('source')] - - if not formats: - formats = [{ - 'url': clip['clip_video_url'], - }] - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), - 'uploader': clip.get('curator_login'), - 'uploader_id': clip.get('curator_display_name'), + info = { 'formats': formats, } + + clip = self._call_api( + 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ + 'Accept': 'application/vnd.twitchtv.v5+json', + }) + + if clip: + quality_key = qualities(('tiny', 'small', 'medium')) + thumbnails = [] + thumbnails_dict = clip.get('thumbnails') + if isinstance(thumbnails_dict, dict): + for thumbnail_id, thumbnail_url in thumbnails_dict.items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + + info.update({ + 'id': 
clip.get('tracking_id') or video_id, + 'title': clip.get('title') or video_id, + 'duration': float_or_none(clip.get('duration')), + 'views': int_or_none(clip.get('views')), + 'timestamp': unified_timestamp(clip.get('created_at')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + }) + else: + info.update({ + 'title': video_id, + 'id': video_id, + }) + + return info From 07acdc5afcff3b47b26b26355a75704e3cda670f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 May 2018 12:08:54 +0700 Subject: [PATCH 019/187] [twitch:clips] Sort formats --- youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ec96ae506..3ee2af52e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -673,6 +673,8 @@ class TwitchClipsIE(TwitchBaseIE): 'fps': int_or_none(option.get('frame_rate')), }) + self._sort_formats(formats) + info = { 'formats': formats, } From 90b633f86b000f8b6a58ce99d9bbbe0fff6d4f62 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 13 May 2018 11:30:21 +0100 Subject: [PATCH 020/187] [nbc] improve info extraction(fixes #16440) --- youtube_dl/extractor/nbc.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 9dc8f9ebc..1b1722cfa 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,6 +9,7 @@ from .adobepass import AdobePassIE from ..utils import ( find_xpath_attr, smuggle_url, + try_get, unescapeHTML, update_url_query, int_or_none, @@ -78,10 +79,14 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' 
+ permalink - video_data = self._download_json( + response = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, - })['data'][0]['attributes'] + 'fields[videos]': 'description,entitlement,episodeNumber,guid,keywords,seasonNumber,title,vChipRating', + 'fields[shows]': 'shortTitle', + 'include': 'show.shortTitle', + }) + video_data = response['data'][0]['attributes'] query = { 'mbr': 'true', 'manifest': 'm3u', @@ -103,10 +108,11 @@ class NBCIE(AdobePassIE): 'title': title, 'url': theplatform_url, 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), + 'tags': video_data.get('keywords'), 'season_number': int_or_none(video_data.get('seasonNumber')), 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), + 'episode': title, + 'series': try_get(response, lambda x: x['included'][0]['attributes']['shortTitle']), 'ie_key': 'ThePlatform', } From 4c76aa06665621c7689938afd7bbdbc797b5c7ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 13 May 2018 13:20:16 +0100 Subject: [PATCH 021/187] [youtube] fix extraction for embed restricted live streams(fixes #16433) --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 04aeb91af..1f29e8a4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1537,7 +1537,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map'): + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) @@ -1969,9 +1969,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - unavailable_message = extract_unavailable_message() - if unavailable_message: - raise ExtractorError(unavailable_message, expected=True) + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest From 84a9fef899374d46cfad8d292187ca8d84791c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 May 2018 22:49:01 +0700 Subject: [PATCH 022/187] [youtube] Make uploader extraction non fatal (#16444) --- youtube_dl/extractor/youtube.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f29e8a4e..897398d20 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1697,9 +1697,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_information_extraction(video_id) # uploader - if 'author' not in video_info: - raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') # uploader_id video_uploader_id = None From c63ca0eef8ac147b3f2a39ba7265ad1b3c11d516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 14 May 2018 23:27:56 +0700 Subject: [PATCH 023/187] [youtube] Improve format filesize extraction (#16453) --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 897398d20..7f4298c08 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1815,6 +1815,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): chapters = self._extract_chapters(description_original, video_duration) + def _extract_filesize(media_url): + return int_or_none(self._search_regex( + r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1919,8 +1923,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + filesize = int_or_none(url_data.get( + 'clen', [None])[0]) or _extract_filesize(url) + more_fields = { - 'filesize': int_or_none(url_data.get('clen', [None])[0]), + 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, @@ -1994,6 +2001,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for df in self._extract_mpd_formats( mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): + if not df.get('filesize'): + df['filesize'] = _extract_filesize(df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df From 1e4fe5a7cc80f73b92e068515352d7c7124a49c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 14 May 2018 23:42:33 +0700 Subject: [PATCH 024/187] [options] Fix typo (closes #16450) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index f3f8f23b6..b692c6b3b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -232,7 +232,7 @@ def parseOpts(overrideArguments=None): 
'--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.') + 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') geo.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', From 7f34984e811897b65e1b7e3a25cfdb45bf863dcf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 08:08:44 +0100 Subject: [PATCH 025/187] [dtube] Add new extractor(closes #15201) --- youtube_dl/extractor/dtube.py | 83 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 84 insertions(+) create mode 100644 youtube_dl/extractor/dtube.py diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py new file mode 100644 index 000000000..4ca97f860 --- /dev/null +++ b/youtube_dl/extractor/dtube.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +from socket import timeout + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P[0-9a-z.-]+)/(?P[0-9a-z]{8})' + _TEST = { + 'url': 'https://d.tube/#!/v/benswann/zqd630em', + 'md5': 'a03eaa186618ffa7a3145945543a251e', + 'info_dict': { + 'id': 'zqd630em', + 'ext': 'mp4', + 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom', + 'description': 'md5:700d164e066b87f9eac057949e4227c2', + 'uploader_id': 'benswann', + 'upload_date': '20180222', + 'timestamp': 1519328958, + }, + 'params': { + 'format': '480p', + }, + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + result = self._download_json('https://api.steemit.com/', 
video_id, data=json.dumps({ + 'jsonrpc': '2.0', + 'method': 'get_content', + 'params': [uploader_id, video_id], + }).encode())['result'] + + metadata = json.loads(result['json_metadata']) + video = metadata['video'] + content = video['content'] + info = video.get('info', {}) + title = info.get('title') or result['title'] + + def canonical_url(h): + if not h: + return None + return 'https://ipfs.io/ipfs/' + h + + formats = [] + for q in ('240', '480', '720', '1080', ''): + video_url = canonical_url(content.get('video%shash' % q)) + if not video_url: + continue + format_id = (q + 'p') if q else 'Source' + try: + self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) + self._downloader._opener.open(video_url, timeout=5).close() + except timeout as e: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, format_id)) + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'height': int_or_none(q), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': canonical_url(info.get('snaphash')), + 'tags': content.get('tags') or metadata.get('tags'), + 'duration': info.get('duration'), + 'formats': formats, + 'timestamp': parse_iso8601(result.get('created')), + 'uploader_id': uploader_id, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f03f98a6c..4da477647 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -283,6 +283,7 @@ from .drtv import ( DRTVIE, DRTVLiveIE, ) +from .dtube import DTubeIE from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE From fe3a60f040f614d36e99f80ea1e3a8387d995fff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 11:24:44 +0100 Subject: [PATCH 026/187] [dreisat] improve extraction(closes #15350) - extract all formats - extract more format metadata extraction - improve format sorting - 
use hls native downloader - detect geo-restriction - bypass geo-restriction --- youtube_dl/extractor/dreisat.py | 141 +++++++++++++++----------------- 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index f138025d5..8d31258c1 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -8,7 +8,6 @@ from ..utils import ( unified_strdate, xpath_text, determine_ext, - qualities, float_or_none, ExtractorError, ) @@ -16,7 +15,8 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', @@ -43,7 +43,8 @@ class DreiSatIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): param_groups = {} for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) params = {} for param in param_group: params[param.get('name')] = param.get('value') @@ -54,7 +55,7 @@ class DreiSatIE(InfoExtractor): src = video.get('src') if not src: continue - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) group_id = video.get('paramGroup') param_group = param_groups[group_id] for proto in param_group['protocols'].split(','): @@ -75,66 
+76,36 @@ class DreiSatIE(InfoExtractor): note='Downloading video info', errnote='Failed to download video info') - status_code = doc.find('./status/statuscode') - if status_code is not None and status_code.text != 'ok': - code = status_code.text - if code == 'notVisibleAnymore': + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': message = 'Video %s is not available' % video_id else: - message = '%s returned error: %s' % (self.IE_NAME, code) + message = '%s returned error: %s' % (self.IE_NAME, status_code) raise ExtractorError(message, expected=True) - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + title = xpath_text(doc, './/information/title', 'title', True) - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - quality = qualities(['veryhigh', 'high', 'med', 'low']) - - def get_quality(elem): - return quality(xpath_text(elem, 'quality')) - format_nodes.sort(key=get_quality) - format_ids = [] + urls = [] formats = [] - for fnode in format_nodes: - video_url = fnode.find('url').text - is_available = 
'http://www.metafilegenerator' not in video_url - if not is_available: + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + format_id = fnode.attrib['basetype'] - quality = xpath_text(fnode, './quality', 'quality') format_m = re.match(r'''(?x) (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ (?P[^_]+)_(?P[^_]+)_(?P[^_]+) ''', format_id) ext = determine_ext(video_url, None) or format_m.group('container') - if ext not in ('smil', 'f4m', 'm3u8'): - format_id = format_id + '-' + quality - if format_id in format_ids: - continue if ext == 'meta': continue @@ -147,24 +118,23 @@ class DreiSatIE(InfoExtractor): if video_url.startswith('https://'): continue formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id=format_id, fatal=False)) else: - proto = format_m.group('proto').lower() + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + 
abr = tbr - vbr formats.append({ 'format_id': format_id, @@ -174,31 +144,50 @@ class DreiSatIE(InfoExtractor): 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), }) - format_ids.append(format_id) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self._sort_formats(formats) + thumbnails = [] + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), 'upload_date': upload_date, 'formats': formats, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + video_id = self._match_id(url) + details_url = 
'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id return self.extract_from_xml_url(video_id, details_url) From 997530d9d472285126bdfb642915062f286d38a8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 12:04:24 +0100 Subject: [PATCH 027/187] [dailymotion] remove fragment part from m3u8 urls(closes #8915) --- youtube_dl/extractor/dailymotion.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0e7d587dd..de27fffd4 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -180,9 +180,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue ext = mimetype2ext(type_) or determine_ext(media_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + f['url'] = f['url'].split('#')[0] + formats.append(f) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) From 54fc90aabfb71968f28af68dfe3f7a3544cc2f0b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 16:24:44 +0100 Subject: [PATCH 028/187] [youtube] fix hd720 format position --- youtube_dl/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7f4298c08..e4eec7c30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -37,6 +37,7 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, + qualities, remove_quotes, remove_start, smuggle_url, @@ -1844,6 +1845,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } + q = qualities(['small', 
'medium', 'hd720']) formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) @@ -1926,13 +1928,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) + quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + more_fields = { 'filesize': filesize, 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], + 'format_note': quality, + 'quality': q(quality), } for key, value in more_fields.items(): if value: From 6843ac5b1395157608324be71dc84803b3495857 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 17:49:35 +0100 Subject: [PATCH 029/187] add support for paramountnetwork.com and bellator.com(fixes #15418) --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/spike.py | 63 +++++++++++++----------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4da477647..48e3da9c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1016,7 +1016,10 @@ from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index a7b1b3b5f..e76522b45 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -1,55 +1,46 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor 
-class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' +class BellatorIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' _TESTS = [{ - 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', - 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', 'ext': 'mp4', - 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', - 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', - 'timestamp': 1388120400, - 'upload_date': '20131227', + 'title': 'Douglas Lima vs. Paul Daley - Round 1', + 'description': 'md5:805a8dd29310fd611d32baba2f767885', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { - 'url': 'http://www.spike.com/full-episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-209', - 'md5': 'b25c6f16418aefb9ad5a6cae2559321f', + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', 'info_dict': { 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', 'ext': 'mp4', 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. 
Jim Rash|Act 1', 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', }, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m/', - 'only_matching': True, - }, { - 'url': 'http://www.spike.com/video-clips/lhtu8m', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', - 'only_matching': True, - }, { - 'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', - 'only_matching': True, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] - _FEED_URL = 'http://www.spike.com/feeds/mrss/' - _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' - _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - - def _extract_mgid(self, webpage): - mgid = super(SpikeIE, self)._extract_mgid(webpage) - if mgid is None: - url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id') - video_type, episode_id = url_parts.split('/', 1) - mgid = 'mgid:arc:{0}:spike.com:{1}'.format(video_type, episode_id) - return mgid From eea2fafcf506336e37ca514f72757acf8ee004af Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 18:34:25 +0100 Subject: [PATCH 030/187] [pbs] fix embed data extraction(fixes #16474) --- youtube_dl/extractor/pbs.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f11d5da52..a28ee17ca 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -505,7 +505,7 @@ class PBSIE(InfoExtractor): if player: video_info = self._parse_json( self._search_regex( - r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], player, '%s video data' % page, default='{}'), display_id, transform_source=js_to_json, fatal=False) if video_info: @@ 
-513,10 +513,14 @@ class PBSIE(InfoExtractor): if not info: info = video_info if not chapters: - for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): - chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) - if not chapter: - continue + raw_chapters = video_info.get('chapters') or [] + if not raw_chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + raw_chapters.append(chapter) + for chapter in raw_chapters: start_time = float_or_none(chapter.get('start_time'), 1000) duration = float_or_none(chapter.get('duration'), 1000) if start_time is None or duration is None: From 58a68d8fdae5358273ee52d05d77fe42094e128e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 18:44:33 +0100 Subject: [PATCH 031/187] [moniker] Remove extractor(closes #15336) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/moniker.py | 116 ----------------------------- 2 files changed, 117 deletions(-) delete mode 100644 youtube_dl/extractor/moniker.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 48e3da9c4..24c23646c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -625,7 +625,6 @@ from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE -from .moniker import MonikerIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py deleted file mode 100644 index b208820fe..000000000 --- a/youtube_dl/extractor/moniker.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os.path -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - remove_start, - 
sanitized_Request, - urlencode_postdata, -) - - -class MonikerIE(InfoExtractor): - IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P[a-zA-Z0-9_-]+)' - - _TESTS = [{ - 'url': 'http://allmyvideos.net/jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'jih3nce3x6wn', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'http://vidspot.net/l2ngsmhs8ci5', - 'md5': '710883dee1bfc370ecf9fa6a89307c88', - 'info_dict': { - 'id': 'l2ngsmhs8ci5', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', - 'only_matching': True, - }, { - 'url': 'http://vidspot.net/2/v-ywDf99', - 'md5': '5f8254ce12df30479428b0152fb8e7ba', - 'info_dict': { - 'id': 'ywDf99', - 'ext': 'mp4', - 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', - 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', - }, - }, { - 'url': 'http://allmyvideos.net/v/v-HXZm5t', - 'only_matching': True, - }] - - def _real_extract(self, url): - orig_video_id = self._match_id(url) - video_id = remove_start(orig_video_id, 'embed-') - url = url.replace(orig_video_id, video_id) - assert re.match(self._VALID_URL, url) is not None - orig_webpage = self._download_webpage(url, video_id) - - if '>File Not Found<' in orig_webpage: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - error = self._search_regex( - r'class="err">([^<]+)<', orig_webpage, 'error', default=None) - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - builtin_url = self._search_regex( - r']+src=(["\'])(?P.+?/builtin-.+?)\1', - orig_webpage, 'builtin URL', default=None, 
group='url') - - if builtin_url: - req = sanitized_Request(builtin_url) - req.add_header('Referer', url) - webpage = self._download_webpage(req, video_id, 'Downloading builtin page') - title = self._og_search_title(orig_webpage).strip() - description = self._og_search_description(orig_webpage).strip() - else: - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = dict(fields) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - description = None - - # Could be several links with different quality - links = re.findall(r'"file" : "?(.+?)",', webpage) - # Assume the links are ordered in quality - formats = [{ - 'url': l, - 'quality': i, - } for i, l in enumerate(links)] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - } From 1306f5ed726b9f8778a5cc0586436b555f64c2ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 16 May 2018 19:11:48 +0100 Subject: [PATCH 032/187] [mychannels] add support for mychannels.com(closes #15334) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/minoto.py | 19 +++++++----------- .../{makerschannel.py => mychannels.py} | 20 +++++++++---------- 3 files changed, 18 insertions(+), 23 deletions(-) rename youtube_dl/extractor/{makerschannel.py => mychannels.py} (59%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 24c23646c..7d5927131 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -582,7 +582,6 @@ from .mailru import ( MailRuMusicIE, MailRuMusicSearchIE, ) -from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( MangomoloVideoIE, @@ 
-645,6 +644,7 @@ from .mtv import ( from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE +from .mychannels import MyChannelsIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import ( diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py index 959a10589..636731195 100644 --- a/youtube_dl/extractor/minoto.py +++ b/youtube_dl/extractor/minoto.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_codecs, +) class MinotoIE(InfoExtractor): @@ -26,7 +29,7 @@ class MinotoIE(InfoExtractor): formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) else: fmt_profile = fmt.get('profile') or {} - f = { + formats.append({ 'format_id': fmt_profile.get('name-short'), 'format_note': fmt_profile.get('name'), 'url': fmt_url, @@ -35,16 +38,8 @@ class MinotoIE(InfoExtractor): 'filesize': int_or_none(fmt.get('filesize')), 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), - } - codecs = fmt.get('codecs') - if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - f.update({ - 'vcodec': codecs[0], - 'acodec': codecs[1], - }) - formats.append(f) + 'codecs': parse_codecs(fmt.get('codecs')), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/mychannels.py similarity index 59% rename from youtube_dl/extractor/makerschannel.py rename to youtube_dl/extractor/mychannels.py index f5d00e61d..b1ffe7848 100644 --- a/youtube_dl/extractor/makerschannel.py +++ b/youtube_dl/extractor/mychannels.py @@ -6,17 +6,17 @@ import re from .common import InfoExtractor -class MakersChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?Pvideo|production)_id=(?P[0-9]+)' +class MyChannelsIE(InfoExtractor): + 
_VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?Pvideo|production)_id=(?P[0-9]+)' _TEST = { - 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', - 'md5': '624a512c6969236b5967bf9286345ad1', + 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', + 'md5': 'b8993daad4262dd68d89d651c0c52c45', 'info_dict': { - 'id': '849', + 'id': 'wUUDZZep6vQD', 'ext': 'mp4', - 'title': 'Landing a bus on a plane is an epic win', - 'uploader': 'ZoomIn', - 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', + 'title': 'Miss Holland joins VOTE LEAVE', + 'description': 'Miss Holland | #13 Not a potato', + 'uploader': 'Miss Holland', } } @@ -27,12 +27,12 @@ class MakersChannelIE(InfoExtractor): def extract_data_val(attr, fatal=False): return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') return { '_type': 'url_transparent', 'url': 'minoto:%s' % minoto_id, - 'id': extract_data_val('video-id', True), + 'id': url_id, 'title': extract_data_val('title', True), 'description': extract_data_val('description'), 'thumbnail': extract_data_val('image'), From a3f86160fa15f9e65789a73208cb50b0d82d715f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 17 May 2018 13:46:05 +0100 Subject: [PATCH 033/187] [pluralsight] fix clip id extraction(fixes #16460) --- youtube_dl/extractor/pluralsight.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index aacc5d4bb..3c508c9ca 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -140,10 +140,10 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to 
log in') - def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + def _get_subtitles(self, author, clip_idx, lang, name, duration, video_id): captions_post = { 'a': author, - 'cn': clip_id, + 'cn': clip_idx, 'lc': lang, 'm': name, } @@ -195,13 +195,13 @@ class PluralsightIE(PluralsightBaseIE): author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] - clip_id = qs.get('clip', [None])[0] + clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_id, course_name,)): + if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_id) + display_id = '%s-%s' % (name, clip_idx) course = self._download_course(course_name, url, display_id) @@ -217,7 +217,7 @@ class PluralsightIE(PluralsightBaseIE): clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_id: + if compat_str(clip_index) == clip_idx: clip = clip_ break @@ -225,6 +225,7 @@ class PluralsightIE(PluralsightBaseIE): raise ExtractorError('Unable to resolve clip') title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] QUALITIES = { 'low': {'width': 640, 'height': 480}, @@ -277,7 +278,7 @@ class PluralsightIE(PluralsightBaseIE): clip_post = { 'author': author, 'includeCaptions': False, - 'clipIndex': int(clip_id), + 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', 'moduleName': name, @@ -330,10 +331,10 @@ class PluralsightIE(PluralsightBaseIE): # TODO: other languages? 
subtitles = self.extract_subtitles( - author, clip_id, 'en', name, duration, display_id) + author, clip_idx, 'en', name, duration, display_id) return { - 'id': clip.get('clipName') or clip['name'], + 'id': clip_id, 'title': title, 'duration': duration, 'creator': author, From 361a965b5cd83b725560f740570d208c2a6886ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 May 2018 23:21:40 +0700 Subject: [PATCH 034/187] [vimeo:likes] Relax _VALID_URL and fix single page likes extraction (closes #16475) --- youtube_dl/extractor/vimeo.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a026526b2..8dfd8891c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -989,10 +989,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P[0-9]+)/likes/?(?:$|[?#]|sort:)' + _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P[^/]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' - _TEST = { + _TESTS = [{ 'url': 'https://vimeo.com/user755559/likes/', 'playlist_mincount': 293, 'info_dict': { @@ -1000,7 +1000,10 @@ class VimeoLikesIE(InfoExtractor): 'description': 'See all the videos urza likes', 'title': 'Videos urza likes', }, - } + }, { + 'url': 'https://vimeo.com/stormlapse/likes', + 'only_matching': True, + }] def _real_extract(self, url): user_id = self._match_id(url) @@ -1009,7 +1012,7 @@ class VimeoLikesIE(InfoExtractor): self._search_regex( r'''(?x)
  • .*?
  • \s* - ''', webpage, 'page count'), + ''', webpage, 'page count', default=1), 'page count', fatal=True) PAGE_SIZE = 12 title = self._html_search_regex( @@ -1017,7 +1020,7 @@ class VimeoLikesIE(InfoExtractor): description = self._html_search_meta('description', webpage) def _get_page(idx): - page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( + page_url = 'https://vimeo.com/%s/likes/page:%d/sort:date' % ( user_id, idx + 1) webpage = self._download_webpage( page_url, user_id, @@ -1037,7 +1040,7 @@ class VimeoLikesIE(InfoExtractor): return { '_type': 'playlist', - 'id': 'user%s_likes' % user_id, + 'id': '%s_likes' % user_id, 'title': title, 'description': description, 'entries': pl, From 58197205d32ae7164303b8ac37ad1d1191a91a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 May 2018 00:30:41 +0700 Subject: [PATCH 035/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ChangeLog b/ChangeLog index ef6cc3850..37dba892e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,34 @@ +version + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for 
embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + version 2018.05.09 Core From 7550ea501a94ed9060220cf4c8f696e514862c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 May 2018 00:32:51 +0700 Subject: [PATCH 036/187] release 2018.05.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 2 +- docs/supportedsites.md | 7 ++++--- youtube_dl/version.py | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b2bfa9ec5..7d9de5171 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.09 +[debug] youtube-dl version 2018.05.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 37dba892e..08233cd5b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.18 Extractors * [vimeo:likes] Relax URL regular expression and fix single page likes diff --git a/README.md b/README.md index d9fe2350a..20982b0f1 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the - options is not present) is used for the + option is not present) is used for the actual downloading. 
--geo-bypass Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 88fac6e90..c1048cc4c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -100,6 +100,7 @@ - **Beatport** - **Beeg** - **BehindKink** + - **Bellator** - **BellMedia** - **Bet** - **Bigflix** @@ -234,6 +235,7 @@ - **DrTuber** - **drtv** - **drtv:live** + - **DTube** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -448,7 +450,6 @@ - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakersChannel** - **MakerTV** - **mangomolo:live** - **mangomolo:video** @@ -486,7 +487,6 @@ - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** - - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** - **MotherlessGroup** @@ -508,6 +508,7 @@ - **mva:course**: Microsoft Virtual Academy courses - **Mwave** - **MwaveMeetGreet** + - **MyChannels** - **MySpace** - **MySpace:album** - **MySpass** @@ -618,6 +619,7 @@ - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV + - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE 
Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), 
Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) @@ -789,7 +791,6 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - - **Spike** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6f47b1795..a43eec860 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.09' +__version__ = '2018.05.18' From 0167f0dbfe792355e793ea82791d61fc1d05f1f9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 10:15:11 +0100 Subject: [PATCH 037/187] [imdb] improve extraction(fixes #4085)(fixes #14557) --- youtube_dl/extractor/imdb.py | 107 ++++++++++++++++------------------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 425421968..926c2c388 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -7,23 +7,23 @@ from ..compat import compat_str from ..utils import ( determine_ext, mimetype2ext, + parse_duration, qualities, - remove_end, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: 
Continental Drift Trailer (No. 2)', - 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', + 'title': 'No. 2 from Ice Age: Continental Drift (2012)', + 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -40,82 +40,67 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', 'only_matching': True, + }, { + 'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'description', fatal=False) - player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id - player_page = self._download_webpage( - player_url, video_id, 'Downloading player page') - # the player page contains the info for the default format, we have to - # fetch other pages for the rest of the formats - extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) - format_pages = [ - self._download_webpage( - f_url, video_id, 'Downloading info for %s format' % f_name) - for f_url, f_name in extra_formats] - format_pages.append(player_page) + webpage = self._download_webpage( + 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, + 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for format_page in format_pages: - json_data = self._search_regex( - r']+class="imdb-player-data"[^>]*?>(.*?)', - format_page, 
'json data', flags=re.DOTALL) - info = self._parse_json(json_data, video_id, fatal=False) - if not info: + for encoding in video_metadata.get('encodings', []): + if not encoding or not isinstance(encoding, dict): continue - format_info = info.get('videoPlayerObject', {}).get('video', {}) - if not format_info: + video_url = encoding.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): continue - video_info_list = format_info.get('videoInfoList') - if not video_info_list or not isinstance(video_info_list, list): + ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - for video_info in video_info_list: - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): - continue - if (video_info.get('videoMimeType') == 'application/x-mpegURL' or - determine_ext(video_url) == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + format_id = encoding.get('definition') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'quality': quality(format_id), + }) self._sort_formats(formats) return { 'id': video_id, - 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), + 'title': title, 'formats': formats, - 'description': descr, - 'thumbnail': format_info.get('slate'), + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('slate', {}).get('url'), + 'duration': 
parse_duration(video_metadata.get('duration')), } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d+)(?!/videoplayer/vi\d+)' _TEST = { - 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', + 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { - 'id': 'JFs9NWw6XI0', - 'title': 'March 23, 2012 Releases', + 'id': '009921623', + 'title': 'The Bourne Legacy', + 'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.', }, - 'playlist_count': 7, + 'playlist_count': 8, } def _real_extract(self, url): @@ -123,9 +108,13 @@ class ImdbListIE(InfoExtractor): webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] + for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)] list_title = self._html_search_regex( - r'

    (.*?)

    ', webpage, 'list title') + r']+class="[^"]*header[^"]*"[^>]*>(.*?)', + webpage, 'list title') + list_description = self._html_search_regex( + r']+class="[^"]*list-description[^"]*"[^>]*>

    (.*?)

    ', + webpage, 'list description') - return self.playlist_result(entries, list_id, list_title) + return self.playlist_result(entries, list_id, list_title, list_description) From 27694fe7ad77d5f99d7b46fa7395f4ccbb378777 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 11:04:08 +0100 Subject: [PATCH 038/187] [imdb:list] fix _VALID_URL regex --- youtube_dl/extractor/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 926c2c388..4bafa54a2 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -92,7 +92,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d+)(?!/videoplayer/vi\d+)' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P\d{9})(?!/videoplayer/vi\d+)' _TEST = { 'url': 'https://www.imdb.com/list/ls009921623/', 'info_dict': { From acd620c930a92511c2e2099a4fc82d41825fdf93 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 12:19:05 +0100 Subject: [PATCH 039/187] [teamcoco] improve _VALID_URL regex(#16484) --- youtube_dl/extractor/teamcoco.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f06e5b19a..64235b0f6 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..utils import ( class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P[^/?#]+)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { 'url': 'http://teamcoco.com/video/mary-kay-remote', @@ -67,6 +67,9 @@ class TeamcocoIE(InfoExtractor): 'skip_download': True, # m3u8 downloads }, 'skip': 'This video is no longer available.', + }, { + 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', + 
'only_matching': True, } ] From f2b1fa07ec063ca63373e8558223e7af544f2cf8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 May 2018 13:05:51 +0100 Subject: [PATCH 040/187] [teamcoco] relax _VALID_URL regex and add a fallback for format extraction(fixes #16484) --- youtube_dl/extractor/teamcoco.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 64235b0f6..63fd4fe1c 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..utils import ( class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P([^/]+/)*[^/?#]+)' + _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { 'url': 'http://teamcoco.com/video/mary-kay-remote', @@ -70,6 +70,15 @@ class TeamcocoIE(InfoExtractor): }, { 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson', + 'only_matching': True, + }, { + 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', + 'only_matching': True, } ] @@ -84,7 +93,7 @@ class TeamcocoIE(InfoExtractor): display_id = self._match_id(url) response = self._graphql_call('''{ - %s(slug: "video/%s") { + %s(slug: "%s") { ... 
on RecordSlug { record { id @@ -94,6 +103,9 @@ class TeamcocoIE(InfoExtractor): thumb { preview } + file { + url + } tags { name } @@ -111,15 +123,15 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - srcs = self._graphql_call('''{ + video_sources = self._graphql_call('''{ %s(id: "%s") { src } -}''', 'RecordVideoSource', video_id)['src'] +}''', 'RecordVideoSource', video_id) or {} formats = [] get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in srcs.items(): + for format_id, src in video_sources.get('src', {}).items(): if not isinstance(src, dict): continue src_url = src.get('src') @@ -146,6 +158,9 @@ class TeamcocoIE(InfoExtractor): 'format_id': format_id, 'quality': get_quality(format_id), }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) self._sort_formats(formats) return { From 504f20dd302189db2cfe1cb5ee9a622c39ee693c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 May 2018 23:53:24 +0700 Subject: [PATCH 041/187] Remove experimental mark for some options --- youtube_dl/YoutubeDL.py | 10 +++++----- youtube_dl/downloader/common.py | 1 - youtube_dl/extractor/common.py | 9 +++------ youtube_dl/options.py | 12 ++++++------ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 046e03247..2a405c5ca 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -211,7 +211,7 @@ class YoutubeDL(object): At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. (Experimental) + on geo-restricted sites. 
socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -259,7 +259,7 @@ class YoutubeDL(object): - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) - source_address: (Experimental) Client-side IP address to bind to. + source_address: Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download when @@ -281,14 +281,14 @@ class YoutubeDL(object): match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For - HTTP header (experimental) + HTTP header geo_bypass_country: Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to - geo_bypass_country (experimental) + geo_bypass_country The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index edd125ee2..5979833c0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,7 +45,6 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. 
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3ef5af13c..a2548dba3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -339,20 +339,17 @@ class InfoExtractor(object): _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. (experimental) + country code provided with geo_bypass_country. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted countries for this extractor. One of these countries will be used by geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. (experimental) + geo restriction, of course, if the mechanism is not disabled. _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted IP blocks in CIDR notation for this extractor. One of these IP blocks will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. (experimental) - - NB: both these geo attributes are experimental and may change in future - or be completely removed. + to _GEO_COUNTRIES. Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index b692c6b3b..e83d546a0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -203,7 +203,7 @@ def parseOpts(overrideArguments=None): network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', - help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental ' + help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' 'SOCKS proxy, specify a proper scheme. For example ' 'socks5://127.0.0.1:1080/. 
Pass in an empty string (--proxy "") ' 'for direct connection') @@ -240,19 +240,19 @@ def parseOpts(overrideArguments=None): geo.add_option( '--geo-bypass', action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--no-geo-bypass', action='store_false', dest='geo_bypass', default=True, - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') geo.add_option( '--geo-bypass-country', metavar='CODE', dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') geo.add_option( '--geo-bypass-ip-block', metavar='IP_BLOCK', dest='geo_bypass_ip_block', default=None, - help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)') + help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -502,7 +502,7 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected file size (experimental)') + help='Set file xattribute ytdl.filesize with expected file size') downloader.add_option( '--hls-prefer-native', dest='hls_prefer_native', action='store_true', default=None, From 5c766952dc6de9065060344342184c4037403409 Mon Sep 17 00:00:00 2001 From: huichen90 <35417991+huichen90@users.noreply.github.com> Date: Wed, 16 May 2018 17:29:25 +0800 Subject: [PATCH 042/187] Update 
leeco.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed this bug :youtube_dl.utils.ExtractorError: An extractor error has occurred. (caused by KeyError('location',)); --- youtube_dl/extractor/leeco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index ffe10154b..8dd1ce0d0 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -130,7 +130,7 @@ class LeIE(InfoExtractor): media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, - 'splatid': 101, + 'splatid': 105, 'format': 1, 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), From db2058f63e64ff59ffad0e1e8ad5e18d18d3da71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 14:53:02 +0100 Subject: [PATCH 043/187] [globo] improve extraction(closes #4189) - add support for authentication - simplify url signing - extract DASH and MSS formats --- youtube_dl/extractor/globo.py | 321 +++++----------------------------- 1 file changed, 41 insertions(+), 280 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc7b2661c..730deda6b 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -1,16 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import hashlib +import json import random import re -import math from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_chr, - compat_ord, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -22,12 +20,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' - - _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' - _SECURITY_URL_TEMPLATE = 
'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - - _RESIGN_EXPIRATION = 86400 - + _LOGGED_IN = False _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', @@ -70,287 +63,49 @@ class GloboIE(InfoExtractor): 'only_matching': True, }] - class MD5(object): - HEX_FORMAT_LOWERCASE = 0 - HEX_FORMAT_UPPERCASE = 1 - BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' - BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' - PADDING = '=0xFF01DD' - hexcase = 0 - b64pad = '' + def _real_initialize(self): + if self._LOGGED_IN: + return - def __init__(self): - pass + email, password = self._get_login_info() + if email is None: + return - class JSArray(list): - def __getitem__(self, y): - try: - return list.__getitem__(self, y) - except IndexError: - return 0 - - def __setitem__(self, i, y): - try: - return list.__setitem__(self, i, y) - except IndexError: - self.extend([0] * (i - len(self) + 1)) - self[-1] = y - - @classmethod - def hex_md5(cls, param1): - return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) - - @classmethod - def b64_md5(cls, param1, param2=None): - return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) - - @classmethod - def any_md5(cls, param1, param2): - return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) - - @classmethod - def rstr_md5(cls, param1): - return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) - - @classmethod - def rstr2hex(cls, param1): - _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' - _loc_3 = '' - for _loc_5 in range(0, len(param1)): - _loc_4 = compat_ord(param1[_loc_5]) - _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] - return _loc_3 - - @classmethod - def rstr2b64(cls, param1): - _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - _loc_3 = '' - _loc_4 = 
len(param1) - for _loc_5 in range(0, _loc_4, 3): - _loc_6_1 = compat_ord(param1[_loc_5]) << 16 - _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 - _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 - _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 - for _loc_7 in range(0, 4): - if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: - _loc_3 += cls.b64pad - else: - _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] - return _loc_3 - - @staticmethod - def rstr2any(param1, param2): - _loc_3 = len(param2) - _loc_4 = [] - _loc_9 = [0] * ((len(param1) >> 2) + 1) - for _loc_5 in range(0, len(_loc_9)): - _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) - - while len(_loc_9) > 0: - _loc_8 = [] - _loc_7 = 0 - for _loc_5 in range(0, len(_loc_9)): - _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] - _loc_6 = math.floor(_loc_7 / _loc_3) - _loc_7 -= _loc_6 * _loc_3 - if len(_loc_8) > 0 or _loc_6 > 0: - _loc_8[len(_loc_8)] = _loc_6 - - _loc_4[len(_loc_4)] = _loc_7 - _loc_9 = _loc_8 - - _loc_10 = '' - _loc_5 = len(_loc_4) - 1 - while _loc_5 >= 0: - _loc_10 += param2[_loc_4[_loc_5]] - _loc_5 -= 1 - - return _loc_10 - - @classmethod - def str2rstr_utf8(cls, param1, param2=None): - _loc_3 = '' - _loc_4 = -1 - if not param2: - param2 = cls.PADDING - param1 = param1 + param2[1:9] - while True: - _loc_4 += 1 - if _loc_4 >= len(param1): - break - _loc_5 = compat_ord(param1[_loc_4]) - _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 - if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: - _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) - _loc_4 += 1 - if _loc_5 <= 127: - _loc_3 += compat_chr(_loc_5) - continue - if _loc_5 <= 2047: - _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) - continue - if _loc_5 <= 65535: - _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( - 128 | _loc_5 & 63) - continue - if _loc_5 <= 
2097151: - _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( - 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) - return _loc_3 - - @staticmethod - def rstr2binl(param1): - _loc_2 = [0] * ((len(param1) >> 2) + 1) - for _loc_3 in range(0, len(_loc_2)): - _loc_2[_loc_3] = 0 - for _loc_3 in range(0, len(param1) * 8, 8): - _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 - return _loc_2 - - @staticmethod - def binl2rstr(param1): - _loc_2 = '' - for _loc_3 in range(0, len(param1) * 32, 8): - _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) - return _loc_2 - - @classmethod - def binl_md5(cls, param1, param2): - param1 = cls.JSArray(param1) - param1[param2 >> 5] |= 128 << param2 % 32 - param1[(param2 + 64 >> 9 << 4) + 14] = param2 - _loc_3 = 1732584193 - _loc_4 = -271733879 - _loc_5 = -1732584194 - _loc_6 = 271733878 - for _loc_7 in range(0, len(param1), 16): - _loc_8 = _loc_3 - _loc_9 = _loc_4 - _loc_10 = _loc_5 - _loc_11 = _loc_6 - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, 
param1[_loc_7 + 10], 17, -42063) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) - _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) - _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) - _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) - _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) - _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) - _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) - _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) - _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 
4, -378558) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) - _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) - _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) - _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) - _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) - 
_loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) - _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) - _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) - _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) - _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) - _loc_3 = cls.safe_add(_loc_3, _loc_8) - _loc_4 = cls.safe_add(_loc_4, _loc_9) - _loc_5 = cls.safe_add(_loc_5, _loc_10) - _loc_6 = cls.safe_add(_loc_6, _loc_11) - return [_loc_3, _loc_4, _loc_5, _loc_6] - - @classmethod - def md5_cmn(cls, param1, param2, param3, param4, param5, param6): - return cls.safe_add( - cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) - - @classmethod - def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) - - @classmethod - def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): - return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) - - @classmethod - def 
safe_add(cls, param1, param2): - _loc_3 = (param1 & 65535) + (param2 & 65535) - _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) - return cls.lshift(_loc_4, 16) | _loc_3 & 65535 - - @classmethod - def bit_rol(cls, param1, param2): - return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) - - @staticmethod - def lshift(value, count): - r = (0xFFFFFFFF & value) << count - return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + self._LOGGED_IN = True def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + 'http://api.globovideos.com/videos/%s/playlist' % video_id, + video_id)['videos'][0] title = video['title'] formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id or resource_id.endswith('manifest'): + resource_url = resource.get('url') + if not resource_id or not resource_url: continue security = self._download_json( - self._SECURITY_URL_TEMPLATE % (video_id, resource_id), - video_id, 'Downloading security hash for %s' % resource_id) + 'http://security.video.globo.com/videos/%s/hash' % video_id, + video_id, 'Downloading security hash for %s' % resource_id, query={ + 'player': 'flash', + 'version': '17.0.0.132', + 'resource_id': resource_id, + }) security_hash = security.get('hash') if not security_hash: @@ -365,18 +120,24 @@ class GloboIE(InfoExtractor): received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + self._RESIGN_EXPIRATION + sign_time = received_time + 86400 padding = '%010d' % random.randint(1, 10000000000) - signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + 
padding) + md5_data = (received_md5 + str(sign_time) + padding + '0xFF01DD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 - resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats( + signed_url, resource_id, mpd_id='dash', fatal=False)) + elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): + formats.extend(self._extract_ism_formats( + signed_url, resource_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': signed_url, From e5187493002f1d089d450fc3b2b4af64c996dc71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 15:07:24 +0100 Subject: [PATCH 044/187] [globo] handle login errors --- youtube_dl/extractor/globo.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 730deda6b..9c2360464 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -8,7 +8,10 @@ import random import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, float_or_none, @@ -71,16 +74,22 @@ class GloboIE(InfoExtractor): if email is None: return - self._download_json( - 'https://login.globo.com/api/authentication', None, data=json.dumps({ - 'payload': { - 'email': email, - 'password': password, - 'serviceId': 4654, - }, - }).encode(), headers={ - 'Content-Type': 
'application/json; charset=utf-8', - }) + try: + self._download_json( + 'https://login.globo.com/api/authentication', None, data=json.dumps({ + 'payload': { + 'email': email, + 'password': password, + 'serviceId': 4654, + }, + }).encode(), headers={ + 'Content-Type': 'application/json; charset=utf-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read(), None) + raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) + raise self._LOGGED_IN = True def _real_extract(self, url): From d81ffc3aa0f7b4114cec68cac9e347689a6d5462 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 15:39:02 +0100 Subject: [PATCH 045/187] [globo] Add entry for netrc authentication --- youtube_dl/extractor/globo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 9c2360464..8e6c38742 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -24,6 +24,7 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' _LOGGED_IN = False + _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', From b89ac534555692c3c29a57d97ec0bda3bef3b086 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 17:46:52 +0100 Subject: [PATCH 046/187] [globo] use compat_str --- youtube_dl/extractor/globo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8e6c38742..81d6d36d3 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -126,16 +126,16 @@ class GloboIE(InfoExtractor): continue hash_code = security_hash[:2] - 
received_time = int(security_hash[2:12]) + received_time = security_hash[2:12] received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + 86400 + sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - md5_data = (received_md5 + str(sign_time) + padding + '0xFF01DD').encode() + md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): From 57d6792024f2670a21f923dfbd81614a1ee6b735 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 11:27:36 +0100 Subject: [PATCH 047/187] [viewlift] fix extraction for snagfils.com(closes #15766) --- youtube_dl/extractor/viewlift.py | 164 ++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 1f29c273f..e466156f6 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,24 +1,27 @@ from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, clean_html, determine_ext, int_or_none, js_to_json, + parse_age_limit, parse_duration, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' 
class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -60,8 +63,10 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): formats = [] has_bitrate = False - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: file_ = source.get('file') if not file_: continue @@ -70,7 +75,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): format_id = source.get('label') or ext if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], @@ -85,6 +91,13 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'tbr': bitrate, 'height': height, }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') self._sort_formats(formats, field_preference) @@ -109,10 +122,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'display_id': 'lost_for_life', 'ext': 'mp4', 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'description': 
'md5:ea10b5a50405ae1f7b5269a6ec594102', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -125,7 +141,9 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', } }, { # Film is not playable in your area. @@ -138,9 +156,6 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.winnersview.com/videos/the-good-son', 'only_matching': True, - }, { - 'url': 'http://www.kesari.tv/news/video/1461919076414', - 'only_matching': True, }, { # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', @@ -156,45 +171,96 @@ class ViewLiftIE(ViewLiftBaseIE): raise ExtractorError( 'Film %s is not available.' 
% display_id, expected=True) - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 
'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)
    (.+?)
    ', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
    (.+?)
    ', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } From b836118724122a639a1cb78d55d91724bf1e7251 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:12:20 +0100 Subject: [PATCH 048/187] [utils] Relax TV Parental Guidelines matching --- youtube_dl/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f9ca63c58..d61af8837 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, + 'Y': 0, + 'Y7': 7, + 'G': 0, + 'PG': 0, + '14': 14, + 'MA': 17, } @@ -2272,7 +2272,10 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - return TV_PARENTAL_GUIDELINES.get(s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + if m: + return TV_PARENTAL_GUIDELINES[m.group(1)] + return None def strip_jsonp(code): From 670dcba8c73ee69545513522676b2c480bc48662 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:13:44 +0100 Subject: [PATCH 049/187] [viewlift] Remove rating format transformation --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index e466156f6..51a002b11 100644 --- 
a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -214,7 +214,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': gist.get('description'), 'thumbnail': gist.get('videoImageUrl'), 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } From 268e132dec96ea9e8a9a3cafb788baf39a498c7d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:15:21 +0100 Subject: [PATCH 050/187] [go90] extract age limit and detect drm protection(#10127) --- youtube_dl/extractor/go90.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 9b2e1c164..35dde42d0 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, ) @@ -23,6 +25,7 @@ class Go90IE(InfoExtractor): 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. 
Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', + 'age_limit': 14, } } @@ -33,6 +36,8 @@ class Go90IE(InfoExtractor): video_id, headers={ 'Content-Type': 'application/json; charset=utf-8', }, data=b'{"client":"web","device_type":"pc"}') + if video_data.get('requires_drm'): + raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] episode_number = int_or_none(video_data.get('episode_number')) @@ -123,4 +128,5 @@ class Go90IE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'subtitles': subtitles, + 'age_limit': parse_age_limit(video_data.get('rating')), } From 3bb3ff38a15ccf00686c75af8d6635903632ee87 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:20:05 +0100 Subject: [PATCH 051/187] [test_utils] add tests for b836118724122a639a1cb78d55d91724bf1e7251 --- test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..f2b51131c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -519,6 +519,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_age_limit('PG-13'), 13) self.assertEqual(parse_age_limit('TV-14'), 14) self.assertEqual(parse_age_limit('TV-MA'), 17) + self.assertEqual(parse_age_limit('TV14'), 14) + self.assertEqual(parse_age_limit('TV_G'), 0) def test_parse_duration(self): self.assertEqual(parse_duration(None), None) From ca0aef42d4fa77123c56c19ef3fe2673645391a2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 23:04:12 +0100 Subject: [PATCH 052/187] [viewlift] add support for hoichoi.tv(closes #16536) --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 51a002b11..c43d1a1e8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -17,7 +17,7 @@ 
from ..utils import ( class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): From 1139935db78b610d15ade2b667e2a07b4df0ecf0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 02:51:47 +0100 Subject: [PATCH 053/187] [nbc] add support for stream.nbcsports.com(closes #13911) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nbc.py | 62 +++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7d5927131..52e330955 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -666,6 +666,7 @@ from .nbc import ( NBCOlympicsIE, NBCOlympicsStreamIE, NBCSportsIE, + NBCSportsStreamIE, NBCSportsVPlayerIE, ) from .ndr import ( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1b1722cfa..c843f8649 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,7 +1,8 @@ from __future__ import unicode_literals -import re import base64 +import json +import re from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -175,6 +176,65 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') +class NBCSportsStreamIE(AdobePassIE): + _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P\d+)' + _TEST = { + 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', + 'info_dict': { + 'id': '206559', + 'ext': 'mp4', + 'title': 'Amgen Tour of California Women\'s Recap', + 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires Adobe Pass 
Authentication', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + live_source = self._download_json( + 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + video_id) + video_source = live_source['videoSources'][0] + title = video_source['title'] + source_url = None + for k in ('source', 'msl4source', 'iossource', 'hlsv4'): + sk = k + 'Url' + source_url = video_source.get(sk) or video_source.get(sk + 'Alt') + if source_url: + break + else: + source_url = video_source['ottStreamUrl'] + is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' + resource = self._get_mvpd_resource('nbcsports', title, video_id, '') + token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) + tokenized_url = self._download_json( + 'https://token.playmakerservices.com/cdn', + video_id, data=json.dumps({ + 'requestorId': 'nbcsports', + 'pid': video_id, + 'application': 'NBCSports', + 'version': 'v1', + 'platform': 'desktop', + 'cdn': 'akamai', + 'url': video_source['sourceUrl'], + 'token': base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + class CSNNEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' From e8e58c22786918f93e6928d86b878fdc56461c4d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 11:53:42 +0100 Subject: [PATCH 054/187] [hidive] add support for authentication(closes #16534) --- youtube_dl/extractor/hidive.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index eee517071..d8f2e682f 
100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -17,6 +17,9 @@ class HiDiveIE(InfoExtractor): # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, # so disabling geo bypass completely _GEO_BYPASS = False + _NETRC_MACHINE = 'hidive' + _LOGGED_IN = False + _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', @@ -31,8 +34,30 @@ class HiDiveIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires Authentication', }] + def _real_initialize(self): + if self._LOGGED_IN: + return + + (email, password) = self._get_login_info() + if email is None: + return + + webpage = self._download_webpage(self._LOGIN_URL, None) + form = self._search_regex( + r'(?s)]+action="/account/login"[^>]*>(.+?)', + webpage, 'login form') + data = self._hidden_inputs(form) + data.update({ + 'Email': email, + 'Password': password, + }) + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + self._LOGGED_IN = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title, key = mobj.group('title', 'key') @@ -43,6 +68,7 @@ class HiDiveIE(InfoExtractor): data=urlencode_postdata({ 'Title': title, 'Key': key, + 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', })) restriction = settings.get('restrictionReason') @@ -79,6 +105,7 @@ class HiDiveIE(InfoExtractor): subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, }) + self._sort_formats(formats) season_number = int_or_none(self._search_regex( r's(\d+)', key, 'season number', default=None)) From 3d2a643fdcba126b209b758f2e403742ee631cf3 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Thu, 24 May 2018 11:15:03 +0200 Subject: [PATCH 055/187] [imgur] Fix extraction --- youtube_dl/extractor/imgur.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/imgur.py 
b/youtube_dl/extractor/imgur.py index 67c24a51c..2901960a5 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -21,7 +20,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,7 +28,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', @@ -37,8 +36,6 @@ class ImgurIE(InfoExtractor): 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - 'description': 'Imgur: The most awesome images on the Internet.' 
- } }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', @@ -50,8 +47,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - compat_urlparse.urljoin(url, video_id), video_id) + gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) + webpage = self._download_webpage(gifv_url, video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -107,7 +104,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } From c561b75c82247188e010b6b53c118bb26b4daaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:09:15 +0700 Subject: [PATCH 056/187] [peertube] Add extractor (closes #16301, closes #16329) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/peertube.py | 210 +++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 youtube_dl/extractor/peertube.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 52e330955..374aa185c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -811,6 +811,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py new file mode 100644 index 000000000..b086f6f5a --- /dev/null +++ b/youtube_dl/extractor/peertube.py @@ -0,0 +1,210 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + 
int_or_none, + parse_resolution, + try_get, + unified_timestamp, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + # Taken from https://instances.joinpeertube.org/instances + tube\.openalgeria\.org| + peertube\.pointsecu\.fr| + peertube\.nogafa\.org| + peertube\.pl| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.inapurna\.org| + peertube\.netzspielplatz\.de| + video\.deadsuperhero\.com| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.worldofhauru\.xyz| + tube\.bootlicker\.party| + skeptikon\.fr| + peertube\.geekshell\.fr| + tube\.opportunis\.me| + peertube\.peshane\.net| + video\.blueline\.mg| + tube\.homecomputing\.fr| + videos\.cloudfrancois\.fr| + peertube\.viviers-fibre\.net| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + peertube\.extremely\.online| + peertube\.public-infrastructure\.eu| + tube\.kher\.nl| + peertube\.qtg\.fr| + tube\.22decembre\.eu| + facegirl\.me| + video\.migennes\.net| + janny\.moe| + tube\.p2p\.legal| + video\.atlanti\.se| + troll\.tv| + peertube\.geekael\.fr| + vid\.leotindall\.com| + video\.anormallostpod\.ovh| + p-tube\.h3z\.jp| + tube\.darfweb\.eu| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.symphonie-of-code\.fr| + testtube\.ortg\.de| + videos\.cemea\.org| + peertube\.gwendalavir\.eu| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + peertube\.duckdns\.org| + sikke\.fi| + peertube\.mastodon\.host| + firedragonvideos\.com| + vidz\.dou\.bet| + peertube\.koehn\.com| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + medias\.libox\.fr| + peertube\.moe| + peertube\.xyz| + jp\.peertube\.network| + videos\.benpro\.fr| + tube\.otter\.sh| + peertube\.angristan\.xyz| + peertube\.parleur\.net| + peer\.ecutsa\.fr| + peertube\.heraut\.eu| + peertube\.tifox\.fr| + peertube\.maly\.io| + vod\.mochi\.academy| + exode\.me| + 
coste\.video| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + meilleurtube\.delire\.party| + tube\.mochi\.academy| + peertube\.dav\.li| + media\.zat\.im| + pytu\.be| + peertube\.valvin\.fr| + peertube\.nsa\.ovh| + video\.colibris-outilslibres\.org| + video\.hispagatos\.org| + tube\.svnet\.fr| + peertube\.video| + videos\.lecygnenoir\.info| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + ) + /videos/watch/(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = file_.get('fileUrl') + if not file_url or not isinstance(file_url, compat_str): + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, 
lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } From f2fc63a5a873391b9ac15642507a2eae71e42906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:15:38 +0700 Subject: [PATCH 057/187] [peertube] Add support for embed and API URLs --- youtube_dl/extractor/peertube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index b086f6f5a..61c41add0 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,7 +116,8 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re ) - /videos/watch/(?P[^/?#&]+) + /(?:videos/(?:watch|embed)|api/v\d/videos)/ + (?P[^/?#&]+) ''' _TESTS = [{ 
'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', @@ -147,6 +148,12 @@ class PeerTubeIE(InfoExtractor): # nsfw 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, }] def _real_extract(self, url): From 6bd499e8ca769cf69c4b24fa2d7a751d7869b679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:28:30 +0700 Subject: [PATCH 058/187] [peertube] Add support for generic embeds --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ youtube_dl/extractor/peertube.py | 23 +++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76852f9dc..47ac139c9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -108,6 +108,7 @@ from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE class GenericIE(InfoExtractor): @@ -2012,6 +2013,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 'home', + 'title': 'Reprenez le contrôle de vos vidéos ! 
#JoinPeertube', + }, + 'playlist_count': 2, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3029,6 +3039,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + peertube_urls = PeerTubeIE._extract_urls(webpage) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 61c41add0..a481b3151 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -13,9 +15,7 @@ from ..utils import ( class PeerTubeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: + _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances tube\.openalgeria\.org| peertube\.pointsecu\.fr| @@ -115,10 +115,13 @@ class PeerTubeIE(InfoExtractor): peertube2\.cpy\.re| videos\.tcit\.fr| peertube\.cpy\.re - ) + )''' + _VALID_URL = r'''(?x) + https?:// + %s /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P[^/?#&]+) - ''' + (?P[^/?\#&]+) + ''' % _INSTANCES_RE _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -156,6 +159,14 @@ class PeerTubeIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'''(?x)]+\bsrc=(["\'])(?P(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' + % PeerTubeIE._INSTANCES_RE, webpage)] + def 
_real_extract(self, url): video_id = self._match_id(url) From b39f42ee92a3cd669da24db9798e1dc9b574720f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A1s=20Veres-Szentkir=C3=A1lyi?= Date: Fri, 25 May 2018 19:46:05 +0200 Subject: [PATCH 059/187] [indavideo] Sign download URLs --- youtube_dl/extractor/indavideo.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 11cf3c609..15b766fb2 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -2,10 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + update_url_query, ) @@ -58,11 +60,10 @@ class IndavideoEmbedIE(InfoExtractor): if flv_url not in video_urls: video_urls.append(flv_url) - formats = [{ - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)), - } for video_url in video_urls] + filesh = video.get('filesh') + formats = [ + self.video_url_to_format(video_url, filesh) + for video_url in video_urls] self._sort_formats(formats) timestamp = video.get('date') @@ -90,6 +91,18 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } + def video_url_to_format(self, video_url, filesh): + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if height and filesh: + token = filesh.get(compat_str(height)) + if token is not None: + video_url = update_url_query(video_url, {'token': token}) + return { + 'url': video_url, + 'height': height, + } + class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' From 2a7c6befc16f72df5368cb4adccd1cd84fd432d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:09:44 +0700 Subject: [PATCH 
060/187] [indavideo] Fix extraction (closes #11221) --- youtube_dl/extractor/indavideo.py | 48 +++++++++++++++++++------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 15b766fb2..2946c7b84 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -15,7 +15,7 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P[\da-f]+)' _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', - 'md5': 'f79b009c66194acacd40712a6778acfa', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', 'ext': 'mp4', @@ -47,7 +47,14 @@ class IndavideoEmbedIE(InfoExtractor): title = video['title'] - video_urls = video.get('video_files', []) + video_urls = [] + + video_files = video.get('video_files') + if isinstance(video_files, list): + video_urls.extend(video_files) + elif isinstance(video_files, dict): + video_urls.extend(video_files.values()) + video_file = video.get('video_file') if video: video_urls.append(video_file) @@ -61,9 +68,22 @@ class IndavideoEmbedIE(InfoExtractor): video_urls.append(flv_url) filesh = video.get('filesh') - formats = [ - self.video_url_to_format(video_url, filesh) - for video_url in video_urls] + + formats = [] + for video_url in video_urls: + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if filesh: + if not height: + continue + token = filesh.get(compat_str(height)) + if token is None: + continue + video_url = update_url_query(video_url, {'token': token}) + formats.append({ + 'url': video_url, + 'height': height, + }) self._sort_formats(formats) timestamp = video.get('date') @@ -91,18 +111,6 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } - def video_url_to_format(self, video_url, filesh): - height = 
int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if height and filesh: - token = filesh.get(compat_str(height)) - if token is not None: - video_url = update_url_query(video_url, {'token': token}) - return { - 'url': video_url, - 'height': height, - } - class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' @@ -122,7 +130,7 @@ class IndavideoIE(InfoExtractor): 'upload_date': '20140127', 'duration': 7, 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], + 'tags': list, }, }, { 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', @@ -146,7 +154,9 @@ class IndavideoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) embed_url = self._search_regex( - r']+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') + (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', + r']+rel="video_src"[^>]+href="(?P.+?)"'), + webpage, 'embed url', group='url') return { '_type': 'url_transparent', From aee36ca832ec3a5696c40707098d97be0353e997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:25:40 +0700 Subject: [PATCH 061/187] [indavideo] Add support for generic embeds (closes #11989) --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 24 ++++++++++ youtube_dl/extractor/indavideo.py | 70 +++++++----------------------- 3 files changed, 41 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 374aa185c..c9b49a0cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -469,10 +469,7 @@ from .imgur import ( ) from .ina import InaIE from .inc import IncIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) +from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import 
InstagramIE, InstagramUserIE from .internazionale import InternazionaleIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 47ac139c9..0292e0458 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,7 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .indavideo import IndavideoEmbedIE class GenericIE(InfoExtractor): @@ -2022,6 +2023,24 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 2, }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3044,6 +3063,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 2946c7b84..2b5b2b5b0 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from 
..compat import compat_str from ..utils import ( @@ -38,6 +40,20 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\'](?P(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) @@ -110,57 +126,3 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': tags, 'formats': formats, } - - -class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' - _TESTS = [{ - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'display_id': 'Vicces_cica_1', - 'ext': 'mp4', - 'title': 'Vicces cica', - 'description': 'Játszik a tablettel. 
:D', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'timestamp': 1390821212, - 'upload_date': '20140127', - 'duration': 7, - 'age_limit': 0, - 'tags': list, - }, - }, { - 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', - 'only_matching': True, - }, { - 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', - 'only_matching': True, - }, { - 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', - 'only_matching': True, - }, { - 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', - 'only_matching': True, - }, { - 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - embed_url = self._search_regex( - (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', - r']+rel="video_src"[^>]+href="(?P.+?)"'), - webpage, 'embed url', group='url') - - return { - '_type': 'url_transparent', - 'ie_key': 'IndavideoEmbed', - 'url': embed_url, - 'display_id': display_id, - } From f4d261b765a17ef2beccec78680ec693c7df014c Mon Sep 17 00:00:00 2001 From: Enes Date: Tue, 24 Apr 2018 22:48:40 +0300 Subject: [PATCH 062/187] [izlesene] Fix extraction (closes #16233) --- youtube_dl/extractor/izlesene.py | 33 ++++++++++---------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index b1d72177d..5b2095490 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( @@ -72,7 +70,7 @@ class IzleseneIE(InfoExtractor): 'uploadDate', webpage, 'upload date')) duration = float_or_none(self._html_search_regex( 
- r'"videoduration"\s*:\s*"([^"]+)"', + r'videoduration\s*=\s*\'([^\']+)\'', webpage, 'duration', fatal=False), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) @@ -80,29 +78,18 @@ class IzleseneIE(InfoExtractor): r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - content_url = self._html_search_meta( - 'contentURL', webpage, 'content URL', fatal=False) - ext = determine_ext(content_url, 'mp4') - - # Might be empty for some videos. - streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') + streams_json = self._html_search_regex( + r'_videoObj\s*=\s*(.+);', webpage, 'streams') + streams = self._parse_json(streams_json, video_id) formats = [] - if streams: - for stream in streams.split('|'): - quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() - formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - else: - stream_url = self._search_regex( - r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') + for stream in streams.get('media').get('level'): + url = stream.get('source') + ext = determine_ext(url, 'mp4') + quality = stream.get('value') formats.append({ - 'format_id': 'sd', - 'url': compat_urllib_parse_unquote(stream_url), + 'format_id': '%sp' % quality, + 'url': compat_urllib_parse_unquote(url), 'ext': ext, }) From 03fad17cb6ae24259808078a165c287c23d77f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:51:38 +0700 Subject: [PATCH 063/187] [izlesene] Improve extraction and fix issues (closes #16407, closes #16271) --- youtube_dl/extractor/izlesene.py | 55 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 5b2095490..f8fca6c8f 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ 
-2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, float_or_none, @@ -55,12 +58,33 @@ class IzleseneIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.izlesene.com/video/%s' % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + + video = self._parse_json( + self._search_regex( + r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'), + video_id) + + title = video.get('videoTitle') or self._og_search_title(webpage) + + formats = [] + for stream in video['media']['level']: + source_url = stream.get('source') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(url, 'mp4') + quality = stream.get('value') + height = int_or_none(quality) + formats.append({ + 'format_id': '%sp' % quality if quality else 'sd', + 'url': compat_urllib_parse_unquote(source_url), + 'ext': ext, + 'height': height, + }) + self._sort_formats(formats) - title = self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) - thumbnail = self._proto_relative_url( + thumbnail = video.get('posterURL') or self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( @@ -69,30 +93,15 @@ class IzleseneIE(InfoExtractor): timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) - duration = float_or_none(self._html_search_regex( - r'videoduration\s*=\s*\'([^\']+)\'', - webpage, 'duration', fatal=False), scale=1000) + duration = float_or_none(video.get('duration') or self._html_search_regex( + r'videoduration["\']?\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'duration', fatal=False, group='value'), 
scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) comment_count = self._html_search_regex( r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - streams_json = self._html_search_regex( - r'_videoObj\s*=\s*(.+);', webpage, 'streams') - streams = self._parse_json(streams_json, video_id) - - formats = [] - for stream in streams.get('media').get('level'): - url = stream.get('source') - ext = determine_ext(url, 'mp4') - quality = stream.get('value') - formats.append({ - 'format_id': '%sp' % quality, - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - return { 'id': video_id, 'title': title, From 9ef5cdb5cb637660decbc82117d5d6790c48ad99 Mon Sep 17 00:00:00 2001 From: rhhayward Date: Fri, 25 May 2018 14:13:29 -0500 Subject: [PATCH 064/187] [audiomack] Stringify video id (closes #15310) --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index f3bd4d444..62049b921 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -65,7 +65,7 @@ class AudiomackIE(InfoExtractor): return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} return { - 'id': api_response.get('id', album_url_tag), + 'id': compat_str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], From bdbcc8eecb6d498e5c33dcbfb330d7d82021b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Nov=C3=A1k?= Date: Fri, 25 May 2018 21:15:50 +0200 Subject: [PATCH 065/187] [dvtv] Remove dead test --- youtube_dl/extractor/dvtv.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 3f760888e..20996962a 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -91,17 +91,6 @@ class DVTVIE(InfoExtractor): }, { 
'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, - }, { - 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', - 'md5': '87defe16681b1429c91f7a74809823c6', - 'info_dict': { - 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', - 'ext': 'mp4', - 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', - }, - 'params': { - 'skip_download': True, - }, }] def _parse_video_metadata(self, js, video_id, live_js=None): From 5a16c9d9d37389d163b0004f1c9332764a50ef83 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 25 May 2018 23:12:18 +0100 Subject: [PATCH 066/187] [utils] keep the original TV_PARENTAL_GUIDELINES dict --- youtube_dl/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d61af8837..7b4fd882f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'Y': 0, - 'Y7': 7, - 'G': 0, - 'PG': 0, - '14': 14, - 'MA': 17, + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, } @@ -2272,9 +2272,9 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) if m: - return TV_PARENTAL_GUIDELINES[m.group(1)] + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] return None From 38e4e8ab80b784f59b3a3ef6d313a70e13f17cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 12:58:34 +0700 Subject: [PATCH 067/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ChangeLog 
b/ChangeLog index 08233cd5b..9d0264bf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,31 @@ +version + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection(#10127) +* [viewlift] fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + version 2018.05.18 Extractors From 0934c9d4faadbfd2b076d13c7e24f4bf039cdc79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 13:02:21 +0700 Subject: [PATCH 068/187] release 2018.05.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 13 ++++++------- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7d9de5171..c4d4e534e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. 
Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.18 +[debug] youtube-dl version 2018.05.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 9d0264bf7..280390ea0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.26 Core * [utils] Improve parse_age_limit diff --git a/README.md b/README.md index 20982b0f1..499a0c206 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable experimental SOCKS proxy, specify - a proper scheme. For example + To enable SOCKS proxy, specify a proper + scheme. For example socks5://127.0.0.1:1080/. 
Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds @@ -109,16 +109,15 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo option is not present) is used for the actual downloading. --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header - (experimental) --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 - country code (experimental) + country code --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with explicitly provided IP block in CIDR - notation (experimental) + notation ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -209,7 +208,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size (experimental) + expected file size --hls-prefer-native Use the native HLS downloader instead of ffmpeg --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1048cc4c..b60f2ff23 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -365,7 +365,6 @@ - **ImgurAlbum** - **Ina** - **Inc** - - **Indavideo** - **IndavideoEmbed** - **InfoQ** - **Instagram** @@ -526,6 +525,7 @@ - **nbcolympics** - **nbcolympics:stream** - **NBCSports** + - **NBCSportsStream** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -625,6 +625,7 @@ - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), 
GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **PearVideo** 
+ - **PeerTube** - **People** - **PerformGroup** - **periscope**: Periscope diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a43eec860..2253da927 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.18' +__version__ = '2018.05.26' From c678192af3f004205b18a16b7418cbd937c1b584 Mon Sep 17 00:00:00 2001 From: Zack Fernandes Date: Sun, 31 Dec 2017 13:55:35 -0800 Subject: [PATCH 069/187] [tumblr] Add support for authentication --- youtube_dl/extractor/tumblr.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 786143525..58ac66755 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,11 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + ExtractorError, + int_or_none, + sanitized_Request, + urlencode_postdata +) class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -97,6 +104,31 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + webpage = self._download_webpage(self._LOGIN_URL, None, False) + form = self._hidden_inputs(webpage) + form.update({ + 'user[email]': username, + 'user[password]': password + }) + login_response = self._download_webpage( + sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { + 'Content-Type': 
'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL + }), None, False, 'Wrong login info') + + # Check the login response from Tumblr for an error message and fail the extraction if we find one. + login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + if login_errors: + raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') From 56cd31f32015cce131fb40a112d323da57fdda8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 19:53:32 +0700 Subject: [PATCH 070/187] [tumblr] Improve authentication (closes #15133) --- youtube_dl/extractor/tumblr.py | 39 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 58ac66755..758ccbb44 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata ) @@ -111,23 +110,37 @@ class TumblrIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - self.report_login() - webpage = self._download_webpage(self._LOGIN_URL, None, False) - form = self._hidden_inputs(webpage) - form.update({ + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ 'user[email]': username, 'user[password]': password }) - login_response = self._download_webpage( - sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL - }), None, False, 'Wrong login info') - # Check the login response from Tumblr for an error message 
and fail the extraction if we find one. - login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) if login_errors: - raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) From 97b01144bd9771f224749ffca10156a1cd7e9c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 20:00:00 +0700 Subject: [PATCH 071/187] [tumblr] Detect and report sensitive media (closes #13829) --- youtube_dl/extractor/tumblr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 758ccbb44..89e6eb5ab 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -150,11 +151,19 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) + redirect_url = compat_str(urlh.geturl()) + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain 
sensitive media. ' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url', default=None) if iframe_url is None: - return self.url_result(urlh.geturl(), 'Generic') + return self.url_result(redirect_url, 'Generic') iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') From 986c0b0215b127713825fa1523966ac66e03157b Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 26 May 2018 08:05:54 -0500 Subject: [PATCH 072/187] [cbc] Fix playlist title extraction (closes #16502) --- youtube_dl/extractor/cbc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 54b4b9be9..ce8e3d346 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -20,6 +20,7 @@ from ..utils import ( parse_duration, parse_iso8601, parse_age_limit, + strip_or_none, int_or_none, ExtractorError, ) @@ -129,6 +130,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'([^<]+)', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -136,8 +140,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, - self._og_search_title(webpage, fatal=False), + entries, display_id, strip_or_none(title), self._og_search_description(webpage)) From 
c0fd20abcad16bb2e377b6342a894a374c219763 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 14:34:13 +0100 Subject: [PATCH 073/187] [soundcloud] detect format extension(closes #16549) --- youtube_dl/extractor/soundcloud.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 46332e5c2..81c81c8d5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -181,7 +181,6 @@ class SoundcloudIE(InfoExtractor): thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') - ext = 'mp3' result = { 'id': track_id, 'uploader': info.get('user', {}).get('username'), @@ -215,8 +214,11 @@ class SoundcloudIE(InfoExtractor): track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): - abr = int_or_none(self._search_regex( - r'_(\d+)_url', key, 'audio bitrate', default=None)) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) if key.startswith('http'): stream_formats = [{ 'format_id': key, @@ -234,13 +236,14 @@ class SoundcloudIE(InfoExtractor): }] elif key.startswith('hls'): stream_formats = self._extract_m3u8_formats( - stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + stream_url, track_id, ext, entry_protocol='m3u8_native', m3u8_id=key, fatal=False) else: continue - for f in stream_formats: - f['abr'] = abr + if abr: + for f in stream_formats: + f['abr'] = abr formats.extend(stream_formats) @@ -250,7 +253,7 @@ class SoundcloudIE(InfoExtractor): formats.append({ 'format_id': 'fallback', 'url': update_url_query(info['stream_url'], query), - 'ext': ext, + 'ext': 'mp3', }) for f in formats: From 261f47306c594614edb8a5f0b8f5f3b8a87ce9c0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 
14:35:47 +0100 Subject: [PATCH 074/187] [utils] fix style id extraction for namespaced id attribute(closes #16551) --- youtube_dl/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b4fd882f..63f24c0b6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2667,6 +2667,7 @@ def dfxp2srt(dfxp_data): ] _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', 'tts': 'http://www.w3.org/ns/ttml#styling', }) @@ -2758,7 +2759,9 @@ def dfxp2srt(dfxp_data): repeat = False while True: for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue parent_style_id = style.get('style') if parent_style_id: if parent_style_id not in styles: From 2a49d01992e0b4b87d78da8f83af2f6e57fb8ba8 Mon Sep 17 00:00:00 2001 From: mars67857 Date: Sat, 14 Oct 2017 22:09:44 -0700 Subject: [PATCH 075/187] [cammodels] Add extractor --- youtube_dl/extractor/cammodels.py | 93 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 94 insertions(+) create mode 100644 youtube_dl/extractor/cammodels.py diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 000000000..1711d7096 --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,93 @@ +from __future__ import unicode_literals +from .common import InfoExtractor +from .common import ExtractorError +import json +import re +from ..utils import int_or_none + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' + # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used + } + + def 
_real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, + video_id, + headers=self._HEADERS) + manifest_url_root = self._html_search_regex( + r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', + webpage, + 'manifest', + None, + False) + if not manifest_url_root: + offline = self._html_search_regex( + r'(?PI\'m offline, but let\'s stay connected!)', + webpage, + 'offline indicator', + None, + False) + private = self._html_search_regex( + r'(?PI’m in a private show right now)', + webpage, + 'private show indicator', + None, + False) + err = 'This user is currently offline, so nothing can be downloaded.' if offline \ + else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ + else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' + raise ExtractorError( + err, + expected=True if offline or private else False, + video_id=video_id + ) + manifest_url = manifest_url_root + video_id + '.json' + manifest = self._download_json( + manifest_url, + video_id, + 'Downloading links to streams.', + 'Link to stream URLs was found, but we couldn\'t access it.', + headers=self._HEADERS) + try: + formats = [] + for fmtName in ['mp4-rtmp', 'mp4-hls']: + for encoding in manifest['formats'][fmtName]['encodings']: + formats.append({ + 'ext': 'mp4', + 'url': encoding['location'], + 'width': int_or_none(encoding.get('videoWidth')), + 'height': int_or_none(encoding.get('videoHeight')), + 'vbr': int_or_none(encoding.get('videoKbps')), + 'abr': int_or_none(encoding.get('audioKbps')), + 'format_id': fmtName + str(encoding.get('videoWidth')) + }) + # If they change the JSON format, then fallback to parsing out RTMP links via regex. 
+ except KeyError: + manifest_json = json.dumps(manifest) + manifest_links = re.finditer( + r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', + manifest_json) + if not manifest_links: + raise ExtractorError( + 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', + expected=False, + video_id=video_id) + formats = [] + for manifest_link in manifest_links: + url = manifest_link.group('id') + formats.append({ + 'ext': 'mp4', + 'url': url, + 'format_id': url.split(sep='/')[-1] + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9b49a0cd..d54e8df9f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -145,6 +145,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .cammodels import CamModelsIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE From 8b1da46e8f6dd0de790a54a4809d224041262537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 21:25:01 +0700 Subject: [PATCH 076/187] [cammodels] Improve and simplify (closes #14499) --- youtube_dl/extractor/cammodels.py | 159 +++++++++++++++--------------- 1 file changed, 80 insertions(+), 79 deletions(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1711d7096..4f1b88d14 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -1,93 +1,94 @@ +# coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from .common import ExtractorError -import json -import re -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) class CamModelsIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' - # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used - } + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, - video_id, - headers=self._HEADERS) - manifest_url_root = self._html_search_regex( - r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', - webpage, - 'manifest', - None, - False) - if not manifest_url_root: - offline = self._html_search_regex( - r'(?PI\'m offline, but let\'s stay connected!)', - webpage, - 'offline indicator', - None, - False) - private = self._html_search_regex( - r'(?PI’m in a private show right now)', - webpage, - 'private show indicator', - None, - False) - err = 'This user is currently offline, so nothing can be downloaded.' if offline \ - else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ - else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' 
- raise ExtractorError( - err, - expected=True if offline or private else False, - video_id=video_id + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), ) - manifest_url = manifest_url_root + video_id + '.json' + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + manifest = self._download_json( - manifest_url, - video_id, - 'Downloading links to streams.', - 'Link to stream URLs was found, but we couldn\'t access it.', - headers=self._HEADERS) - try: - formats = [] - for fmtName in ['mp4-rtmp', 'mp4-hls']: - for encoding in manifest['formats'][fmtName]['encodings']: - formats.append({ + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = media.get('location') + if not media_url or not isinstance(media_url, compat_str): + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 
'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ 'ext': 'mp4', - 'url': encoding['location'], - 'width': int_or_none(encoding.get('videoWidth')), - 'height': int_or_none(encoding.get('videoHeight')), - 'vbr': int_or_none(encoding.get('videoKbps')), - 'abr': int_or_none(encoding.get('audioKbps')), - 'format_id': fmtName + str(encoding.get('videoWidth')) + # hls skips fragments, preferring rtmp + 'preference': -1, }) - # If they change the JSON format, then fallback to parsing out RTMP links via regex. - except KeyError: - manifest_json = json.dumps(manifest) - manifest_links = re.finditer( - r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', - manifest_json) - if not manifest_links: - raise ExtractorError( - 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', - expected=False, - video_id=video_id) - formats = [] - for manifest_link in manifest_links: - url = manifest_link.group('id') - formats.append({ - 'ext': 'mp4', - 'url': url, - 'format_id': url.split(sep='/')[-1] - }) + else: + continue + formats.append(f) self._sort_formats(formats) + return { - 'id': video_id, - 'title': self._live_title(video_id), - 'formats': formats + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, } From ec2f3d2800185920629a7e6946701edebbf14dd6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 15:34:36 +0100 Subject: [PATCH 077/187] [ufctv] add support for authentication(closes #16542) --- youtube_dl/extractor/ufctv.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index ab823814b..f3eaee6b3 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals from .common import 
InfoExtractor from ..utils import ( + ExtractorError, parse_duration, parse_iso8601, + urlencode_postdata, ) class UFCTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _NETRC_MACHINE = 'ufctv' _TEST = { 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', 'info_dict': { @@ -26,6 +29,21 @@ class UFCTVIE(InfoExtractor): } } + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + def _real_extract(self, url): display_id = self._match_id(url) video_data = self._download_json(url, display_id, query={ From 68217024e83c8e7965f2800e9ff7a9575f049b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 16:12:44 +0100 Subject: [PATCH 078/187] remove unnecessary assignment parenthesis --- youtube_dl/extractor/animeondemand.py | 2 +- youtube_dl/extractor/atresplayer.py | 2 +- youtube_dl/extractor/bambuser.py | 2 +- youtube_dl/extractor/crunchyroll.py | 2 +- youtube_dl/extractor/curiositystream.py | 2 +- youtube_dl/extractor/dramafever.py | 2 +- youtube_dl/extractor/facebook.py | 2 +- youtube_dl/extractor/fc2.py | 2 +- youtube_dl/extractor/funimation.py | 2 +- youtube_dl/extractor/gdcvault.py | 2 +- youtube_dl/extractor/globo.py | 5 ----- youtube_dl/extractor/hidive.py | 7 +------ youtube_dl/extractor/hrti.py | 2 +- youtube_dl/extractor/iqiyi.py | 2 +- youtube_dl/extractor/niconico.py | 2 +- youtube_dl/extractor/noco.py | 2 +- youtube_dl/extractor/packtpub.py | 2 +- youtube_dl/extractor/patreon.py | 2 +- youtube_dl/extractor/pluralsight.py | 2 +- youtube_dl/extractor/roosterteeth.py | 2 +- youtube_dl/extractor/safari.py | 2 +- youtube_dl/extractor/sina.py | 2 +- 
youtube_dl/extractor/tennistv.py | 2 +- youtube_dl/extractor/tubitv.py | 2 +- youtube_dl/extractor/tumblr.py | 2 +- youtube_dl/extractor/twitch.py | 2 +- youtube_dl/extractor/udemy.py | 2 +- youtube_dl/extractor/vessel.py | 2 +- youtube_dl/extractor/viki.py | 2 +- youtube_dl/extractor/vimeo.py | 2 +- youtube_dl/extractor/vk.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- youtube_dl/extractor/zattoo.py | 2 +- 33 files changed, 32 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index e4fa72f46..1fe5d5e56 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -52,7 +52,7 @@ class AnimeOnDemandIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 1a31ebe08..ae1c09427 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -74,7 +74,7 @@ class AtresPlayerIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 633c57553..34f1b3d83 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -44,7 +44,7 @@ class BambuserIE(InfoExtractor): } def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 3efdc8c21..311da515d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -49,7 +49,7 @@ class CrunchyrollBaseIE(InfoExtractor): }) def _login(self): - (username, password) = 
self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 8e45923e3..35b1e7a34 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -35,7 +35,7 @@ class CuriosityStreamBaseIE(InfoExtractor): return result['data'] def _real_initialize(self): - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return result = self._download_json( diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ffbd2623d..ab32ba4ff 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -42,7 +42,7 @@ class DramaFeverBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 220ada3a6..0971ce356 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -226,7 +226,7 @@ class FacebookIE(InfoExtractor): return urls def _login(self): - (useremail, password) = self._get_login_info() + useremail, password = self._get_login_info() if useremail is None: return diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 448647d72..435561147 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -46,7 +46,7 @@ class FC2IE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: return False diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 107f658ba..07d01caec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -51,7 +51,7 @@ class 
FunimationIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f71d9092e..8806dc48a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -91,7 +91,7 @@ class GDCVaultIE(InfoExtractor): ] def _login(self, webpage_url, display_id): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') return None diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 81d6d36d3..c2140c362 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -23,7 +23,6 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' - _LOGGED_IN = False _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', @@ -68,9 +67,6 @@ class GloboIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - email, password = self._get_login_info() if email is None: return @@ -91,7 +87,6 @@ class GloboIE(InfoExtractor): resp = self._parse_json(e.cause.read(), None) raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) raise - self._LOGGED_IN = True def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index d8f2e682f..39fabe8a5 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -18,7 +18,6 @@ class HiDiveIE(InfoExtractor): # so disabling geo bypass 
completely _GEO_BYPASS = False _NETRC_MACHINE = 'hidive' - _LOGGED_IN = False _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ @@ -38,10 +37,7 @@ class HiDiveIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return @@ -56,7 +52,6 @@ class HiDiveIE(InfoExtractor): }) self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) - self._LOGGED_IN = True def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 6424d34ac..9ba1aa703 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -66,7 +66,7 @@ class HRTiBaseIE(InfoExtractor): self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index fdfa7de9e..4b081bd46 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -239,7 +239,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index df7f528be..dbe871f16 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -163,7 +163,7 @@ class NiconicoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: return True 
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index a9f9b10c4..58b371ed7 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -65,7 +65,7 @@ class NocoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 8ed3c6347..56a2a1083 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -42,7 +42,7 @@ class PacktPubIE(PacktPubBaseIE): _TOKEN = None def _real_initialize(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index d4b1d34ca..9eb027679 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -53,7 +53,7 @@ class PatreonIE(InfoExtractor): # needed. Keeping this commented for when this inevitably changes. 
''' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 3c508c9ca..a207ca9cb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -94,7 +94,7 @@ class PluralsightIE(PluralsightBaseIE): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8b703800e..857434540 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -50,7 +50,7 @@ class RoosterTeethIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index cc6698f88..8a5d48fc2 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -27,7 +27,7 @@ class SafariBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 8fc66732a..07b766b4a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -64,7 +64,7 @@ class SinaIE(InfoExtractor): # The video id is in the redirected url self.to_screen('Getting video id') request = HEADRequest(url) - (_, urlh) = self._download_webpage_handle(request, 'NA', False) + _, urlh = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) else: pseudo_id = mobj.group('pseudo_id') diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 
0c6f70784..a586f30ad 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -32,7 +32,7 @@ class TennisTVIE(InfoExtractor): _NETRC_MACHINE = 'tennistv' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 36f6c1673..a51fa6515 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -36,7 +36,7 @@ class TubiTvIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 89e6eb5ab..edbb0aa69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -108,7 +108,7 @@ class TumblrIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 3ee2af52e..e01f11331 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -61,7 +61,7 @@ class TwitchBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0a74a9768..a7196997e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -151,7 +151,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff 
--git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 80a643dfe..31eee0ba7 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -75,7 +75,7 @@ class VesselIE(InfoExtractor): 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ad2a2a4b7..546de95d8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,7 +88,7 @@ class VikiBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8dfd8891c..3baa2d075 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,7 +37,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b50d4f170..29002b35f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -32,7 +32,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4eec7c30..379559825 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -85,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 773073d85..b5a3a0716 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -24,7 +24,7 @@ class ZattooBaseIE(InfoExtractor): _power_guide_hash = None def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: self.raise_login_required( 'A valid %s account is needed to access this media.' 
From ddd8486a448ee94134a62f2488e5e39bbd72880e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:10:08 +0700 Subject: [PATCH 079/187] [downloader/rtmp] Gracefully handle live streams interrupted by user --- youtube_dl/downloader/rtmp.py | 119 +++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index b823b5171..63e2b5c89 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,71 +24,78 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() - resume_percent = None - resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - proc_stderr_closed = False - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': 
speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + + def dl(): + resume_percent = None + resume_downloaded_data_len = None + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) if mobj: downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', + 'eta': eta, 'elapsed': time_now - start, 'speed': speed, }) cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - proc.wait() + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': 
downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + + try: + dl() + finally: + proc.wait() + if not cursor_in_new_line: self.to_screen('') return proc.returncode @@ -163,7 +170,13 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 - retval = run_rtmpdump(args) + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') if retval == RD_NO_CONNECT: self.report_error('[rtmpdump] Could not connect to RTMP server.') From f16f48779cbad4a6d39a908e131a8d55941d1671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:14:09 +0700 Subject: [PATCH 080/187] [downloader/rtmp] Generalize download messages and report time elapsed on finish --- youtube_dl/downloader/rtmp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 63e2b5c89..9e0ddbb18 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -170,6 +170,8 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 + started = time.time() + try: retval = run_rtmpdump(args) except KeyboardInterrupt: @@ -184,7 +186,7 @@ class RtmpFD(FileDownloader): while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed args = basic_args + ['--resume'] if retval == RD_FAILED: @@ -201,13 +203,14 @@ class RtmpFD(FileDownloader): break if 
retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, 'total_bytes': fsize, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - started, }) return True else: From 2ce35d9f43328e82108bae6661c2ac0ba2a0498c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:21:55 +0700 Subject: [PATCH 081/187] [cammodels] Add another error pattern --- youtube_dl/extractor/cammodels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 4f1b88d14..17f7ac043 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -28,6 +28,7 @@ class CamModelsIE(InfoExtractor): ERRORS = ( ("I'm offline, but let's stay connected", 'This user is currently offline'), ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), ) for pattern, message in ERRORS: if pattern in webpage: From 8882840ec5d9536772d7de75b7fb6389103a3a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:22:28 +0700 Subject: [PATCH 082/187] [cammodels] Use geo verification headers --- youtube_dl/extractor/cammodels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 17f7ac043..ee0165dba 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -19,7 +19,8 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage( + url, user_id, 
headers=self.geo_verification_headers()) manifest_root = self._html_search_regex( r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) From c9e12a618c9420c2bb21c09bf47b9469785f492e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:10:12 +0100 Subject: [PATCH 083/187] [9c9media] extract mpd formats and subtitles --- youtube_dl/extractor/ctvnews.py | 4 +- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/ninecninemedia.py | 93 ++++++++++---------------- youtube_dl/extractor/rds.py | 2 +- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 55a127b76..03f8cefb7 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -11,10 +11,10 @@ class CTVNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', - 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'md5': '9b8624ba66351a23e0b6e1391971f9af', 'info_dict': { 'id': '901995', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Extended: \'That person cannot be me\' Johnson says', 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d54e8df9f..2f485012f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -718,10 +718,7 @@ from .nick import ( NickRuIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import ( - NineCNineMediaStackIE, - NineCNineMediaIE, -) +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 8961309fd..875665d43 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ 
b/youtube_dl/extractor/ninecninemedia.py @@ -13,38 +13,11 @@ from ..utils import ( ) -class NineCNineMediaBaseIE(InfoExtractor): - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - -class NineCNineMediaStackIE(NineCNineMediaBaseIE): - IE_NAME = '9c9media:stack' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' - - def _real_extract(self, url): - destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() - stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' - stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) - - formats = [] - formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', stack_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', stack_id, - f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - return { - 'id': stack_id, - 'formats': formats, - } - - -class NineCNineMediaIE(NineCNineMediaBaseIE): +class NineCNineMediaIE(InfoExtractor): IE_NAME = '9c9media' + _GEO_COUNTRIES = ['CA'] _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' def _real_extract(self, url): destination_code, content_id = re.match(self._VALID_URL, url).groups() @@ -58,13 +31,26 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json(content_package_url, content_id) + content_package = self._download_json( + content_package_url, content_id, query={ + '$include': '[HasClosedCaptions]', + }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + if 
content_package.get('Constraints', {}).get('Security', {}).get('Type'): raise ExtractorError('This video is DRM protected.', expected=True) - stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] - multistacks = len(stacks) > 1 + manifest_base_url = content_package_url + 'manifest.' + formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) thumbnails = [] for image in content.get('Images', []): @@ -85,10 +71,12 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): continue container.append(e_name) - description = content.get('Desc') or content.get('ShortDesc') season = content.get('Season', {}) - base_info = { - 'description': description, + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), @@ -97,26 +85,19 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): 'series': content.get('Media', {}).get('Name'), 'tags': tags, 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, } - entries = [] - for stack in stacks: - stack_id = compat_str(stack['Id']) - entry = { - '_type': 'url_transparent', - 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), - 'id': stack_id, - 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, - 'duration': float_or_none(stack.get('Duration')), - 'ie_key': 'NineCNineMediaStack', + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 
'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] } - entry.update(base_info) - entries.append(entry) - return { - '_type': 'multi_video', - 'id': content_id, - 'title': title, - 'description': description, - 'entries': entries, - } + return info diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index bf200ea4d..8c016a77d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -19,7 +19,7 @@ class RDSIE(InfoExtractor): 'info_dict': { 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', 'timestamp': 1430397346, From 9c65c4a6cd981e081f4a99d11206e984999f51ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:11:53 +0100 Subject: [PATCH 084/187] [bellmedia] add support for bnnbloomberg.ca(#16560) --- youtube_dl/extractor/bellmedia.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 8820a3914..f36a2452d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -12,7 +12,7 @@ class BellMediaIE(InfoExtractor): (?: ctv| tsn| - bnn| + bnn(?:bloomberg)?| thecomedynetwork| discovery| discoveryvelocity| @@ -27,17 +27,16 @@ class BellMediaIE(InfoExtractor): much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 
'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, }, - 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', 'only_matching': True, @@ -70,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', } def _real_extract(self, url): From cfd7f2a6365e4d4ed9036b7fd873747be5e91d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 May 2018 18:24:37 +0700 Subject: [PATCH 085/187] [apa] Add extractor (closes #15041, closes #15672) --- youtube_dl/extractor/apa.py | 94 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 23 ++++++++ 3 files changed, 118 insertions(+) create mode 100644 youtube_dl/extractor/apa.py diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py new file mode 100644 index 000000000..a30a935aa --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + js_to_json, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = source.get('file') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py 
b/youtube_dl/extractor/extractors.py index 2f485012f..5f829c72c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0292e0458..dad951b75 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -110,6 +110,7 @@ from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE +from .apa import APAIE class GenericIE(InfoExtractor): @@ -2041,6 +2042,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3068,6 +3086,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, video_title, ie=APAIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 
a07879d6b2edc474b0595a29932726fa7aa14b3a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 May 2018 00:10:01 +0100 Subject: [PATCH 086/187] [spiegel] fix info extraction(#16538) --- youtube_dl/extractor/spiegel.py | 78 +++++++++++---------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index fc995e8c1..4df7f4ddc 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -11,9 +11,9 @@ from .nexx import ( from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( - extract_attributes, - unified_strdate, - get_element_by_attribute, + parse_duration, + strip_or_none, + unified_timestamp, ) @@ -21,35 +21,38 @@ class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': '2c2754212136f35fb4b19767d242f66e', + 'md5': 'b57399839d055fccfeb9a0455c439868', 'info_dict': { - 'id': '1259285', + 'id': '563747', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, 'upload_date': '20130311', + 'timestamp': 1362994320, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', 'info_dict': { - 'id': '1309159', + 'id': '580988', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, 'upload_date': '20131115', + 'timestamp': 1384546642, }, }, { 'url': 
'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', + 'md5': '97b91083a672d72976faa8433430afb9', 'info_dict': { - 'id': '1519126', + 'id': '601883', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', 'upload_date': '20140904', + 'timestamp': 1409834160, } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -62,59 +65,28 @@ class SpiegelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - nexx_id = self._search_regex( - r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) - if nexx_id: - domain_id = NexxIE._extract_domain_id(webpage) or '748' - return self.url_result( - 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), - video_id=nexx_id) - - video_data = extract_attributes(self._search_regex(r'(]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) - - title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) - description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') - - base_url = self._search_regex( - [r'server\s*:\s*(["\'])(?P.+?)\1', 
r'var\s+server\s*=\s*"(?P[^"]+)\"'], - webpage, 'server URL', group='url') - - xml_url = base_url + video_id + '.xml' - idoc = self._download_xml(xml_url, video_id) - - formats = [] - for n in list(idoc): - if n.tag.startswith('type') and n.tag != 'type6': - format_id = n.tag.rpartition('type')[2] - video_url = base_url + n.find('./filename').text - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - }) - duration = float(idoc[0].findall('./duration')[0].text) - - self._check_formats(formats, video_id) - self._sort_formats(formats) + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' return { + '_type': 'url_transparent', 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), 'title': title, - 'description': description.strip() if description else None, - 'duration': duration, - 'upload_date': unified_strdate(video_data.get('data-video-date')), - 'formats': formats, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), } From e0d42dd4b270d06a953822c091afefd946bd93f2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 13:21:07 +0100 Subject: [PATCH 087/187] [teamcoco] Fix extraction for full episodes(closes #16573) --- youtube_dl/extractor/tbs.py | 61 ++++++------------ youtube_dl/extractor/teamcoco.py | 102 ++++++++++++++++++------------- youtube_dl/extractor/turner.py | 47 +++++++++++++- 3 files changed, 122 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/tbs.py 
b/youtube_dl/extractor/tbs.py index edc31729d..784f8ed66 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) from ..utils import ( float_or_none, int_or_none, @@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE): def _real_extract(self, url): site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(self._search_regex( + drupal_settings = self._parse_json(self._search_regex( r']+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})', - webpage, 'drupal setting'), display_id)['turner_playlist'][0] + webpage, 'drupal setting'), display_id) + video_data = drupal_settings['turner_playlist'][0] media_id = video_data['mediaID'] title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.vgtf.net/token/token_spe', - m3u8_url, media_id, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = 
float_or_none(chapter.get('start')) - duration = float_or_none(chapter.get('duration')) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - }) - self._sort_formats(formats) + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) thumbnails = [] for image_id, image in video_data.get('images', {}).items(): @@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE): }) thumbnails.append(i) - return { + info.update({ 'id': media_id, 'title': title, 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': duration, + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), 'timestamp': int_or_none(video_data.get('created')), 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), - 'cahpters': chapters, 'thumbnails': thumbnails, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 63fd4fe1c..73469cc5d 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( determine_ext, ExtractorError, @@ -15,7 +15,7 @@ from ..utils import ( ) -class TeamcocoIE(InfoExtractor): +class TeamcocoIE(TurnerBaseIE): _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { @@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor): name } duration + turnerMediaId + turnerMediaAuthToken } } ... 
on NotFoundSlug { @@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} - - formats = [] - get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): - if not isinstance(src, dict): - continue - src_url = src.get('src') - if not src_url: - continue - ext = determine_ext(src_url, mimetype2ext(src.get('type'))) - if format_id == 'hls' or ext == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if src_url.startswith('/'): - src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - else: - if src_url.startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - tbr = int_or_none(self._search_regex( - r'(\d+)k\.mp4', src_url, 'tbr', default=None)) - - formats.append({ - 'url': src_url, - 'ext': ext, - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) - self._sort_formats(formats) - - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, 'title': record['title'], 'thumbnail': record.get('thumb', {}).get('preview'), 'description': record.get('teaser'), 'duration': parse_duration(record.get('duration')), 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = 
[] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index e73b64aeb..2b7b0d6e1 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -9,6 +9,7 @@ from ..utils import ( xpath_text, int_or_none, determine_ext, + float_or_none, parse_duration, xpath_attr, update_url_query, @@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: query = { 'path': secure_path, - 'videoId': content_id, } + if 
custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id if ap_data.get('auth_required'): query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) auth = self._download_xml( @@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE): 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'is_live': is_live, } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } From bc3143ac5e18731502df014e30c5fe89554e9d6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:52:03 +0700 Subject: [PATCH 088/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog 
b/ChangeLog index 280390ea0..95a5c556f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + version 2018.05.26 Core From e425710554f1ed96504389fb526b898a942012dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:54:30 +0700 Subject: [PATCH 089/187] release 2018.05.30 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c4d4e534e..b47a450a4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.30** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.26 +[debug] youtube-dl version 2018.05.30 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 95a5c556f..4e989caf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.30 Core * [downloader/rtmp] Generalize download messages and report time elapsed diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b60f2ff23..c2d5401d6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -128,6 +128,7 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** - 
**CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2253da927..0f15738b2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.26' +__version__ = '2018.05.30' From 4fd1437d9d617069494a471ba40341c2ad6623b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 17:08:32 +0100 Subject: [PATCH 090/187] [rbmaradio] check formats availability(closes #16585) --- youtube_dl/extractor/rbmaradio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..9c4d72bbd 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -54,6 +54,7 @@ class RBMARadioIE(InfoExtractor): 'abr': abr, 'vcodec': 'none', } for abr in (96, 128, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) From 128b58ad139f2e62274ab6a649b965f5fa01a533 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:49:35 +0100 Subject: [PATCH 091/187] [nhl] remove old extractors --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/nhl.py | 345 +++++------------------------ 2 files changed, 62 insertions(+), 290 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f829c72c..93b22a8c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -705,12 +705,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 
62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, 
video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' 
+ data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 
1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 
'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r']+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' 
+ data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = 
self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } From acca2ac7f3f4c78bce775d47736caa63e6872e26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:50:14 +0100 Subject: [PATCH 092/187] [mlb] improve extraction(closes #16587) --- youtube_dl/extractor/mlb.py | 105 +++++++++--------------------------- 1 file changed, 24 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 675ff6873..b907f6b49 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,96 +1,90 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +from .nhl import NHLBaseIE -class MLBIE(InfoExtractor): +class MLBIE(NHLBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*mlb\.com/ + 
(?:[\da-z_-]+\.)*(?Pmlb)\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| + (?:[^/]+/)*c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) - (?Pn?\d+)| - (?:[^/]+/)*(?P[^/]+) + (?P\d+) ) ''' + _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { - 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, - 'timestamp': 1405980600, - 'upload_date': '20140721', + 'timestamp': 1405995000, + 'upload_date': '20140722', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', - 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', + 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663', + 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, - 'timestamp': 1405105800, + 'timestamp': 1405120200, 'upload_date': '20140711', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', - 'md5': '0e6e73d509321e142409b695eadd541f', + 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115', + 'md5': '99bb9176531adc600b90880fb8be9328', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, - 'timestamp': 1405399936, + 'timestamp': 1405414336, 
'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', - 'md5': 'b8fd237347b844365d74ea61d4245967', + 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915', + 'md5': 'da8b57a12b060e7663ee1eebd6f330ec', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, - 'timestamp': 1405390722, + 'timestamp': 1405405122, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'aafaf5b0186fee8f32f20508092f8111', + 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', + 'md5': 'e09e37b552351fddbf4d9e699c924d68', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, + 'timestamp': 1429139220, 'upload_date': '20150415', } }, @@ -111,7 +105,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783', 'only_matching': True, }, { @@ -120,58 +114,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'url': 
'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', 'only_matching': True, } ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if not video_id: - video_path = mobj.group('path') - webpage = self._download_webpage(url, video_path) - video_id = self._search_regex( - [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') - - detail = self._download_xml( - 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' - % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) - - title = detail.find('./headline').text - description = detail.find('./big-blurb').text - duration = parse_duration(detail.find('./duration').text) - timestamp = parse_iso8601(detail.attrib['date'][:-5]) - - thumbnails = [{ - 'url': thumbnail.text, - } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] - - formats = [] - for media_url in detail.findall('./url'): - playback_scenario = media_url.attrib['playback_scenario'] - fmt = { - 'url': media_url.text, - 'format_id': playback_scenario, - } - m = re.search(r'(?P\d+)K_(?P\d+)X(?P\d+)', playback_scenario) - if m: - fmt.update({ - 'vbr': int(m.group('vbr')) * 1000, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } From 3a8e3730c198dd7cb8be76f04d101c66361da6b9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 11:40:37 +0100 Subject: [PATCH 093/187] [francetv] add support for sport.francetvinfo.fr(closes #15645) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/francetv.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py 
index 93b22a8c3..b05afd101 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -381,6 +381,7 @@ from .francetv import ( FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVInfoSportIE, FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index c02cd03de..6fc6b0da0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) +class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): + IE_NAME = 'sport.francetvinfo.fr' + _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', + 'info_dict': { + 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', + 'ext': 'mp4', + 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', + 'timestamp': 1523639962, + 'upload_date': '20180413', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') + return self._make_url_result(video_id, 'Sport-web') + + class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P[^/?#&]+)' From c3f75e2454051021c33f88c982913cba8c651188 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 12:39:45 +0100 Subject: [PATCH 094/187] [audimedia] fix extraction(closes #15309) --- youtube_dl/extractor/audimedia.py | 48 +++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/audimedia.py 
b/youtube_dl/extractor/audimedia.py index aa6925623..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -5,13 +5,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, ) class AudiMediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { @@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor): 'duration': 74022, 'view_count': int, } - } - # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) - _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex([ - r'class="amtv-embed"[^>]+id="([^"]+)"', - r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', ], webpage, 'raw payload') - _, stage_mode, video_id, lang = raw_payload.split('-') + _, stage_mode, video_id, _ = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): - request = sanitized_Request( - 
'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), - headers={'X-Auth-Token': self._AUTH_TOKEN}) - json_data = self._download_json(request, video_id)['results'] + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] formats = [] - stream_url_hls = json_data.get('stream_url_hls') + stream_url_hls = video_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - stream_url_hds = json_data.get('stream_url_hds') + stream_url_hds = video_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) - for video_version in json_data.get('video_versions'): + for video_version in video_data.get('video_versions', []): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue @@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor): return { 'id': video_id, - 'title': json_data['title'], - 'description': json_data.get('subtitle'), - 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), - 'timestamp': parse_iso8601(json_data.get('publication_date')), - 'duration': int_or_none(json_data.get('duration')), - 'view_count': int_or_none(json_data.get('view_count')), + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), 'formats': formats, } From 0bfdcc14956557294d8b5ab7309a5f31b3710888 Mon Sep 17 00:00:00 2001 
From: DroidFreak32 Date: Thu, 31 May 2018 20:31:44 +0530 Subject: [PATCH 095/187] [openload] Add support for oload.win and oload.download --- youtube_dl/extractor/openload.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d0bdd60b8..702f86b44 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -301,6 +301,12 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'only_matching': True, + }, { + 'url': 'https://oload.win/f/kUEfGclsU9o', + 'only_matching': True, + }, { + 'url': 'https://oload.download/f/kUEfGclsU9o', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 2593725a9bd1347ab54435dc0b48dd7b878f38c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jun 2018 05:16:00 +0700 Subject: [PATCH 096/187] [twitter:card] Add support for another endpoint (closes #16586) --- youtube_dl/extractor/twitter.py | 49 +++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index d7e425041..4a77e792e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': '623160978427936768', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', + 'thumbnail': 
r're:^https?://.*$', }, }, { @@ -223,15 +223,38 @@ class TwitterCardIE(TwitterBaseIE): formats.extend(self._extract_mobile_formats(username, video_id)) if formats: + title = self._search_regex(r'([^<]+)', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration'), scale=1000) or duration break + if not formats: + config = self._download_json( + 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, + video_id, headers={ + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE', + }) + track = config['track'] + vmap_url = track.get('vmapUrl') + if vmap_url: + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + else: + playback_url = track['playbackUrl'] + if determine_ext(playback_url) == 'm3u8': + formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + else: + formats = [{ + 'url': playback_url, + }] + title = 'Twitter web player' + thumbnail = config.get('posterImage') + duration = float_or_none(track.get('durationMs'), scale=1000) + self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex(r'([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - return { 'id': video_id, 'title': title, @@ -375,6 +398,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'uploader': 'Lis Power', + 
'uploader_id': 'LisPower1', + 'duration': 111.278, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): From 926d97fc6b018a25ea777dfcfb9a84a10920c2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Jun 2018 05:17:49 +0700 Subject: [PATCH 097/187] [9c9media] PEP 8 --- youtube_dl/extractor/ninecninemedia.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 875665d43..65754c5e7 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( parse_iso8601, float_or_none, From 85750f897293b5a56e6be521f8b0be3eec082899 Mon Sep 17 00:00:00 2001 From: Enes Date: Fri, 1 Jun 2018 20:16:22 +0300 Subject: [PATCH 098/187] [openload] Improve ext extraction --- test/test_utils.py | 1 + youtube_dl/extractor/openload.py | 7 +++++-- youtube_dl/utils.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index f2b51131c..e63af0166 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -361,6 +361,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') + self.assertEqual(determine_ext('foobar', None), None) def test_find_xpath_attr(self): testxml = ''' diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 702f86b44..d264fe206 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -307,6 +307,10 @@ class OpenloadIE(InfoExtractor): }, { 'url': 
'https://oload.download/f/kUEfGclsU9o', 'only_matching': True, + }, { + # Its title has not got its extension but url has it + 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @@ -368,8 +372,7 @@ class OpenloadIE(InfoExtractor): 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, - # Seems all videos have extensions in their titles - 'ext': determine_ext(title, 'mp4'), + 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, 'http_headers': headers, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 63f24c0b6..6a3199fb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1228,7 +1228,7 @@ def unified_timestamp(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' 
not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): From b995043ab8b987cb5d4d83a3b56bb28d009ac0cb Mon Sep 17 00:00:00 2001 From: Logan Fleur Date: Fri, 1 Jun 2018 19:18:57 +0200 Subject: [PATCH 099/187] Ignore venv directory --- .gitignore | 1 + setup.cfg | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fbf7cecb2..f064a0d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ youtube-dl.zsh *.iml tmp/ +venv/ diff --git a/setup.cfg b/setup.cfg index 5208f7ae2..af9a554c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv ignore = E402,E501,E731,E741 From f20f636596aa4ec949360e7b05f6b9499e28c2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 00:35:07 +0700 Subject: [PATCH 100/187] [cbc] Improve extraction (closes #16583, closes #16593) --- youtube_dl/extractor/cbc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ce8e3d346..43f95c739 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -17,6 +17,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + orderedSet, parse_duration, parse_iso8601, parse_age_limit, @@ -136,9 +137,15 @@ class CBCIE(InfoExtractor): entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r']+src="[^"]+?mediaId=(\d+)"', + r']+\bid=["\']player-(\d+)', + 
r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) entries.extend([ self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) + for media_id in orderedSet(media_ids)]) return self.playlist_result( entries, display_id, strip_or_none(title), self._og_search_description(webpage)) From 9d082e7cb81c63dc5c2e616cd7ca7237a5e0d642 Mon Sep 17 00:00:00 2001 From: Nathan Rossi Date: Sat, 26 May 2018 02:34:22 +1000 Subject: [PATCH 101/187] [facebook] Add support for tahoe player videos (closes #15441) Specific videos appear to use a newer/different player, this requires a second request for the video data as the initial request is missing the specified data. Additionally these videos have different page content for the uploader value, which is stored in the `` element of the initial request. --- youtube_dl/extractor/facebook.py | 38 +++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0971ce356..8bbca4f56 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -56,6 +56,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -208,6 +209,17 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': '#ESLOne VoD - 
Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'uploader': 'ESL One Dota 2', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -323,6 +335,24 @@ class FacebookIE(InfoExtractor): server_js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: + # video info not in first request, do a secondary request using tahoe player specific url + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex(r'"pkg_cohort":"(.*?)"', webpage, 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex(r'"client_revision":(\d+),', webpage, 'client revision', default=3944515), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json(self._search_regex( + r'for \(;;\);(.+)', tahoe_data, + 'tahoe js data', default='{}'), video_id, fatal=False) + video_data = extract_video_data(tahoe_js_data.get('jsmods', {}).get('instances', [])) + if not video_data: if not fatal_if_no_video: return webpage, False @@ -378,9 +408,11 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id( - 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + if not uploader: + uploader = self._search_regex( + [r'ownerName\s*:\s*"([^"]+)"', r'property="og:title"\s*content="(.*?)"'], + webpage, 'uploader', fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From 9b89daefa6fc3dbfce0d725283ecef753a8603ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:32:18 +0700 Subject: [PATCH 102/187] [facebook] Improve extraction (closes 
#16554) --- youtube_dl/extractor/facebook.py | 66 ++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8bbca4f56..8a9ed96c2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -324,34 +324,18 @@ class FacebookIE(InfoExtractor): if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + if not video_data: server_js_data = self._parse_json( self._search_regex( r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - if server_js_data: - video_data = extract_video_data(try_get( - server_js_data, lambda x: x['jsmods']['instances'], - list) or []) - - if not video_data: - # video info not in first request, do a secondary request using tahoe player specific url - tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, - data=urlencode_postdata({ - '__user': 0, - '__a': 1, - '__pc': self._search_regex(r'"pkg_cohort":"(.*?)"', webpage, 'pkg cohort', default='PHASED:DEFAULT'), - '__rev': self._search_regex(r'"client_revision":(\d+),', webpage, 'client revision', default=3944515), - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - tahoe_js_data = self._parse_json(self._search_regex( - r'for \(;;\);(.+)', tahoe_data, - 'tahoe js data', default='{}'), video_id, fatal=False) - video_data = extract_video_data(tahoe_js_data.get('jsmods', {}).get('instances', [])) + video_data = extract_from_jsmods_instances(server_js_data) if not video_data: if not fatal_if_no_video: @@ -363,8 +347,33 @@ class 
FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - else: - raise ExtractorError('Cannot parse data') + + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__user': 0, + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') formats = [] for f in video_data: @@ -408,11 +417,10 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - if not uploader: - uploader = self._search_regex( - [r'ownerName\s*:\s*"([^"]+)"', r'property="og:title"\s*content="(.*?)"'], - webpage, 'uploader', fatal=False) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + fatal=False) or self._og_search_title(webpage, fatal=False) timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From 73c938e46055690646ef6b81ed53cf1a0bd976a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:49:48 +0700 Subject: [PATCH 103/187] [ChangeLog] Actualize [ci skip] --- 
ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4e989caf7..f75cb6f11 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + version 2018.05.30 Core From 19e42ead9b6dc933fd6cc1adb6b10e2869ecb091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Jun 2018 01:51:31 +0700 Subject: [PATCH 104/187] release 2018.06.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 +--- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b47a450a4..10efa29a4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.30** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. 
Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.30 +[debug] youtube-dl version 2018.06.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f75cb6f11..edd190051 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.06.02 Core * [utils] Improve determine_ext diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c2d5401d6..8ce13581b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -553,9 +553,6 @@ - **nfl.com** - **NhkVod** - **nhl.com** - - **nhl.com:news**: NHL news - - **nhl.com:videocenter** - - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** - **nickelodeon:br** @@ -793,6 +790,7 @@ - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** + - **sport.francetvinfo.fr** - **Sport5** - **SportBoxEmbed** - **SportDeutschland** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0f15738b2..5ace1b355 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.30' +__version__ = '2018.06.02' From 1ea559c445d6612968b7586355f236cabfd42ef5 Mon Sep 
17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Jun 2018 18:07:36 +0100 Subject: [PATCH 105/187] [adn] fix extraction --- youtube_dl/extractor/adn.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 041c61aff..1eb99c39a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import binascii import json import os +import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt @@ -12,9 +15,12 @@ from ..compat import ( ) from ..utils import ( bytes_to_intlist, + bytes_to_long, ExtractorError, float_or_none, intlist_to_bytes, + long_to_bytes, + pkcs1pad, srt_subtitles_timecode, strip_or_none, urljoin, @@ -35,6 +41,7 @@ class ADNIE(InfoExtractor): } } _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -42,16 +49,14 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False, headers={ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', - }) + video_id, fatal=False) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) 
subtitles_json = self._parse_json( @@ -112,11 +117,24 @@ class ADNIE(InfoExtractor): error = None if not links: links_url = player_config.get('linksurl') or options['videoUrl'] - links_data = self._download_json(urljoin( - self._BASE_URL, links_url), video_id) + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, headers={ + 'Authorization': 'Bearer ' + authorization, + }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') + sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] From 003fe73ccf06192bb94d524fe9c39252ff1b1dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 00:52:22 +0700 Subject: [PATCH 106/187] [safari] Add support for new URL schema (closes #16614) --- youtube_dl/extractor/safari.py | 67 +++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 8a5d48fc2..30e2a38b4 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P[^/]+)/(?P[^/?#&]+)\.html' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + 
library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) + ) + ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, }] + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, video_id) - reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura reference id', group='id') - partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura widget id', group='id') - ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'kaltura uiconf id', group='id') + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', 
default=self._UICONF_ID, + group='id') query = { 'wid': '_%s' % partner_id, @@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| techbus\.safaribooksonline\.com ) - /(?P[^/]+)/?(?:[#?]|$) + /(?P[^/]+) ''' _TESTS = [{ @@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'http://techbus.safaribooksonline.com/9780134426365', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + def _real_extract(self, url): course_id = self._match_id(url) From 936784b272db3f85f5ff5bdd2d5a71e0397ee7bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 02:05:14 +0700 Subject: [PATCH 107/187] [youtube] Extract track and artist --- youtube_dl/extractor/youtube.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 379559825..677907aba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -510,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'license': 'Standard YouTube License', 'creator': 'Icona Pop', + 'track': 'I Love It (feat. 
Charli XCX)', + 'artist': 'Icona Pop', } }, { @@ -528,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', + 'track': 'Tunnel Vision`', + 'artist': 'Justin Timberlake', 'age_limit': 18, } }, @@ -1765,6 +1769,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_alt_title = video_creator = None + def extract_meta(field): + return self._html_search_regex( + r']+class="title"[^>]*>\s*%s\s*\s*]*>\s*
  • (.+?)
  • \s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)
    ', video_webpage) @@ -2055,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': video_uploader_url, 'upload_date': upload_date, 'license': video_license, - 'creator': video_creator, + 'creator': video_creator or artist, 'title': video_title, - 'alt_title': video_alt_title, + 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -2080,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': series, 'season_number': season_number, 'episode_number': episode_number, + 'track': track, + 'artist': artist, } From 7e72694b5e0691adfd90f5d5ecd47647625511e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jun 2018 02:08:38 +0700 Subject: [PATCH 108/187] [youtube] Move metadata extraction after video availability check --- youtube_dl/extractor/youtube.py | 259 ++++++++++++++++---------------- 1 file changed, 128 insertions(+), 131 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 677907aba..b8cea1191 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -530,7 +530,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision`', + 'track': 'Tunnel Vision', 'artist': 'Justin Timberlake', 'age_limit': 18, } @@ -1698,136 +1698,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError('"rental" videos not supported. 
See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = m_episode.group('series') - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = 
self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) - - chapters = self._extract_chapters(description_original, video_duration) - def _extract_filesize(media_url): return int_or_none(self._search_regex( r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) @@ -2002,6 +1872,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + # uploader + video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + if video_uploader: + video_uploader = compat_urllib_parse_unquote_plus(video_uploader) + else: + self._downloader.report_warning('unable to extract uploader name') + + # uploader_id + video_uploader_id = None + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) + if mobj is not None: + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') + else: + self._downloader.report_warning('unable to extract uploader nickname') + + # thumbnail image + # We try first to get a high quality image: + m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: + self._downloader.report_warning('unable to extract video thumbnail') + video_thumbnail = None + else: # don't panic if we can't find it + video_thumbnail = 
compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = self._html_search_meta( + 'datePublished', video_webpage, 'upload date', default=None) + if not upload_date: + upload_date = self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], + video_webpage, 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + + m_music = re.search( + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + + def extract_meta(field): + return self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, + video_webpage, field, default=None) + + track = extract_meta('Song') + artist = extract_meta('Artist') + + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + + m_cat_container = self._search_regex( + r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', + video_webpage, 'categories', default=None) + if m_cat_container: + 
category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + + def _extract_count(count_name): + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') + + # subtitles + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) + + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + + chapters = self._extract_chapters(description_original, video_duration) + # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd_fatal = True From eb6793ba970351ecc8f8a579ff4e4665fb649f9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 02:23:45 +0700 Subject: [PATCH 109/187] [youtube] Update tests --- youtube_dl/extractor/youtube.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b8cea1191..89c8b7f8d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -601,7 +601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'IB3lcPjvWLA', 
'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', - 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'description': 'md5:1900ed86ee514927b9e00fbead6969a5', 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', @@ -642,7 +642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 219, 'upload_date': '20100909', - 'uploader': 'The Amazing Atheist', + 'uploader': 'TJ Kirk', 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'license': 'Standard YouTube License', @@ -672,10 +672,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 247, + 'duration': 246, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -737,7 +737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫艾倫', + 'uploader': '孫ᄋᄅ', 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, @@ -764,7 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, @@ -773,8 +773,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:32', + 'formats': 
'mincount:31', }, + 'skip': 'not actual anymore', }, # DASH manifest with segment_list { @@ -889,7 +890,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk', + 'alt_title': 'Dark Walk - Position Music', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', @@ -897,7 +898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', 'license': 'Standard YouTube License', - 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', + 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk - Position Music', + 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -954,7 +957,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'duration': 4060, 'upload_date': '20151119', - 'uploader': 'Bernie 2016', + 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', @@ -989,6 +992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', }, { # YouTube Red video with episode data @@ -997,7 +1001,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'description': 'md5:25b78d2f64ae81719f5c96319889b736', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1030,7 +1034,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', 'license': 'Standard YouTube License', - 'view_count': int, }, 'params': { 'skip_download': True, From 6d155707e67dc9e3bc4e118dc7d6a06bf8af471f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 04:07:59 +0700 Subject: [PATCH 110/187] [bbc] Add support for bbcthree (closes #16612) --- youtube_dl/extractor/bbc.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8b20c03d6..30a63a24e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..utils import ( float_or_none, get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, try_get, @@ -772,6 +773,17 @@ class BBCIE(BBCCoUkIE): # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } }] @classmethod @@ -994,6 +1006,36 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not 
programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From 0a10f50e2f1fb8ed80a3707970ba44593cfff8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 04:30:33 +0700 Subject: [PATCH 111/187] [chaturbate] Use geo verification headers --- youtube_dl/extractor/chaturbate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index e3eba4be9..e2b828d8a 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url, video_id, headers=self.geo_verification_headers()) m3u8_urls = [] From b6b2ccb72fb7da7563078d4bf047d1622ba89553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 15:53:20 +0700 Subject: [PATCH 112/187] [twitter:card] Extract guest token (closes #16609) --- youtube_dl/extractor/twitter.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 4a77e792e..f3fccbf1d 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -229,11 +229,22 @@ class TwitterCardIE(TwitterBaseIE): break if not formats: + headers = { + 'Authorization': 'Bearer 
AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Referer': url, + } + ct0 = self._get_cookies(url).get('ct0') + if ct0: + headers['csrf_token'] = ct0.value + guest_token = self._download_json( + 'https://api.twitter.com/1.1/guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = guest_token + self._set_cookie('api.twitter.com', 'gt', guest_token) config = self._download_json( 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, - video_id, headers={ - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE', - }) + video_id, headers=headers) track = config['track'] vmap_url = track.get('vmapUrl') if vmap_url: From 77053237c56341806c759f809a84975ede8141d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 15:58:12 +0700 Subject: [PATCH 113/187] [twitter:card] Generalize base API URL --- youtube_dl/extractor/twitter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index f3fccbf1d..de41065d6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE): }, ] + _API_BASE = 'https://api.twitter.com/1.1' + def _parse_media_info(self, media_info, video_id): formats = [] for media_variant in media_info.get('variants', []): @@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE): main_script, 'bearer token') # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id api_data = self._download_json( - 'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, + '%s/statuses/show/%s.json' % (self._API_BASE, video_id), video_id, 'Downloading API data', headers={ 
'Authorization': 'Bearer ' + bearer_token, @@ -237,13 +239,13 @@ class TwitterCardIE(TwitterBaseIE): if ct0: headers['csrf_token'] = ct0.value guest_token = self._download_json( - 'https://api.twitter.com/1.1/guest/activate.json', video_id, + '%s/guest/activate.json' % self._API_BASE, video_id, 'Downloading guest token', data=b'', headers=headers)['guest_token'] headers['x-guest-token'] = guest_token self._set_cookie('api.twitter.com', 'gt', guest_token) config = self._download_json( - 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % video_id, + '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), video_id, headers=headers) track = config['track'] vmap_url = track.get('vmapUrl') From c3023e9f2e9e84942df0789b8bc799b6ea51d690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jun 2018 17:09:20 +0700 Subject: [PATCH 114/187] [camtube] Add extractor --- youtube_dl/extractor/camtube.py | 69 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/camtube.py diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 000000000..c7d40f849 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + }, + 'params': { + 'skip_download': True, 
+ }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b05afd101..6df829054 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -147,6 +147,7 @@ from .camdemy import ( CamdemyFolderIE ) from .cammodels import CamModelsIE +from .camtube import CamTubeIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE From c6c478f40d4afa3fb97bf25eeff902ce95a5aa88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:16:33 +0700 Subject: [PATCH 115/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index edd190051..db6f8a3ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors ++ [camtube] Add support 
for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + version 2018.06.02 Core From f7560859a3e25ccaa74123428d42f821299a2bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:33:54 +0700 Subject: [PATCH 116/187] [devscripts/update-copyright] Update copyright year --- devscripts/gh-pages/update-copyright.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py index e6c3abc8d..61487f925 100755 --- a/devscripts/gh-pages/update-copyright.py +++ b/devscripts/gh-pages/update-copyright.py @@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() - newc = re.sub(r'(?P<copyright>Copyright © 2006-)(?P<year>[0-9]{4})', 'Copyright © 2006-' + year, content) + newc = re.sub(r'(?P<copyright>Copyright © 2011-)(?P<year>[0-9]{4})', 'Copyright © 2011-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: From 94418c8eb3e458a16fa7301c0987dec5f04faa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 02:41:53 +0700 Subject: [PATCH 117/187] release 2018.06.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 10efa29a4..aa7686efd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run 
`youtube-dl --version` and ensure your version is *2018.06.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.02 +[debug] youtube-dl version 2018.06.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index db6f8a3ad..5375e03fc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.04 Extractors + [camtube] Add support for camtube.co diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8ce13581b..e1a9f2236 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -129,6 +129,7 @@ - **Camdemy** - **CamdemyFolder** - **CamModels** + - **CamTube** - **CamWithHer** - **canalc2.tv** - 
**Canalplus**: mycanal.fr and piwiplus.fr diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5ace1b355..ab3419f0c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.02' +__version__ = '2018.06.04' From 2e190c2ad9a985940fa0ca2cb7a09398319dd2c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Jun 2018 23:51:25 +0700 Subject: [PATCH 118/187] [rbmaradio] Add support for 192k format (closes #16631) --- youtube_dl/extractor/rbmaradio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 9c4d72bbd..ae7413fb5 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -53,7 +53,7 @@ class RBMARadioIE(InfoExtractor): 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', - } for abr in (96, 128, 256)] + } for abr in (96, 128, 192, 256)] self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) From d7be7053082055a001b788453d66131a62692b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 00:17:26 +0700 Subject: [PATCH 119/187] [pbs] Add another cove id pattern (closes #15373) --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a28ee17ca..8d6f2dd3d 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -360,6 +360,21 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': 
r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -422,6 +437,7 @@ class PBSIE(InfoExtractor): r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ ] media_id = self._search_regex( From 06ea7bdd99c16b718c977a652f0ef66341ac2b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 02:55:54 +0700 Subject: [PATCH 120/187] [nexx] Add support for free cdn (closes #16538) --- youtube_dl/extractor/nexx.py | 209 ++++++++++++++++++++++++----------- 1 file changed, 144 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 5e46a75c0..6f40d7f89 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -77,6 +77,21 @@ class NexxIE(InfoExtractor): 'timestamp': 1518614955, 'upload_date': '20180214', }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -141,6 +156,127 @@ class 
NexxIE(InfoExtractor): self._handle_error(result) return result['result'] + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + formats = [{ + 'url': 'http://%s%s2500_var.mp4' % (stream_data['cdnPathHTTP'], ps), + 'format_id': '%s-http' % cdn, + }] + + def make_url(root, protocol): + t = 'http://' + root + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % int(p[0]) if stream_data['applyAzureStructure'] == 1 else '' + + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(p) + ',' + t += '.mp4.csmil/master.m3u8' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + t = '/'.join(k) + t += '/asset.ism/manifest.' 
+ ('m3u8' if protocol == 'hls' else 'mpd') + '?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + a = '%s%s%s.mp4:%s' % (h, p[1], p0(p), int(p[0]) * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + return t + + formats.extend(self._extract_mpd_formats( + make_url(stream_data['cdnPathDASH'], 'dash'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + make_url(stream_data['cdnPathHLS'], 'hls'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + 
video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') @@ -220,72 +356,15 @@ class NexxIE(InfoExtractor): general = video['general'] title = general['title'] - stream_data = video['streamdata'] - language = general.get('language_raw') or '' + cdn = video['streamdata']['cdnType'] - # TODO: reverse more cdns - - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - 
azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False self._sort_formats(formats) From 2e6975306a51d3ee7dae71bc93d57951487ea6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Jun 2018 02:59:25 +0700 Subject: [PATCH 121/187] [nexx] Update tests --- youtube_dl/extractor/nexx.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 6f40d7f89..40946d26b 100644 --- 
a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -29,14 +29,13 @@ class NexxIE(InfoExtractor): _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '828cea195be04e66057b846288295ba1', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', 'info_dict': { 'id': '128907', 'ext': 'mp4', 'title': 'Stiftung Warentest', 'alt_title': 'Wie ein Test abläuft', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'release_year': 2013, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2509, @@ -62,6 +61,7 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { # does not work via arc 'url': 'nexx:741:1269984', @@ -71,7 +71,6 @@ class NexxIE(InfoExtractor): 'ext': 'mp4', 'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 607, 'timestamp': 1518614955, From 9afd74d70558d7db9e2e13d2afa84746e61f193c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Jun 2018 01:02:46 +0100 Subject: [PATCH 122/187] [nexx] extract free cdn http formats --- youtube_dl/extractor/nexx.py | 76 +++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 40946d26b..82d526c22 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -168,42 +168,54 @@ class NexxIE(InfoExtractor): ps += '/%s/%s' % (s[0:2], s[2:4]) ps += '/%s/%s_' % (video_id, hash) - formats = [{ - 'url': 'http://%s%s2500_var.mp4' % (stream_data['cdnPathHTTP'], ps), - 'format_id': '%s-http' % cdn, - }] + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] - def make_url(root, protocol): - t = 'http://' + root + ps - fd = 
stream_data['azureFileDistribution'].split(',') - cdn_provider = stream_data['cdnProvider'] + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' - def p0(p): - return '_%s' % int(p[0]) if stream_data['applyAzureStructure'] == 1 else '' + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False - if cdn_provider == 'ak': - t += ',' - for i in fd: - p = i.split(':') - t += p[1] + p0(p) + ',' - t += '.mp4.csmil/master.m3u8' - elif cdn_provider == 'ce': - k = t.split('/') - h = k.pop() - t = '/'.join(k) - t += '/asset.ism/manifest.' 
+ ('m3u8' if protocol == 'hls' else 'mpd') + '?dcp_ver=aos4&videostream=' - for i in fd: - p = i.split(':') - a = '%s%s%s.mp4:%s' % (h, p[1], p0(p), int(p[0]) * 1000) - t += a + ',' - t = t[:-1] + '&audiostream=' + a.split(':')[0] - return t - - formats.extend(self._extract_mpd_formats( - make_url(stream_data['cdnPathDASH'], 'dash'), video_id, - mpd_id='%s-dash' % cdn, fatal=False)) + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) formats.extend(self._extract_m3u8_formats( - make_url(stream_data['cdnPathHLS'], 'hls'), video_id, 'mp4', + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) return formats From 6ae36035d9ba2eae88070eb647f45cc4ceeb0998 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Jun 2018 00:41:08 +0100 Subject: [PATCH 123/187] [tv4] fix format extraction(closes #16650) --- youtube_dl/extractor/tv4.py | 62 +++++++++++++------------------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index cfcce020a..51923e44a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_iso8601, - try_get, - determine_ext, ) @@ -78,42 +77,25 @@ class TV4IE(InfoExtractor): title = info['title'] - subtitles = {} - formats = [] - # http formats are linked with unresolvable host - for kind in ('hls3', ''): - data = self._download_json( - 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, - video_id, 'Downloading sources JSON', query={ - 'protocol': kind, - 'videoFormat': 'MP4+WEBVTT', - }) - items = try_get(data, lambda x: x['playback']['items']['item']) - 
if not items: - continue - if isinstance(items, dict): - items = [items] - for item in items: - manifest_url = item.get('url') - if not isinstance(manifest_url, compat_str): - continue - ext = determine_ext(manifest_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_akamai_formats( - manifest_url, video_id, { - 'hls': 'tv4play-i.akamaihd.net', - })) - elif ext == 'webvtt': - subtitles = self._merge_subtitles( - subtitles, { - 'sv': [{ - 'url': manifest_url, - 'ext': 'vtt', - }]}) + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_ism_formats( + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): self.raise_geo_restricted(countries=self._GEO_COUNTRIES) @@ -124,7 +106,7 @@ class TV4IE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, - 'subtitles': subtitles, + # 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), From ff2e4862210d69f2cee509002a9d96ba3a0877d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 9 Jun 2018 02:53:04 +0700 Subject: [PATCH 124/187] [inc] Add support for 
another embed schema (closes #16666) --- youtube_dl/extractor/inc.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 241ec83c4..8dee143ca 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -21,6 +21,21 @@ class IncIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # div with id=kaltura_player_1_kqs38cgm + 'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html', + 'info_dict': { + 'id': '1_kqs38cgm', + 'ext': 'mp4', + 'title': 'Branson: "In the end, you have to say, Screw it. Just do it."', + 'description': 'md5:21b832d034f9af5191ca5959da5e9cb6', + 'timestamp': 1364403232, + 'upload_date': '20130327', + 'uploader_id': 'incdigital@inc.com', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'only_matching': True, @@ -31,9 +46,12 @@ class IncIE(InfoExtractor): webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( - r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, + 'partner id', default='1034971') - kaltura_id = self._parse_json(self._search_regex( + kaltura_id = self._search_regex( + r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id', + default=None, group='id') or self._parse_json(self._search_regex( r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] From 9d581efe053a22042ea7530a8e92097d0a91eea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Jun 2018 00:26:16 +0700 Subject: [PATCH 125/187] [npo] Extend _VALID_URL (closes #16682) --- youtube_dl/extractor/npo.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git 
a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ff2153387..cb8319f0d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE): https?:// (?:www\.)? (?: - npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) @@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) @@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE): class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', @@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor): } } + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) From cc37cc3f9935dfcdd3c3b73ee1419c1f8592e056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 01:55:16 +0700 Subject: [PATCH 126/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 
5375e03fc..a1db9df4e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + version 2018.06.04 Extractors From e8c6afc16877a6b975895bf236c8cdaff01733cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 01:57:30 +0700 Subject: [PATCH 127/187] release 2018.06.11 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index aa7686efd..d5be8003b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.04*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.11** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.04 +[debug] youtube-dl version 2018.06.11 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a1db9df4e..b808ad6ba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.11 Extractors * [npo] Extend URL regular expression and add support for npostart.nl (#16682) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ab3419f0c..e72f42cf2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.04' +__version__ = '2018.06.11' From d253df2f65b15d1e3bb5e9703b05d98532337c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 02:40:17 +0700 Subject: [PATCH 128/187] [wimp] Fix Youtube embeds extraction --- youtube_dl/extractor/wimp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index c022fb33e..3dab9145b 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -36,7 +36,8 @@ class WimpIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youtube_id = 
self._search_regex( - r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + r'data-id=["\']([0-9A-Za-z_-]{11})'), webpage, 'video URL', default=None) if youtube_id: return { From 93cffb1444131375e822b0c01e23dc4819911419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Jun 2018 03:08:36 +0700 Subject: [PATCH 129/187] [nrk] Update API hosts and try all previously known ones (closes #16690) --- youtube_dl/extractor/nrk.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..7157e2390 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -16,12 +16,22 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _api_host = None + def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), - video_id, 'Downloading mediaelement JSON') + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id @@ -191,7 +201,7 @@ class NRKIE(NRKBaseIE): ) (?P<id>[^?#&]+) ''' - _API_HOST = 'v8-psapi.nrk.no' + _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -237,8 +247,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? 
''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' - + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '4e9ca6629f09e588ed240fb11619922a', From b2df66aecab9faea206cb72e715dfa1394a6d182 Mon Sep 17 00:00:00 2001 From: Thomas van der Berg <ik@thomasvanderberg.nl> Date: Wed, 31 Jan 2018 23:00:30 +0100 Subject: [PATCH 130/187] [tvnet] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vtv.py | 91 ++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/vtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6df829054..e6d1fe70e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1306,6 +1306,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtv import VTVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py new file mode 100644 index 000000000..a9683dd85 --- /dev/null +++ b/youtube_dl/extractor/vtv.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + +from ..utils import extract_attributes + +class VTVIE(InfoExtractor): + _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?' + _TESTS = [{ + # Livestream. Channel: VTV 1 + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:https?://.*\.png$', + } + }, { + # Downloading a video. 
+ 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': '5263c63d738569ed507980f1e49ebc03', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net', + 'thumbnail': r're:https?://.*\.JPG$', + } + }, { + # Radio live stream. Channel: VOV 1 + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'vcodec': 'none', + 'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:https?://.*\.png$', + } + + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', default=None, fatal=False) + if title is None: + title = self._og_search_title(webpage) + title.strip() + + mediaplayer_div = self._search_regex(r'(]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element') + mediaplayer_div_attributes = extract_attributes(mediaplayer_div) + + thumbnail = mediaplayer_div_attributes.get("data-image") + + json_url = mediaplayer_div_attributes["data-file"] + video_streams = self._download_json(json_url, video_id) + + + # get any working playlist from streams. 
Currently there's 2 and the first always works, + # but you never know in the future + for stream in video_streams: + formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False) + if formats: + break + + # better support radio streams + if title.startswith("VOV"): + for f in formats: + f["ext"] = "m4a" + f["vcodec"] = "none" + + if "/video/" in url or "/radio/" in url: + is_live = False + elif "/kenh-truyen-hinh/" in url: + is_live = True + else: + is_live = None + + if is_live: + title = self._live_title(title) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + 'is_live': is_live, + } From a572ae6114deca4e8f2f0365ca7091749f01deaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:35:23 +0700 Subject: [PATCH 131/187] [tvnet] Improve and fix issues (closes #15462) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvnet.py | 133 +++++++++++++++++++++++++++++ youtube_dl/extractor/vtv.py | 91 -------------------- 3 files changed, 134 insertions(+), 92 deletions(-) create mode 100644 youtube_dl/extractor/tvnet.py delete mode 100644 youtube_dl/extractor/vtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e6d1fe70e..d4583b8e4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1139,6 +1139,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, @@ -1306,7 +1307,6 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE -from .vtv import VTVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py new file mode 100644 index 000000000..0ec2da4da --- /dev/null +++ b/youtube_dl/extractor/tvnet.py @@ -0,0 
+1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P[0-9]+)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'([^<]+)<', webpage, 'title') + title = 
re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = stream.get('url') + if (stream_url in stream_urls or not stream_url or + not isinstance(stream_url, compat_str)): + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py deleted file mode 100644 index a9683dd85..000000000 --- a/youtube_dl/extractor/vtv.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -import re - -from ..utils import extract_attributes - -class VTVIE(InfoExtractor): - _VALID_URL = 
r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?' - _TESTS = [{ - # Livestream. Channel: VTV 1 - 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', - 'info_dict': { - 'id': '1011', - 'ext': 'mp4', - 'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - }, { - # Downloading a video. - 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', - 'md5': '5263c63d738569ed507980f1e49ebc03', - 'info_dict': { - 'id': '109788', - 'ext': 'mp4', - 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net', - 'thumbnail': r're:https?://.*\.JPG$', - } - }, { - # Radio live stream. Channel: VOV 1 - 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', - 'info_dict': { - 'id': '1014', - 'ext': 'm4a', - 'vcodec': 'none', - 'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', default=None, fatal=False) - if title is None: - title = self._og_search_title(webpage) - title.strip() - - mediaplayer_div = self._search_regex(r'(]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element') - mediaplayer_div_attributes = extract_attributes(mediaplayer_div) - - thumbnail = mediaplayer_div_attributes.get("data-image") - - json_url = mediaplayer_div_attributes["data-file"] - video_streams = self._download_json(json_url, video_id) - - - # get any working playlist from streams. 
Currently there's 2 and the first always works, - # but you never know in the future - for stream in video_streams: - formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False) - if formats: - break - - # better support radio streams - if title.startswith("VOV"): - for f in formats: - f["ext"] = "m4a" - f["vcodec"] = "none" - - if "/video/" in url or "/radio/" in url: - is_live = False - elif "/kenh-truyen-hinh/" in url: - is_live = True - else: - is_live = None - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'is_live': is_live, - } From 0645be49cb06f54135a9b92556207c1c468853ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:41:23 +0700 Subject: [PATCH 132/187] [inc] PEP 8 --- youtube_dl/extractor/inc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py index 8dee143ca..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/youtube_dl/extractor/inc.py @@ -52,7 +52,7 @@ class IncIE(InfoExtractor): kaltura_id = self._search_regex( r'id=(["\'])kaltura_player_(?P.+?)\1', webpage, 'kaltura id', default=None, group='id') or self._parse_json(self._search_regex( - r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), display_id)['vid_kaltura_id'] return self.url_result( From e51752754d0dfe3e6597634c0c0f65508c55bcb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:50:43 +0700 Subject: [PATCH 133/187] [tvnet] Improve video id extraction --- youtube_dl/extractor/tvnet.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 0ec2da4da..1083d2730 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ 
-12,7 +12,7 @@ from ..utils import ( class TVNetIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>[0-9]+)/' _TESTS = [{ # video 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', @@ -36,6 +36,18 @@ class TVNetIE(InfoExtractor): 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', 'is_live': False, }, + }, { + 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', + 'info_dict': { + 'id': '129999', + 'ext': 'mp4', + 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, }, { # live stream 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', @@ -62,6 +74,9 @@ class TVNetIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', + 'only_matching': True, }] def _real_extract(self, url): From 7dc9c60b4b848f4b1d38b2995c7d421c89be93c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 12 Jun 2018 02:05:58 +0700 Subject: [PATCH 134/187] [tvnet] Fix _VALID_URL --- youtube_dl/extractor/tvnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py index 1083d2730..2b2630b91 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/youtube_dl/extractor/tvnet.py @@ -12,7 +12,7 @@ from ..utils import ( class TVNetIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)' _TESTS = [{ # video 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', From dc53c7863481f927bb011ad28784facec428ce22 Mon Sep 17 00:00:00 2001 From:
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 12 Jun 2018 02:06:30 +0700 Subject: [PATCH 135/187] [crackle] Add support for sonycrackle.com (closes #16698) --- youtube_dl/extractor/crackle.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index fc014f8b5..f4a616455 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -19,8 +19,8 @@ from ..utils import ( class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' - _TEST = { + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' + _TESTS = [{ # geo restricted to CA 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { @@ -45,7 +45,10 @@ class CrackleIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 5d6c81b63f08f1ea0c2d01579c99fe7c8f0aa6e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 12 Jun 2018 03:12:29 +0700 Subject: [PATCH 136/187] [downloader/http] Fix resume when writing to stdout (closes #16699) --- youtube_dl/downloader/http.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a22875f69..5b1e96013 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -217,10 +217,11 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - if ctx.tmpfilename != '-': + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: ctx.stream.close() ctx.stream = None - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = byte_counter if to_stdout else
os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: From e0671819e71744b6a9a32fc3f4d5fbc8aca8a8f1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Jun 2018 13:07:20 +0100 Subject: [PATCH 137/187] [abc] fix ABC IView extraction and add support for livestreams(closes #16704)(closes #12354) --- youtube_dl/extractor/abc.py | 42 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 512f04684..266d76481 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -105,22 +105,22 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZY9247A021S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Gaston's Visit", + 'title': "Gaston's Birthday", 'series': "Ben And Holly's Little Kingdom", - 'description': 'md5:18db170ad71cf161e006a4c688e33155', - 'upload_date': '20180318', + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', 'uploader_id': 'abc4kids', - 'timestamp': 1521400959, + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_params = self._parse_json(self._search_regex( - r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) - title = 
video_params.get('title') or video_params['seriesTitle'] - stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - house_number = video_params.get('episodeHouseNumber') - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( int(time.time()), house_number) sig = hmac.new( - 'android.content.res.Resources'.encode('utf-8'), + b'android.content.res.Resources', path.encode('utf-8'), hashlib.sha256).hexdigest() token = self._download_webpage( 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) @@ -169,18 +168,21 @@ class ABCIViewIE(InfoExtractor): 'ext': 'vtt', }] + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + return { 'id': video_id, - 'title': unescapeHTML(title), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), - 'episode': self._html_search_meta('episode_title', webpage, default=None), 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, + 
'is_live': is_live, } From 9aca7fe6a3551df6379079f570e5a8bdf517c670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 20:25:50 +0700 Subject: [PATCH 138/187] [abc:iview] Extract more series metadata --- youtube_dl/extractor/abc.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 266d76481..4ac323bf6 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -181,6 +181,11 @@ class ABCIViewIE(InfoExtractor): 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, From f15f7a674b309eff00a66d16449f8d5abb1c6682 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 14:46:00 +0100 Subject: [PATCH 139/187] [dailymotion] add support for password protected videos(closes #9789) --- youtube_dl/extractor/dailymotion.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index de27fffd4..0afb6a158 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,9 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json +import base64 +import hashlib import itertools +import json +import random +import re +import string +import struct from .common import InfoExtractor @@ -64,7 +69,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 
0, - 'view_count': int, }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -167,6 +171,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + if metadata.get('error', {}).get('type') == 'password_protected': + password = self._downloader.params.get('videopassword') + if password: + r = int(metadata['id'][1:], 36) + us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') + t = ''.join(random.choice(string.ascii_letters) for i in range(10)) + n = us64e(struct.pack('I', r)) + i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) + metadata = self._download_json( + 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) + self._check_error(metadata) formats = [] @@ -302,8 +317,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): error = info.get('error') - if info.get('error') is not None: - title = error['title'] + if error: + title = error.get('title') or error['message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': self.raise_geo_restricted(msg=title) From 18d66f04107b584c2d6ee6c175c44c7f2d81ecba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 15:12:42 +0100 Subject: [PATCH 140/187] [dailymotion] use compat_struct_pack --- youtube_dl/extractor/dailymotion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 0afb6a158..9a74906cb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,10 +8,9 @@ import json import random import re import string -import struct from .common import InfoExtractor - +from ..compat import compat_struct_pack from ..utils import ( determine_ext, error_to_compat_str, @@ -177,7 +176,7 @@ class 
DailymotionIE(DailymotionBaseInfoExtractor): r = int(metadata['id'][1:], 36) us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') t = ''.join(random.choice(string.ascii_letters) for i in range(10)) - n = us64e(struct.pack('I', r)) + n = us64e(compat_struct_pack('I', r)) i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) metadata = self._download_json( 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) From aa56061627f9871b4793414b71a26976befd3a9c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Jun 2018 16:46:59 +0100 Subject: [PATCH 141/187] [discoverynetworks] Add support for disco-api videos(closes #16724) --- youtube_dl/extractor/discoverynetworks.py | 19 ++- youtube_dl/extractor/dplay.py | 137 +++++++++++----------- 2 files changed, 87 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index b6653784c..fba1ef221 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE +from .dplay import DPlayIE from ..compat import ( compat_parse_qs, compat_urlparse, @@ -12,8 +12,13 @@ from ..compat import ( from ..utils import smuggle_url -class DiscoveryNetworksDeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P\d+)|(?:[^/]+/)*videos/(?P[^/?#]+))' +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ + (?: + .*\#(?P<id>\d+)| + (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| + programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) + )''' _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor): def _real_extract(self, url): 
mobj = re.match(self._VALID_URL, url) + alternate_id = mobj.group('alternate_id') + if alternate_id: + self._initialize_geo_bypass({ + 'countries': ['DE'], + }) + return self._get_disco_api_info( + url, '%s/%s' % (mobj.group('programme'), alternate_id), + 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') brightcove_id = mobj.group('id') if not brightcove_id: title = mobj.group('title') diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 8e0374320..fe47f6dce 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -97,6 +97,75 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _get_disco_api_info(self, url, display_id, disco_host, realm): + disco_base = 'https://' + disco_host + token = self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + headers = { + 'Referer': url, + 'Authorization': 'Bearer ' + token, + } + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers=headers, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id, headers=headers)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + 
self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') @@ -113,72 +182,8 @@ class DPlayIE(InfoExtractor): if not video_id: host = mobj.group('host') - disco_base = 'https://disco-api.%s' % host - self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', - query={ - 'realm': host.replace('.', ''), - }) - video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, - headers={ - 'Referer': url, - 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', - }, query={ - 'include': 'show' - }) - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'] - formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id)['data']['attributes']['streaming'].items(): - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - } + return self._get_disco_api_info( + url, display_id, 'disco-api.' 
+ host, host.replace('.', '')) info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), From 03eef0f03259bfcae284a56d00035950dd54f316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 01:22:42 +0700 Subject: [PATCH 142/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index b808ad6ba..11812b195 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version <unreleased> + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + version 2018.06.11 Extractors From c797db4a2fb7c8e41485bac74fe7f78295bab556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 01:24:53 +0700 Subject: [PATCH 143/187] release 2018.06.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d5be8003b..1cfb54bfd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.11 +[debug] youtube-dl version 2018.06.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 11812b195..062000594 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.14 Core * [downloader/http] Fix retry on error when streaming to stdout (#16699) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1a9f2236..705279ac1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -893,6 +893,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** + - **TVNet** - **TVNoe** - **TVNow** - **TVNowList** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e72f42cf2..1533dceb4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import 
unicode_literals -__version__ = '2018.06.11' +__version__ = '2018.06.14' From 61cb66830f5097b528aab381eb6b343a89f73cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Jun 2018 22:40:30 +0700 Subject: [PATCH 144/187] [bilibili] Restrict cid regex (closes #16638, closes #16734) --- youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3e3348ef5..4d6b051fe 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -114,7 +114,7 @@ class BiliBiliIE(InfoExtractor): if 'anime/' not in url: cid = self._search_regex( - r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', default=None ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', From 9b0b62753432244fd062d99cc5dc604d6bad7877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Jun 2018 02:59:15 +0700 Subject: [PATCH 145/187] [downloader/rtmp] Fix downloading in verbose mode (closes #16736) --- youtube_dl/downloader/rtmp.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 9e0ddbb18..fbb7f51b0 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,13 +24,12 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() + resume_percent = None + resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - - def dl(): - resume_percent = None - resume_downloaded_data_len = None - proc_stderr_closed = False + proc_stderr_closed = False + try: while not proc_stderr_closed: # read line from stderr line = '' @@ -90,12 +89,8 @@ class RtmpFD(FileDownloader): self.to_screen('') 
cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - - try: - dl() finally: proc.wait() - if not cursor_in_new_line: self.to_screen('') return proc.returncode From 87f89dacddfa46399aea9252ca078f5f386dce38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 16 Jun 2018 02:55:20 +0700 Subject: [PATCH 146/187] [pbs] Improve extraction (closes #16623, closes #16684) --- youtube_dl/extractor/pbs.py | 57 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -375,6 +376,35 @@ class PBSIE(InfoExtractor): }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -438,6 +468,7 @@ class 
PBSIE(InfoExtractor): r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex( @@ -472,7 +503,8 @@ class PBSIE(InfoExtractor): if not url: url = self._og_search_url(webpage) - mobj = re.match(self._VALID_URL, url) + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: @@ -482,13 +514,27 @@ class PBSIE(InfoExtractor): url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( - r'<div\s+id="video_([0-9]+)"', player_page, 'video ID') + r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', + default=None) + if not video_id: + video_info = self._extract_video_data( + player_page, 'video data', display_id) + video_id = compat_str( + video_info.get('id') or video_info['contentID']) else: video_id = mobj.group('id') display_id = video_id return video_id, display_id, None, description + def _extract_video_data(self, string, name, video_id, fatal=True): + return self._parse_json( + self._search_regex( + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + r'window\.videoBridge\s*=\s*({.+?});'], + string, name, default='{}'), + video_id, transform_source=js_to_json, fatal=fatal) + def _real_extract(self, url): video_id, display_id, upload_date, description = self._extract_webpage(url) @@ -519,11 +565,8 @@ class PBSIE(InfoExtractor): 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) if player: - video_info = self._parse_json( - self._search_regex( - [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', 
r'window\.videoBridge\s*=\s*({.+?});'], - player, '%s video data' % page, default='{}'), - display_id, transform_source=js_to_json, fatal=False) + video_info = self._extract_video_data( + player, '%s video data' % page, display_id, fatal=False) if video_info: extract_redirect_urls(video_info) if not info: From 81c5df4f2ce33cff5d47b9ee29edf08b50998a53 Mon Sep 17 00:00:00 2001 From: Urgau <lolo.branstett@numericable.fr> Date: Sat, 16 Jun 2018 00:08:44 +0200 Subject: [PATCH 147/187] [vidzi] Fix extraction (closes #16678) --- youtube_dl/extractor/vidzi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9026e778c..d70283479 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -54,7 +54,8 @@ class VidziIE(InfoExtractor): self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=js_to_json) + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) if jwplayer_data: break From 734d461ca04a9f271dd463aa75d44ac82377057e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 16 Jun 2018 21:14:36 +0700 Subject: [PATCH 148/187] [expressen] Add extractor --- youtube_dl/extractor/expressen.py | 77 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 78 insertions(+) create mode 100644 youtube_dl/extractor/expressen.py diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py new file mode 100644 index 000000000..f61178012 --- /dev/null +++ b/youtube_dl/extractor/expressen.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, + unified_timestamp, +) + + +class ExpressenIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', + 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'info_dict': { + 'id': '8690962', + 'ext': 'mp4', + 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', + 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 788, + 'timestamp': 1526639109, + 'upload_date': '20180518', + }, + }, { + 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + def extract_data(name): + return self._parse_json( + self._search_regex( + r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name, + webpage, 'info', group='value'), + display_id, transform_source=unescapeHTML) + + info = extract_data('video-tracking-info') + video_id = info['videoId'] + + data = extract_data('article-data') + stream = data['stream'] + + if determine_ext(stream) == 'm3u8': + formats = self._extract_m3u8_formats( + stream, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{ + 'url': stream, + }] + self._sort_formats(formats) + + title = info.get('titleRaw') or data['title'] + description = info.get('descriptionRaw') + thumbnail = info.get('socialMediaImage') or data.get('image') + duration = int_or_none(info.get('videoTotalSecondsDuration') or + data.get('totalSecondsDuration')) + timestamp = unified_timestamp(info.get('publishDate')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py 
b/youtube_dl/extractor/extractors.py index d4583b8e4..c3e6daa24 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -335,6 +335,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE +from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE from .facebook import ( From 764cd4e6f3450997eb0499b68b17b580a5e074f3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 02:43:24 +0100 Subject: [PATCH 149/187] [rtbf] improve extraction - add support for audio and live streams(closes #11923)(closes #9638) - extract HLS, DASH and all HTTP formats - extract subtitles - fixup specific http urls(fixes #16101) --- youtube_dl/extractor/rtbf.py | 127 ++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 28cc5522d..acff9766a 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, ExtractorError, + float_or_none, + int_or_none, + strip_or_none, ) @@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor): (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*id= + auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', + 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', - 'duration': 3099, + 'description': '(du 25/04/2014)', + 'duration': 3099.54, 'upload_date': '20140425', - 'timestamp': 1398456336, - 'uploader': 
'rtbfsport', + 'timestamp': 1398456300, } }, { # geo restricted @@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor): }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, }] _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' _PROVIDERS = { @@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id) + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) error = data.get('error') if error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - data = data['data'] - provider = data.get('provider') if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' 
formats = [] - for key, format_id in self._QUALITIES: - format_url = data.get(key + 'Url') - if format_url: + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats.copy(): + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': fix_url(format_url), + 'height': height, }) - thumbnails = [] - for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items(): - if thumbnail_id != 'default': - thumbnails.append({ - 'url': self._IMAGE_HOST + thumbnail_url, - 'id': thumbnail_id, - }) + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) return { - 'id': video_id, + 'id': media_id, 'formats': formats, - 'title': data['title'], - 
'description': data.get('description') or data.get('subtitle'), - 'thumbnails': thumbnails, - 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': int_or_none(data.get('created')), - 'view_count': int_or_none(data.get('viewCount')), - 'uploader': data.get('channel'), - 'tags': data.get('tags'), + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, } From 18825117545690499dc7064cd5ba207ca5ca3e23 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 12:01:14 +0100 Subject: [PATCH 150/187] [6play] add support for rtlplay.be and extract hd usp formats --- youtube_dl/extractor/sixplay.py | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 69951e387..1f8469a90 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -19,29 +19,33 @@ from ..utils import ( class SixPlayIE(InfoExtractor): IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450', - 'md5': '42310bffe4ba3982db112b9cd3467328', + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { - 'id': '11638450', + 'id': '12041051', 'ext': 'mp4', - 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6', - 'description': 
'md5:308853f6a5f9e2d55a30fc0654de415f', - 'duration': 39, - 'series': 'Le meilleur pâtissier', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, - 'params': { - 'skip_download': True, - }, - } + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + }.get(domain, ('6play', 'm6web')) data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id, - video_id, query={ + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 'csa': 5, 'with': 'clips', }) @@ -65,7 +69,12 @@ class SixPlayIE(InfoExtractor): subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage(asset_url, video_id, fatal=False) + if not urlh: + continue + asset_url = urlh.geturl() asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', From 8b183bd5f800792cfc37da8ef2383fb5ba88195c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 15:53:29 +0100 Subject: [PATCH 151/187] [tf1] try all supported adaptive urls --- youtube_dl/extractor/tf1.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e595c4a69..903f47380 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -19,6 +19,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { From 0adf213d8cce21e1a6ca6be7df532d67d184fbe2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 15:56:52 +0100 Subject: [PATCH 152/187] [wat] try all supported adaptive urls --- youtube_dl/extractor/wat.py | 41 +++++++++++++++---------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 20fef1f04..8ef3e0906 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -19,7 +19,6 @@ class WatIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', 'ext': 'mp4', @@ -28,10 +27,15 @@ class WatIE(InfoExtractor): 'upload_date': '20140819', 'duration': 120, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', 'info_dict': { 'id': '11713075', 'ext': 'mp4', @@ -98,38 +102,25 @@ class WatIE(InfoExtractor): formats = [] try: + alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id) m3u8_url = manifest_urls.get('hls') if m3u8_url: m3u8_url = 
remove_bitrate_limit(m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + for m3u8_alt_url in alt_urls(m3u8_url): + formats.extend(self._extract_m3u8_formats( + m3u8_alt_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - http_url = extract_url('android5/%s.mp4', 'http') - if http_url: - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) mpd_url = manifest_urls.get('mpd') if mpd_url: - formats.extend(self._extract_mpd_formats(remove_bitrate_limit( - mpd_url), video_id, mpd_id='dash', fatal=False)) + mpd_url = remove_bitrate_limit(mpd_url) + for mpd_alt_url in alt_urls(mpd_url): + formats.extend(self._extract_mpd_formats( + mpd_alt_url, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 From ce0edda0f9c0d8cf6250edfa7a43ddbccd101cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 00:49:50 +0700 Subject: [PATCH 153/187] [markiza] Add extractors (closes #16750) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/markiza.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/markiza.py diff --git 
a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c3e6daa24..3b3964c01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -590,6 +590,10 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py new file mode 100644 index 000000000..e6bfab114 --- /dev/null +++ b/youtube_dl/extractor/markiza.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + orderedSet, + parse_duration, + try_get, +) + + +class MarkizaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)' + _TESTS = [{ + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139078', + 'ext': 'mp4', + 'title': 'Oteckovia 109', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2760, + }, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny', + 'info_dict': { + 'id': '85430', + 'title': 'Televízne noviny', + }, + 'playlist_count': 23, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/84723', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak', + 'only_matching': True, + }, { + 'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky', + 'only_matching': True, + }, { + 'url': 
'http://videoarchiv.markiza.sk/embed/85295', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://videoarchiv.markiza.sk/json/video_jwplayer7.json', + video_id, query={'id': video_id}) + + info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash') + + if info.get('_type') == 'playlist': + info.update({ + 'id': video_id, + 'title': try_get( + data, lambda x: x['details']['name'], compat_str), + }) + else: + info['duration'] = parse_duration( + try_get(data, lambda x: x['details']['duration'], compat_str)) + return info + + +class MarkizaPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_' + _TESTS = [{ + 'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni', + 'md5': 'ada4e9fad038abeed971843aa028c7b0', + 'info_dict': { + 'id': '139355', + 'ext': 'mp4', + 'title': 'Oteckovia 110', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2604, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas', + 'only_matching': True, + }, { + 'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach', + 'only_matching': True, + }, { + 'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili', + 'only_matching': True, + }, { + 'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba', + 'only_matching': True, + }, { + 'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url) + + 
def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) + for video_id in orderedSet(re.findall( + r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)', + webpage))] + + return self.playlist_result(entries, playlist_id) From 9e761fe6f555a3ad0b92bdc2c651a4c5b8aff887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 01:31:49 +0700 Subject: [PATCH 154/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 062000594..38cfcd8fd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version <unreleased> + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + version 2018.06.14 Core From 858cf4dc2966d398d939cedffc160afad2484f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 01:34:36 +0700 Subject: [PATCH 155/187] release 2018.06.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1cfb54bfd..de3888214 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ 
-6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.14 +[debug] youtube-dl version 2018.06.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38cfcd8fd..fe5087097 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.18 Core * [downloader/rtmp] Fix downloading in verbose mode (#16736) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 705279ac1..432a7ba93 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -266,6 +266,7 @@ - **Europa** - 
**EveryonesMixtape** - **ExpoTV** + - **Expressen** - **ExtremeTube** - **EyedoTV** - **facebook** @@ -455,6 +456,8 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **Markiza** + - **MarkizaPage** - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1533dceb4..49fef60ea 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.14' +__version__ = '2018.06.18' From 8ba84e4600229c9baec6410b0c0c9e500c0105b5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 17 Jun 2018 20:40:35 +0100 Subject: [PATCH 156/187] [tvnow] try all clear manifest urls(closes #15361) --- youtube_dl/extractor/tvnow.py | 53 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index 808571ece..60937616f 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', - 'format.defaultImage169Logo') + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor): video_id = compat_str(info['id']) title = info['title'] - mpd_url = info['manifest']['dashclear'] - if not mpd_url: + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = 
self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: if info.get('isDrm'): raise ExtractorError( 'Video %s is DRM protected' % video_id, expected=True) if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + raise self.raise_geo_restricted() if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - - mpd_url = update_url_query(mpd_url, {'filter': ''}) - formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') @@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor): class TVNowIE(TVNowBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ (?P<show_id>[^/]+)/ (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) ''' @@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE): }] def 
_real_extract(self, url): - display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + display_id = '%s/%s' % mobj.group(2, 3) info = self._call_api( 'movies/' + display_id, display_id, query={ 'fields': ','.join(self._VIDEO_FIELDS), + 'station': mobj.group(1), }) return self._extract_video(info, display_id) From 075a13d3e9e860f0033ea5a37795bebba02690b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 03:22:08 +0700 Subject: [PATCH 157/187] [compat] Introduce compat_integer_types --- youtube_dl/compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..7b770340f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2787,6 +2787,12 @@ except NameError: # Python 3 compat_numeric_types = (int, float, complex) +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + if sys.version_info < (2, 7): def compat_socket_create_connection(address, timeout, source_address=None): host, port = address @@ -2974,6 +2980,7 @@ __all__ = [ 'compat_http_client', 'compat_http_server', 'compat_input', + 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', 'compat_numeric_types', From d391b7e23d3d6c2af03c6329b4bf059ec095f33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:01:48 +0700 Subject: [PATCH 158/187] [extractor/common] Introduce expected_status for convenient accept of failed HTTP requests Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response. 
--- youtube_dl/extractor/common.py | 135 +++++++++++++++++++++++++++------ 1 file changed, 113 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2548dba3..394f34372 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -19,6 +19,7 @@ from ..compat import ( compat_cookies, compat_etree_fromstring, compat_getpass, + compat_integer_types, compat_http_client, compat_os_name, compat_str, @@ -548,8 +549,26 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - """ Returns the response handle """ + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. 
+ """ if note is None: self.report_download_webpage(video_id) elif note is not False: @@ -578,6 +597,10 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + return err.fp + if errnote is False: return False if errnote is None: @@ -590,13 +613,17 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """ Returns a tuple (page content as string, URL handle) """ + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. 
+ """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) if urlh is False: assert not fatal return False @@ -685,13 +712,52 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): - """ Returns the data of the page as a string """ + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=5, encoding=None, data=None, + headers={}, query={}, expected_status=None): + """ + Return the data of the page as a string. + + Arguments: + url_or_request -- plain text URL as a string or + a compat_urllib_request.Requestobject + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + tries -- number of tries + timeout -- sleep interval between tries + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. 
Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. + """ + success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -707,11 +773,17 @@ class InfoExtractor(object): def _download_xml_handle( self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). + + See _download_webpage docstring for arguments specification. 
+ """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res xml_string, urlh = res @@ -719,15 +791,21 @@ class InfoExtractor(object): xml_string, video_id, transform_source=transform_source, fatal=fatal), urlh - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an xml.etree.ElementTree.Element. + + See _download_webpage docstring for arguments specification. + """ res = self._download_xml_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): @@ -745,11 +823,17 @@ class InfoExtractor(object): def _download_json_handle( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - """Return a tuple (JSON object, URL handle)""" + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. 
+ """ res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query) + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) if res is False: return res json_string, urlh = res @@ -760,11 +844,18 @@ class InfoExtractor(object): def _download_json( self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. + """ res = self._download_json_handle( url_or_request, video_id, note=note, errnote=errnote, transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query) + data=data, headers=headers, query=query, + expected_status=expected_status) return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): From 00a429bea3c2deacef5dbfb2b0b7e191b1dbaf62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:04:13 +0700 Subject: [PATCH 159/187] [markiza] Expect 500 status code --- youtube_dl/extractor/markiza.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py index e6bfab114..def960a0c 100644 --- a/youtube_dl/extractor/markiza.py +++ b/youtube_dl/extractor/markiza.py @@ -110,7 +110,11 @@ class MarkizaPageIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage( + # Downloading for some hosts (e.g. 
dajto, doma) fails with 500 + # although everything seems to be OK, so considering 500 + # status code to be expected. + url, playlist_id, expected_status=500) entries = [ self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id) From 9283d4ea03f907f2b9e7954b0897075a165b4d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 04:04:47 +0700 Subject: [PATCH 160/187] [bbccouk] Use expected_status --- youtube_dl/extractor/bbc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 30a63a24e..293d82b0f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,7 +21,6 @@ from ..utils import ( urljoin, ) from ..compat import ( - compat_etree_fromstring, compat_HTTPError, compat_urlparse, ) @@ -334,14 +333,9 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): - media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): From 721a877d2fb82de18e4aeec27d70f84f9b41f766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 23:08:35 +0700 Subject: [PATCH 161/187] [vgtv] Add support for www.aftonbladet.se/tv/ URLs --- youtube_dl/extractor/vgtv.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vgtv.py 
b/youtube_dl/extractor/vgtv.py index c21a09c01..d430e2944 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -24,6 +24,7 @@ class VGTVIE(XstreamIE): 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', } _APP_NAME_TO_VENDOR = { @@ -44,7 +45,7 @@ class VGTVIE(XstreamIE): (?: (?:\#!/)?(?:video|live)/| embed?.*id=| - articles/ + a(?:rticles)?/ )| (?P<appname> %s @@ -143,6 +144,10 @@ class VGTVIE(XstreamIE): 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, { 'url': 'abtv:140026', 'only_matching': True, From 713afa705c228c2caa6054fff19a7690ba19d64a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Jun 2018 23:15:38 +0700 Subject: [PATCH 162/187] [vgtv] Improve HLS formats extraction --- youtube_dl/extractor/vgtv.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index d430e2944..fe7a26b62 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -183,13 +183,15 @@ class VGTVIE(XstreamIE): streams = data['streamUrls'] stream_type = data.get('streamType') - + is_live = stream_type == 'live' formats = [] hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -234,13 +236,13 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], + 'title': self._live_title(data['title']) if is_live else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + 
'?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'is_live': True if stream_type == 'live' else False, + 'is_live': is_live, }) return info From 18806e3b6b95d03c773c89e465e1b28b2f12a618 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 18 Jun 2018 19:08:54 +0100 Subject: [PATCH 163/187] [rtbf] fix extraction for python 3.2 and older --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acff9766a..3b0f3080b 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -99,7 +99,7 @@ class RTBFIE(InfoExtractor): http_url = data.get('url') if formats and http_url and re.search(height_re, http_url): http_url = fix_url(http_url) - for m3u8_f in formats.copy(): + for m3u8_f in formats[:]: height = m3u8_f.get('height') if not height: continue From e12b4b8bccd364cb5cc68aab4888209965a82dc1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 19 Jun 2018 10:35:42 +0100 Subject: [PATCH 164/187] [6play] use geo verfication headers --- youtube_dl/extractor/sixplay.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 1f8469a90..a363221bc 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -71,7 +71,9 @@ class SixPlayIE(InfoExtractor): if container == 'm3u8' or ext == 'm3u8': if protocol == 'usp': if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: - urlh = self._request_webpage(asset_url, video_id, fatal=False) + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) if not urlh: continue asset_url = urlh.geturl() From 8b4b400aef83b233502ece7321ee84f6ab9e213e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:00:36 +0700 Subject: [PATCH 165/187] [peertube] Improve generic support (closes #16733) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/peertube.py | 47 +++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dad951b75..6c0f772ac 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3076,7 +3076,7 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - peertube_urls = PeerTubeIE._extract_urls(webpage) + peertube_urls = PeerTubeIE._extract_urls(webpage, url) if peertube_urls: return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index a481b3151..d9849a2ba 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,12 +116,14 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _VALID_URL = r'''(?x) - https?:// - %s - /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P<id>[^/?\#&]+) - ''' % _INSTANCES_RE + (?: + peertube:(?P<host>[^:]+):| + https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P<id>%s) + ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -157,21 +159,40 @@ class PeerTubeIE(InfoExtractor): }, { 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, }] 
@staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' - % PeerTubeIE._INSTANCES_RE, webpage)] + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + '<title>PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') video = self._download_json( - urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) title = video['name'] From e73050882763705ccb8e487edbc3983b5582b1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:12:53 +0700 Subject: [PATCH 166/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index fe5087097..1494081b8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version <unreleased> + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] 
Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + version 2018.06.18 Core From c9b983ff827aae25a0fe2116c98c26702c581b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Jun 2018 23:16:04 +0700 Subject: [PATCH 167/187] release 2018.06.19 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index de3888214..d254678b5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.18 +[debug] youtube-dl version 2018.06.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1494081b8..93dc40d8c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.19 Core + [extractor/common] Introduce expected_status in _download_* methods diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 49fef60ea..dd4795cd1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.18' +__version__ = '2018.06.19' From f51f526b0acb5943332452d1958581cb1135bfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:51:14 +0700 Subject: [PATCH 168/187] [foxnews] Add support for iframe embeds (closes #15810, closes #16711) --- youtube_dl/extractor/foxnews.py | 42 +++++++++++++++++++++++++++------ youtube_dl/extractor/generic.py | 6 +++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index dc0662f74..4c402806a 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py 
@@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE): }, ] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + def _real_extract(self, url): host, video_id = re.match(self._VALID_URL, url).groups() @@ -71,18 +79,35 @@ class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' - _TEST = { + _TESTS = [{ + # data-video-id 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473299755, + 'timestamp': 1473301045, 'upload_date': '20160908', }, - } + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -90,10 +115,13 @@ class FoxNewsArticleIE(InfoExtractor): video_id = self._html_search_regex( r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id') + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 
'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + return self.url_result( - 'http://video.foxnews.com/v/' + video_id, - FoxNewsIE.ie_key()) + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) class FoxNewsInsiderIE(InfoExtractor): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6c0f772ac..d71cb9050 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -111,6 +111,7 @@ from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE from .apa import APAIE +from .foxnews import FoxNewsIE class GenericIE(InfoExtractor): @@ -3091,6 +3092,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( apa_urls, video_id, video_title, ie=APAIE.ie_key()) + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From 91aa502d916fd3f103d34f927748767413f1d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 20 Jun 2018 23:59:37 +0700 Subject: [PATCH 169/187] [foxnews:insider] Remove extractor (#15810) Now covered by foxnews:article --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/foxnews.py | 49 +++--------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3b3964c01..27ece3b53 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -373,7 +373,6 @@ from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, FoxNewsArticleIE, - FoxNewsInsiderIE, ) from .foxsports import FoxSportsIE from .franceculture import 
FranceCultureIE diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 4c402806a..63613cb85 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -76,7 +76,7 @@ class FoxNewsIE(AMPIE): class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' _TESTS = [{ @@ -107,6 +107,9 @@ class FoxNewsArticleIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, }] def _real_extract(self, url): @@ -122,47 +125,3 @@ class FoxNewsArticleIE(InfoExtractor): return self.url_result( FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) - - -class FoxNewsInsiderIE(InfoExtractor): - _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:insider' - - _TEST = { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'md5': 'a10c755e582d28120c62749b4feb4c0c', - 'info_dict': { - 'id': '5099377331001', - 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', - 'ext': 'mp4', - 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', - 'description': 'Is campus censorship getting out of control?', - 'timestamp': 1472168725, - 'upload_date': '20160825', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': [FoxNewsIE.ie_key()], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') - - title = self._og_search_title(webpage) - description = 
self._og_search_description(webpage) - - return { - '_type': 'url_transparent', - 'ie_key': FoxNewsIE.ie_key(), - 'url': embed_url, - 'display_id': display_id, - 'title': title, - 'description': description, - } From 30374f4d40d8c993bf92c5af9b9c073da49fe8b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:06:58 +0700 Subject: [PATCH 170/187] [itv] Make SOAP request non fatal and extract metadata from a webpage (closes #16780) --- youtube_dl/extractor/itv.py | 126 ++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 6a4f8a505..40cffed46 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -18,6 +18,7 @@ from ..utils import ( xpath_element, xpath_text, int_or_none, + merge_dicts, parse_duration, smuggle_url, ExtractorError, @@ -129,64 +130,65 @@ class ITVIE(InfoExtractor): resp_env = self._download_xml( params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env)) - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) - else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': 
parse_duration(xpath_text(playlist, 'Duration')), - }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + headers=headers, data=etree.tostring(req_env), fatal=False) + if resp_env: + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') + fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) + elif fault_code not in ( + 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', 
default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') @@ -261,7 +263,17 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, }) - return info + + webpage_info = self._search_json_ld(webpage, video_id, default={}) + if not webpage_info.get('title'): + webpage_info['title'] = self._html_search_regex( + r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or webpage_info['episode'] + + return merge_dicts(info, webpage_info) class ITVBTCCIE(InfoExtractor): From a4ec45179e554e9b24e32c3c06908804b42a5a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:12:40 +0700 Subject: [PATCH 171/187] [itv] 
Sort imports --- youtube_dl/extractor/itv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 40cffed46..d05a7b68d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -13,16 +13,16 @@ from ..compat import ( compat_etree_register_namespace, ) from ..utils import ( + determine_ext, + ExtractorError, extract_attributes, - xpath_with_ns, - xpath_element, - xpath_text, int_or_none, merge_dicts, parse_duration, smuggle_url, - ExtractorError, - determine_ext, + xpath_with_ns, + xpath_element, + xpath_text, ) From b71cc719103c45365244334a4c481f88cd3534fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:38:32 +0700 Subject: [PATCH 172/187] [motherless] Fix extraction (closes #16786) --- youtube_dl/extractor/motherless.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index e24396e79..f191310e1 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor): title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - video_url = self._html_search_regex( - r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') + video_url = (self._html_search_regex( + (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), + webpage, 'video URL', default=None, group='url') or + 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'<strong>Views</strong>\s+([^<]+)<', From 9fb62e35f6e7d865a73cc310f24ccfa0700e5e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Jun 2018 23:39:13 +0700 
Subject: [PATCH 173/187] [motherless:group] Fix _VALID_URL --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index f191310e1..bed5645f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -123,7 +123,7 @@ class MotherlessIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://motherless.com/g/movie_scenes', 'info_dict': { From 74caf528bc822738dffe231df86ed399fc97a38a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 24 Jun 2018 12:02:16 +0100 Subject: [PATCH 174/187] [brightcove] workaround sonyliv DRM protected videos(closes #16807) --- youtube_dl/extractor/brightcove.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ab62e54d6..14f9a14ed 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism' or container == 'WVM': + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) errors = 
json_data.get('errors') if not formats and errors: From a0949fec081d0badd6a584526cd66e8f170625c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jun 2018 23:57:22 +0700 Subject: [PATCH 175/187] [joj] Relax _VALID_URL (closes #16771) --- youtube_dl/extractor/joj.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index a764023e9..d9f8dbfd2 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -18,7 +18,7 @@ class JojIE(InfoExtractor): joj:| https?://media\.joj\.sk/embed/ ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + (?P<id>[^/?#^]+) ''' _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', @@ -29,16 +29,24 @@ class JojIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3118, } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, }, { 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - webpage) + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] def _real_extract(self, url): video_id = self._match_id(url) From c306f076ec81334b458f61b2a4ae683a9e732d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:17:14 +0700 Subject: [PATCH 176/187] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 93dc40d8c..327580328 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + 
+Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + version 2018.06.19 Core From 1f6cc5807ec69584664388b8edfaf6b3ae442cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jun 2018 02:26:02 +0700 Subject: [PATCH 177/187] release 2018.06.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d254678b5..128e6e681 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.19*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.06.19 +[debug] youtube-dl version 2018.06.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 327580328..8eb7469d4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.06.25 Extractors * [joj] Relax URL regular expression (#16771) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 432a7ba93..a78fabb02 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -290,7 +290,6 @@ - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - - **foxnews:insider** - **FoxSports** - **france2.fr:generation-what** - **FranceCulture** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dd4795cd1..8fbafd6a1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.06.19' +__version__ = '2018.06.25' From c3bcd206eb031de30179c88ac7acd806a477ceae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jun 2018 00:01:06 +0700 Subject: [PATCH 178/187] [porncom] Fix extraction (closes #16808) --- youtube_dl/extractor/porncom.py | 5 +++-- 1 file 
changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 60ade06da..5726cab3a 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -43,7 +43,8 @@ class PornComIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) @@ -69,7 +70,7 @@ class PornComIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None From 7b393f9cc5dc4790bcb623c768fa4a3046ef80bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jun 2018 04:29:11 +0700 Subject: [PATCH 179/187] [svt] Improve extraction and add support for pages (closes #16802) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 11 --- youtube_dl/extractor/svt.py | 117 ++++++++++++++++++++++++----- 3 files changed, 98 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 27ece3b53..f2377521b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1040,6 +1040,7 @@ from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, + SVTPageIE, SVTPlayIE, SVTSeriesIE, ) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d71cb9050..aa04905ed 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1395,17 +1395,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 
}, - # SVT embed - { - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '2900353', - 'ext': 'flv', - 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', - 'duration': 27, - 'age_limit': 0, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f71eab8b2..0901c3163 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -12,6 +12,8 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + orderedSet, + strip_or_none, try_get, urljoin, compat_str, @@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE): class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' + _VALID_URL = r'''(?x) + (?: + svt:(?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/kanaler/svt1', 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, }] + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', 
self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, svt_id = mobj.group('id', 'svt_id') + + if svt_id: + return self._extract_by_video_id(svt_id) webpage = self._download_webpage(url, video_id) @@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE): thumbnail = self._og_search_thumbnail(webpage) - def adjust_title(info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - adjust_title(info_dict) + self._adjust_title(info_dict) return info_dict - video_id = self._search_regex( + svt_id = self._search_regex( r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id', default=None) + webpage, 'video id') - if video_id: - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - info_dict['title'] = re.sub( - r'\s*\|\s*.+?$', '', - info_dict.get('episode') or self._og_search_title(webpage)) - adjust_title(info_dict) - return info_dict + return self._extract_by_video_id(svt_id, webpage) class SVTSeriesIE(SVTPlayBaseIE): @@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE): return self.playlist_result( entries, series_id, title, metadata.get('description')) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'info_dict': { 
+ 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'info_dict': { + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + }, + 'playlist_count': 1, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] + + title = strip_or_none(self._og_search_title(webpage, default=None)) + + return self.playlist_result(entries, playlist_id, title) From acbd0ff5df5ff9d69e6707ea4fa3e3b4f9cc6528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 00:35:05 +0700 Subject: [PATCH 180/187] [dctptv] Restore extraction based on REST API (closes #16850) --- youtube_dl/extractor/dctp.py | 82 ++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 3a6d0560e..dc0c41b8a 100644 --- 
a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -5,13 +5,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, - unified_strdate, + int_or_none, + unified_timestamp, ) class DctpTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ + # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', @@ -19,31 +21,49 @@ class DctpTvIE(InfoExtractor): 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', - 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 71.24, + 'timestamp': 1302172322, + 'upload_date': '20110407', }, 'params': { # rtmp download 'skip_download': True, }, - } + }, { + # 16x9 + 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', + 'only_matching': True, + }] + + _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com' def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + version = self._download_json( + '%s/version.json' % self._BASE_URL, display_id, + 'Downloading version JSON') - video_id = self._html_search_meta( - 'DC.identifier', webpage, 'video id', - default=None) or self._search_regex( - r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') + restapi_base = '%s/%s/restapi' % ( + self._BASE_URL, version['version_name']) - title = self._og_search_title(webpage) + info = self._download_json( + '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + 'Downloading video info JSON') + + media = self._download_json( + '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + display_id, 'Downloading media JSON') + + uuid = media['uuid'] + title = media['title'] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) 
servers = self._download_json( 'http://www.dctp.tv/streaming_servers/', display_id, - note='Downloading server list', fatal=False) + note='Downloading server list JSON', fatal=False) if servers: endpoint = next( @@ -60,27 +80,35 @@ class DctpTvIE(InfoExtractor): formats = [{ 'url': endpoint, 'app': app, - 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'play_path': play_path, 'page_url': url, - 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', 'ext': 'flv', }] - description = self._html_search_meta('DC.description', webpage) - upload_date = unified_strdate( - self._html_search_meta('DC.date.created', webpage)) - thumbnail = self._og_search_thumbnail(webpage) - duration = float_or_none(self._search_regex( - r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', - default=None), scale=1000) + thumbnails = [] + images = media.get('images') + if isinstance(images, list): + for image in images: + if not isinstance(image, dict): + continue + image_url = image.get('url') + if not image_url or not isinstance(image_url, compat_str): + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) return { - 'id': video_id, - 'title': title, - 'formats': formats, + 'id': uuid, 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'duration': duration, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description') or media.get('teaser'), + 'timestamp': unified_timestamp(media.get('created')), + 'duration': float_or_none(media.get('duration_in_ms'), scale=1000), + 'thumbnails': thumbnails, + 'formats': formats, } From d4a24f4091a622b808ff621e78b5cfd0db3c8c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 
2018 01:09:14 +0700 Subject: [PATCH 181/187] Prefer ffmpeg over avconv by default (closes #8622) --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/ffmpeg.py | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a405c5ca..38ba43a97 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -305,8 +305,8 @@ class YoutubeDL(object): http_chunk_size. The following options are used by the post processors: - prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, - otherwise prefer avconv. + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. postprocessor_args: A list of additional command-line arguments for the postprocessor. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index e83d546a0..e7d8e8910 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -841,11 +841,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors (default)') + help='Prefer avconv over ffmpeg for running the postprocessors') postproc.add_option( '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors') + help='Prefer ffmpeg over avconv for running the postprocessors (default)') postproc.add_option( '--ffmpeg-location', '--avconv-location', metavar='PATH', dest='ffmpeg_location', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3ea1afcf3..757b496a1 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -77,7 +77,7 @@ class FFmpegPostProcessor(PostProcessor): def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = False + 
prefer_ffmpeg = True self.basename = None self.probe_basename = None @@ -85,7 +85,7 @@ class FFmpegPostProcessor(PostProcessor): self._paths = None self._versions = None if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) location = self._downloader.params.get('ffmpeg_location') if location is not None: if not os.path.exists(location): @@ -117,19 +117,19 @@ class FFmpegPostProcessor(PostProcessor): (p, get_exe_version(p, args=['-version'])) for p in programs) self._paths = dict((p, p) for p in programs) - if prefer_ffmpeg: - prefs = ('ffmpeg', 'avconv') - else: + if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') for p in prefs: if self._versions[p]: self.basename = p break - if prefer_ffmpeg: - prefs = ('ffprobe', 'avprobe') - else: + if prefer_ffmpeg is False: prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') for p in prefs: if self._versions[p]: self.probe_basename = p From 5e8e2fa51f416e227367211ab937dfea17f89f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jun 2018 01:25:05 +0700 Subject: [PATCH 182/187] [extractor/common] Use source URL as Referer for HTML5 entries (closes #16849) --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 394f34372..f3fec160d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2437,6 +2437,8 @@ class InfoExtractor(object): media_info['subtitles'].setdefault(lang, []).append({ 'url': absolute_url(src), }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries From 9cf648c92bcc131db7d7fad673864bba06121482 Mon Sep 17 00:00:00 2001 From: Timendum 
<timedum@gmail.com> Date: Mon, 18 Jun 2018 11:50:06 +0200 Subject: [PATCH 183/187] [mediaset] Add support for new videos --- youtube_dl/extractor/mediaset.py | 60 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9760eafd5..76a2ae125 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,6 +10,7 @@ from ..utils import ( parse_duration, try_get, unified_strdate, + ExtractorError ) @@ -42,6 +43,22 @@ class MediasetIE(InfoExtractor): 'categories': ['reality'], }, 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', + 'md5': '1276f966ac423d16ba255ce867de073e', + 'info_dict': { + 'id': '846685', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565, + 'creator': 'mediaset', + 'upload_date': '20180525', + 'series': 'Matrix', + 'categories': ['infotainment'], + }, + 'expected_warnings': ['is not a supported codec'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -70,18 +87,29 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + media_info = self._download_json( + 'https://www.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading media info', query={ + 'id': video_id + })['video'] + + media_id = try_get(media_info, lambda x: x['guid']) or video_id + video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn.aspx', + 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', video_id, 'Downloading video CDN JSON', query={ - 'streamid': video_id, + 'streamid': media_id, 'format': 'json', })['videoList'] formats = [] for format_url in video_list: if '.ism' in 
format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) + try: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + except ExtractorError: + pass else: formats.append({ 'url': format_url, @@ -89,30 +117,24 @@ class MediasetIE(InfoExtractor): }) self._sort_formats(formats) - mediainfo = self._download_json( - 'http://plr.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading video info JSON', query={ - 'id': video_id, - })['video'] - - title = mediainfo['title'] + title = media_info['title'] creator = try_get( - mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + media_info, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - mediainfo, lambda x: x['brand-info']['category'], compat_str) + media_info, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': mediainfo.get('short-description'), - 'thumbnail': mediainfo.get('thumbnail'), - 'duration': parse_duration(mediainfo.get('duration')), + 'description': media_info.get('short-description'), + 'thumbnail': media_info.get('thumbnail'), + 'duration': parse_duration(media_info.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(mediainfo.get('production-date')), - 'webpage_url': mediainfo.get('url'), - 'series': mediainfo.get('brand-value'), + 'upload_date': unified_strdate(media_info.get('production-date')), + 'webpage_url': media_info.get('url'), + 'series': media_info.get('brand-value'), 'categories': categories, 'formats': formats, } From 267d81962a0709f15f82f96b7aadbb5473a06992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 02:16:44 +0700 Subject: [PATCH 184/187] [mediaset] Fix issues and extract all formats (closes #16568) --- youtube_dl/extractor/mediaset.py | 44 +++++++++++++++++--------------- 1 
file changed, 24 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 76a2ae125..9f2b60dcc 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -10,7 +10,6 @@ from ..utils import ( parse_duration, try_get, unified_strdate, - ExtractorError ) @@ -58,7 +57,7 @@ class MediasetIE(InfoExtractor): 'series': 'Matrix', 'categories': ['infotainment'], }, - 'expected_warnings': ['is not a supported codec'], + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', @@ -87,13 +86,14 @@ class MediasetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - media_info = self._download_json( + video = self._download_json( 'https://www.video.mediaset.it/html/metainfo.sjson', video_id, 'Downloading media info', query={ 'id': video_id })['video'] - media_id = try_get(media_info, lambda x: x['guid']) or video_id + title = video['title'] + media_id = video.get('guid') or video_id video_list = self._download_json( 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', @@ -104,12 +104,17 @@ class MediasetIE(InfoExtractor): formats = [] for format_url in video_list: - if '.ism' in format_url: - try: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - except ExtractorError: - pass + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'ism' or '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) else: formats.append({ 'url': format_url, @@ -117,24 +122,23 @@ class 
MediasetIE(InfoExtractor): }) self._sort_formats(formats) - title = media_info['title'] - creator = try_get( - media_info, lambda x: x['brand-info']['publisher'], compat_str) + video, lambda x: x['brand-info']['publisher'], compat_str) category = try_get( - media_info, lambda x: x['brand-info']['category'], compat_str) + video, lambda x: x['brand-info']['category'], compat_str) categories = [category] if category else None return { 'id': video_id, 'title': title, - 'description': media_info.get('short-description'), - 'thumbnail': media_info.get('thumbnail'), - 'duration': parse_duration(media_info.get('duration')), + 'description': video.get('short-description'), + 'thumbnail': video.get('thumbnail'), + 'duration': parse_duration(video.get('duration')), 'creator': creator, - 'upload_date': unified_strdate(media_info.get('production-date')), - 'webpage_url': media_info.get('url'), - 'series': media_info.get('brand-value'), + 'upload_date': unified_strdate(video.get('production-date')), + 'webpage_url': video.get('url'), + 'series': video.get('brand-value'), + 'season': video.get('season'), 'categories': categories, 'formats': formats, } From 2160768a215849e82a167912cb8f0aa054e87d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 Jun 2018 23:39:56 +0700 Subject: [PATCH 185/187] [npo] Fix typo (closes #16872) --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cb8319f0d..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', From eca1f0d115e6a2712ff0d5f6b25e3ded5e52db71 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Jul 2018 02:00:16 +0700 Subject: [PATCH 186/187] [extractor/common] Properly escape % in MPD templates (closes #16867) --- youtube_dl/extractor/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f3fec160d..78f053f18 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2106,7 +2106,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) 
counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) From 973b6ceebbf0c79086cbf3203a8a8c79daf0b1ba Mon Sep 17 00:00:00 2001 From: coreynicholson <coreynicholson@users.noreply.github.com> Date: Sun, 1 Jul 2018 15:19:17 +0100 Subject: [PATCH 187/187] [vlive] Fix live streams extraction --- youtube_dl/extractor/vlive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' })