From 583174284065a41a70220ecc8de0d31d3aea5070 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 May 2019 10:38:33 +0100 Subject: [PATCH 01/30] [vrv] extract captions(closes #19238) --- youtube_dl/extractor/vrv.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 33530fe8a..b698bf66c 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -198,14 +198,15 @@ class VRVIE(VRVBaseIE): self._sort_formats(formats) subtitles = {} - for subtitle in streams_json.get('subtitles', {}).values(): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) + for k in ('captions', 'subtitles'): + for subtitle in streams_json.get(k, {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) thumbnails = [] for thumbnail in video_data.get('images', {}).get('thumbnails', []): From 170d6444406a0401fb20ca612e3d19a7f972af15 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 May 2019 14:58:57 +0100 Subject: [PATCH 02/30] [canvas] add support for vrtnieuws and sporza site ids and extract AES HLS formats --- youtube_dl/extractor/canvas.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 174fd9e2b..c506bc5dd 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -17,7 +17,7 @@ from ..utils import ( class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrtvideo)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '90139b746a0a9bd7bb631283f6e2a64e', @@ -35,6 +35,10 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8', + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,9 +56,9 @@ class CanvasIE(InfoExtractor): format_url, format_type = target.get('url'), target.get('type') if not format_url or not format_type: continue - if format_type == 'HLS': + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], m3u8_id=format_type, fatal=False)) elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( From 82e91d20a0f698b13412dd7b200663c7485791bb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 May 2019 15:02:51 +0100 Subject: [PATCH 03/30] [vrt] fix extraction(closes #20527) --- youtube_dl/extractor/vrt.py | 193 ++++++++++++------------------------ 1 file changed, 63 insertions(+), 130 deletions(-) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 444295d68..422025267 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -5,150 +5,83 @@ import re from .common import InfoExtractor from ..utils import ( + extract_attributes, float_or_none, + get_element_by_class, + strip_or_none, + unified_timestamp, ) class VRTIE(InfoExtractor): - IE_DESC = 'deredactie.be, sporza.be, cobra.be and cobra.canvas.be' - _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P[^/]+)/*' - _TESTS = [ - # deredactie.be - { - 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL', - 'md5': '4cebde1eb60a53782d4f3992cbd46ec8', - 'info_dict': { - 'id': '2129880', - 'ext': 'flv', - 'title': 'Het journaal L - 25/10/14', - 'description': None, - 'timestamp': 1414271750.949, - 'upload_date': '20141025', - 'duration': 929, - }, - 'skip': 'HTTP Error 404: Not Found', + IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' + _VALID_URL = r'https?://(?:www\.)?(?Pvrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', + 'md5': 'e1663accf5cf13f375f3cd0d10476669', + 'info_dict': { + 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', + 'ext': 'mp4', + 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', + 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', + 'timestamp': 1557924660, + 'upload_date': '20190515', + 'duration': 31.2, }, - # sporza.be - { - 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time', - 'md5': '11f53088da9bf8e7cfc42456697953ff', - 'info_dict': { - 'id': '2124639', - 'ext': 'flv', - 'title': 'Bekijk Extra Time van 20 oktober', - 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426', - 'timestamp': 1413835980.560, - 'upload_date': '20141020', - 'duration': 3238, - }, - 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', + 'md5': '910bba927566e9ab992278f647eb4b75', + 'info_dict': { + 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', + 'ext': 'mp4', + 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', + 'timestamp': 1557923760, + 'upload_date': '20190515', + 'duration': 115.17, }, - # cobra.be - { - 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari', - 'md5': '78a2b060a5083c4f055449a72477409d', - 'info_dict': { - 'id': '2126050', - 'ext': 'flv', - 'title': 'Bret Easton Ellis in Café Corsari', - 'description': 'md5:f699986e823f32fd6036c1855a724ee9', - 'timestamp': 1413967500.494, - 'upload_date': '20141022', - 'duration': 661, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - { - # YouTube video - 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'md5': 'b8b93da1df1cea6c8556255a796b7d61', - 'info_dict': { - 'id': 'Wji-BZ0oCwg', - 'ext': 'mp4', - 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', - 'description': 'md5:8e468944dce15567a786a67f74262583', - 'uploader': 'Star Wars', - 'uploader_id': 'starwars', - 'upload_date': '20160407', - }, - 'add_ie': ['Youtube'], - }, - { - 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'info_dict': { - 'id': '2377055', - 'ext': 'mp4', - 'title': 'Cafe Derby', - 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.', - 'upload_date': '20150626', - 'timestamp': 1435305240.769, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - } - ] + }, { + 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', + 'only_matching': True, + }, { + 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', + 'only_matching': True, + }] + _CLIENT_MAP = { + 'vrt.be/vrtnws': 'vrtnieuws', + 'sporza.be': 'sporza', + } def _real_extract(self, url): - video_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + attrs = extract_attributes(self._search_regex( + r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video')) - webpage = self._download_webpage(url, video_id) + asset_id = attrs['data-videoid'] + publication_id = attrs.get('data-publicationid') + if publication_id: + asset_id = publication_id + '$' + asset_id + client = attrs.get('data-client') or self._CLIENT_MAP[site] - video_id = self._search_regex( - r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) - - src = self._search_regex( - r'data-video-src="([^"]+)"', webpage, 'video src', default=None) - - video_type = self._search_regex( - r'data-video-type="([^"]+)"', webpage, 'video type', default=None) - - if video_type == 'YouTubeVideo': - return self.url_result(src, 'Youtube') - - formats = [] - - mobj = re.search( - r'data-video-iphone-server="(?P[^"]+)"\s+data-video-iphone-path="(?P[^"]+)"', - webpage) - if mobj: - formats.extend(self._extract_m3u8_formats( - '%s/%s' % (mobj.group('server'), mobj.group('path')), - video_id, 'mp4', m3u8_id='hls', fatal=False)) - - if src: - formats = self._extract_wowza_formats(src, video_id) - if 'data-video-geoblocking="true"' not in webpage: - for f in formats: - if f['url'].startswith('rtsp://'): - http_format = f.copy() - http_format.update({ - 'url': f['url'].replace('rtsp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), - 'format_id': f['format_id'].replace('rtsp', 'http'), - 'protocol': 'http', - }) - formats.append(http_format) - - if not formats and 'data-video-geoblocking="true"' in webpage: - self.raise_geo_restricted('This video is only available in Belgium') - - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage, default=None) - thumbnail = self._og_search_thumbnail(webpage) - timestamp = float_or_none(self._search_regex( - r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000) - duration = float_or_none(self._search_regex( - r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000) + title = strip_or_none(get_element_by_class( + 'vrt-title', webpage) or self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage)) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], webpage) + if description == '…': + description = None + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage)) return { - 'id': video_id, + '_type': 'url_transparent', + 'id': asset_id, + 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': attrs.get('data-posterimage'), 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, + 'duration': float_or_none(attrs.get('data-duration'), 1000), + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), + 'ie_key': 'Canvas', } From e3c1266f492d710e2acbf0d80f44f7f805eb5187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 May 2019 03:17:15 +0700 Subject: [PATCH 04/30] [extractor/common] Move workaround for applying first Set-Cookie header into a separate method --- youtube_dl/extractor/common.py | 23 +++++++++++++++++++++++ youtube_dl/extractor/vk.py | 22 +++------------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 69c3bc755..f994953bc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2817,6 +2817,29 @@ class InfoExtractor(object): self._downloader.cookiejar.add_cookie_header(req) return compat_cookies.SimpleCookie(req.get_header('Cookie')) + def _apply_first_set_cookie_header(self, url_handle, cookie): + # Some sites (e.g. [1-3]) may serve two cookies under the same name + # in Set-Cookie header and expect the first (old) one to be set rather + # than second (new). However, as of RFC6265 the newer one cookie + # should be set into cookie store what actually happens. + # We will workaround this issue by resetting the cookie to + # the first one manually. + # 1. https://new.vk.com/ + # 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 + # 3. https://learning.oreilly.com/ + for header, cookies in url_handle.headers.items(): + if header.lower() != 'set-cookie': + continue + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + cookie_value = re.search( + r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) + if cookie_value: + value, domain = cookie_value.groups() + self._set_cookie(domain, cookie, value) + break + def get_testcases(self, include_onlymatching=False): t = getattr(self, '_TEST', None) if t: diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b7ce2fb97..f57ed2288 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import collections import re -import sys from .common import InfoExtractor from ..compat import compat_urlparse @@ -45,24 +44,9 @@ class VKBaseIE(InfoExtractor): 'pass': password.encode('cp1251'), }) - # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header - # and expects the first one to be set rather than second (see - # https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201). - # As of RFC6265 the newer one cookie should be set into cookie store - # what actually happens. - # We will workaround this VK issue by resetting the remixlhk cookie to - # the first one manually. - for header, cookies in url_handle.headers.items(): - if header.lower() != 'set-cookie': - continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') - remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) - if remixlhk: - value, domain = remixlhk.groups() - self._set_cookie(domain, 'remixlhk', value) - break + # vk serves two same remixlhk cookies in Set-Cookie header and expects + # first one to be actually set + self._apply_first_set_cookie_header(url_handle, 'remixlhk') login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, From a9e03736dfb2038d6a569e6305f4ac727a8ac71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 May 2019 03:23:40 +0700 Subject: [PATCH 05/30] [safari] Fix authentication (closes #21090) --- youtube_dl/extractor/safari.py | 87 ++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index c0d32a1b9..8d4806794 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, - sanitized_Request, - std_headers, - urlencode_postdata, update_url_query, ) @@ -31,44 +34,52 @@ class SafariBaseIE(InfoExtractor): if username is None: return - headers = std_headers.copy() - if 'Referer' not in headers: - headers['Referer'] = self._LOGIN_URL + _, urlh = self._download_webpage_handle( + 'https://learning.oreilly.com/accounts/login-check/', None, + 'Downloading login page') - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login form', headers=headers) + def is_logged(urlh): + return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) - def is_logged(webpage): - return any(re.search(p, webpage) for p in ( - r'href=["\']/accounts/logout/', r'>Sign Out<')) - - if is_logged(login_page): + if is_logged(urlh): self.LOGGED_IN = True return - csrf = self._html_search_regex( - r"name='csrfmiddlewaretoken'\s+value='([^']+)'", - login_page, 'csrf token') + redirect_url = compat_str(urlh.geturl()) + parsed_url = compat_urlparse.urlparse(redirect_url) + qs = compat_parse_qs(parsed_url.query) + next_uri = compat_urlparse.urljoin( + 'https://api.oreilly.com', qs['next'][0]) - login_form = { - 'csrfmiddlewaretoken': csrf, - 'email': username, - 'password1': password, - 'login': 'Sign In', - 'next': '', - } + auth, urlh = self._download_json_handle( + 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', + data=json.dumps({ + 'email': username, + 'password': password, + 'redirect_uri': next_uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': redirect_url, + }, expected_status=400) - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) - login_page = self._download_webpage( - request, None, 'Logging in') - - if not is_logged(login_page): + credentials = auth.get('credentials') + if (not auth.get('logged_in') and not auth.get('redirect_uri') + and credentials): raise ExtractorError( - 'Login failed; make sure your credentials are correct and try again.', - expected=True) + 'Unable to login: %s' % credentials, expected=True) - self.LOGGED_IN = True + # oreilly serves two same groot_sessionid cookies in Set-Cookie header + # and expects first one to be actually set + self._apply_first_set_cookie_header(urlh, 'groot_sessionid') + + _, urlh = self._download_webpage_handle( + auth.get('redirect_uri') or next_uri, None, 'Completing login',) + + if is_logged(urlh): + self.LOGGED_IN = True + return + + raise ExtractorError('Unable to log in') class SafariIE(SafariBaseIE): @@ -76,7 +87,7 @@ class SafariIE(SafariBaseIE): IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x) https?:// - (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ (?: library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) @@ -107,6 +118,9 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', + 'only_matching': True, }] _PARTNER_ID = '1926081' @@ -163,7 +177,7 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', @@ -188,7 +202,7 @@ class SafariCourseIE(SafariBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/ + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ (?: library/view/[^/]+| api/v1/book| @@ -219,6 +233,9 @@ class SafariCourseIE(SafariBaseIE): }, { 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'only_matching': True, }] @classmethod From ce2fe4c01cceef4b636995275b573baf51587fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 May 2019 23:23:18 +0700 Subject: [PATCH 06/30] [extractor/common] Add doc string for _apply_first_set_cookie_header --- youtube_dl/extractor/common.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f994953bc..937237b3f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2818,15 +2818,19 @@ class InfoExtractor(object): return compat_cookies.SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): - # Some sites (e.g. [1-3]) may serve two cookies under the same name - # in Set-Cookie header and expect the first (old) one to be set rather - # than second (new). However, as of RFC6265 the newer one cookie - # should be set into cookie store what actually happens. - # We will workaround this issue by resetting the cookie to - # the first one manually. - # 1. https://new.vk.com/ - # 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 - # 3. https://learning.oreilly.com/ + """ + Apply first Set-Cookie header instead of the last. Experimental. + + Some sites (e.g. [1-3]) may serve two cookies under the same name + in Set-Cookie header and expect the first (old) one to be set rather + than second (new). However, as of RFC6265 the newer one cookie + should be set into cookie store what actually happens. + We will workaround this issue by resetting the cookie to + the first one manually. + 1. https://new.vk.com/ + 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 + 3. https://learning.oreilly.com/ + """ for header, cookies in url_handle.headers.items(): if header.lower() != 'set-cookie': continue From 42c971341b804b758d12b7a85547be05160f1b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 May 2019 23:24:27 +0700 Subject: [PATCH 07/30] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 13cb6288d..eba7202dd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core ++ [extractor/common] Move workaround for applying first Set-Cookie header + into a separate _apply_first_set_cookie_header method + +Extractors +* [safari] Fix authentication (#21090) +* [vk] Use _apply_first_set_cookie_header +* [vrt] Fix extraction (#20527) ++ [canvas] Add support for vrtnieuws and sporza site ids and extract + AES HLS formats ++ [vrv] Extract captions (#19238) +* [tele5] Improve video id extraction +* [tele5] Relax URL regular expression (#21020, #21063) +* [svtplay] Update API URL (#21075) ++ [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) + + version 2019.05.11 Core From 6ab30ff50bf6bd0585927cb73c7421bef184f87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 May 2019 23:29:49 +0700 Subject: [PATCH 08/30] release 2019.05.20 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 6b931b3cf..dc303946e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.05.11** +- [ ] I've verified that I'm running youtube-dl version **2019.05.20** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.05.11 + [debug] youtube-dl version 2019.05.20 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index f2dc784a2..46e143c8a 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.05.11** +- [ ] I've verified that I'm running youtube-dl version **2019.05.20** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 39a0af13f..bc6c4694b 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.05.11** +- [ ] I've verified that I'm running youtube-dl version **2019.05.20** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 139f36ab8..bcc51f986 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.05.11** +- [ ] I've verified that I'm running youtube-dl version **2019.05.20** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.05.11 + [debug] youtube-dl version 2019.05.20 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d60da6db9..c8d16960e 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.05.11** +- [ ] I've verified that I'm running youtube-dl version **2019.05.20** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index eba7202dd..3babb6f48 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.05.20 Core + [extractor/common] Move workaround for applying first Set-Cookie header diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a8a9224cb..404a2f0a4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1071,7 +1071,7 @@ - **VoxMediaVolume** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e63527dbb..8df77378b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.05.11' +__version__ = '2019.05.20' From 0e6f914b3b40ef2ca78d82051a194faaad64dd9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20H=C3=B6pfl?= Date: Wed, 13 Feb 2019 16:29:43 +0100 Subject: [PATCH 09/30] [vivo] Fix extraction (closes #18906) --- youtube_dl/extractor/shared.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 931a0f70e..eade8fd9e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( @@ -7,6 +9,7 @@ from ..utils import ( int_or_none, url_or_none, urlencode_postdata, + unescapeHTML, ) @@ -22,8 +25,7 @@ class SharedBaseIE(InfoExtractor): video_url = self._extract_video_url(webpage, video_id, url) - title = compat_b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') + title = self._extract_title(webpage) filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) @@ -35,6 +37,10 @@ class SharedBaseIE(InfoExtractor): 'title': title, } + def _extract_title(self, webpage): + return compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') + class SharedIE(SharedBaseIE): IE_DESC = 'shared.sx' @@ -86,6 +92,14 @@ class VivoIE(SharedBaseIE): }, } + def _extract_title(self, webpage): + data_title = self._search_regex( + r'data-name\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='title') + if data_title: + return unescapeHTML(re.sub(r"\.[a-z0-9]{3,4}$", "", data_title)) + return self._og_search_title(webpage) + def _extract_video_url(self, webpage, video_id, *args): def decode_url(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') From e438e8146965d2c650c1575dc97809bcc9504f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 03:04:58 +0700 Subject: [PATCH 10/30] [vivo] Improve extraction (closes #19217) --- youtube_dl/extractor/shared.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index eade8fd9e..ff575f592 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,15 +1,15 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( + determine_ext, ExtractorError, int_or_none, + KNOWN_EXTENSIONS, + parse_filesize, url_or_none, urlencode_postdata, - unescapeHTML, ) @@ -26,8 +26,7 @@ class SharedBaseIE(InfoExtractor): video_url = self._extract_video_url(webpage, video_id, url) title = self._extract_title(webpage) - filesize = int_or_none(self._html_search_meta( - 'full:size', webpage, 'file size', fatal=False)) + filesize = int_or_none(self._extract_filesize(webpage)) return { 'id': video_id, @@ -41,6 +40,10 @@ class SharedBaseIE(InfoExtractor): return compat_b64decode(self._html_search_meta( 'full:title', webpage, 'title')).decode('utf-8') + def _extract_filesize(self, webpage): + return self._html_search_meta( + 'full:size', webpage, 'file size', fatal=False) + class SharedIE(SharedBaseIE): IE_DESC = 'shared.sx' @@ -88,19 +91,27 @@ class VivoIE(SharedBaseIE): 'id': 'd7ddda0e78', 'ext': 'mp4', 'title': 'Chicken', - 'filesize': 528031, + 'filesize': 515659, }, } def _extract_title(self, webpage): - data_title = self._search_regex( + title = self._html_search_regex( r'data-name\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', webpage, 'title', default=None, group='title') - if data_title: - return unescapeHTML(re.sub(r"\.[a-z0-9]{3,4}$", "", data_title)) + if title: + ext = determine_ext(title) + if ext.lower() in KNOWN_EXTENSIONS: + title = title.rpartition('.' + ext)[0] + return title return self._og_search_title(webpage) - def _extract_video_url(self, webpage, video_id, *args): + def _extract_filesize(self, webpage): + return parse_filesize(self._search_regex( + r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', + webpage, 'filesize', fatal=False)) + + def _extract_video_url(self, webpage, video_id, url): def decode_url(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') From ea7538209468f630075d08d44ef7b0119f78d2eb Mon Sep 17 00:00:00 2001 From: smed79 <1873139+smed79@users.noreply.github.com> Date: Wed, 22 May 2019 21:30:17 +0100 Subject: [PATCH 11/30] [openload] Add support for oload.press (#21135) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a8e906858..b96be6f64 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -244,7 +244,7 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _DOMAINS = r'(?:openload\.(?:co|io|link|pw)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live|space|services)|oladblock\.(?:services|xyz|me)|openloed\.co)' + _DOMAINS = r'(?:openload\.(?:co|io|link|pw)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|live|space|services)|oladblock\.(?:services|xyz|me)|openloed\.co)' _VALID_URL = r'''(?x) https?:// (?P<host> @@ -357,6 +357,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.services/embed/bs1NWj1dCag/', 'only_matching': True, + }, { + 'url': 'https://oload.press/embed/drTBl1aOTvk/', + 'only_matching': True, }, { 'url': 'https://oladblock.services/f/b8NWEgkqNLI/', 'only_matching': True, From 612300a686fd83d475b7fddc17cb2ccd8ca0b5ef Mon Sep 17 00:00:00 2001 From: ealgase <mostdigitsofpi@gmail.com> Date: Wed, 22 May 2019 16:38:48 -0400 Subject: [PATCH 12/30] [novamov] Remove extractors (#21077) Sites no longer exist --- youtube_dl/extractor/extractors.py | 7 - youtube_dl/extractor/generic.py | 13 -- youtube_dl/extractor/novamov.py | 212 ----------------------------- 3 files changed, 232 deletions(-) delete mode 100644 youtube_dl/extractor/novamov.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3037b5a45..e5aee96c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -772,13 +772,6 @@ from .nova import ( NovaEmbedIE, NovaIE, ) -from .novamov import ( - AuroraVidIE, - CloudTimeIE, - NowVideoIE, - VideoWeedIE, - WholeCloudIE, -) from .nowness import ( NownessIE, NownessPlaylistIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3a13c62eb..eeb0d25f6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2583,19 +2583,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') - # Look for embedded NovaMov-based player - mobj = re.search( - r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) - (?P<url>http://(?:(?:embed|www)\.)? - (?:novamov\.com| - nowvideo\.(?:ch|sx|eu|at|ag|co)| - videoweed\.(?:es|com)| - movshare\.(?:net|sx|ag)| - divxstage\.(?:eu|net|ch|co|at|ag)) - /embed\.php.+?)\1''', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - # Look for embedded Facebook player facebook_urls = FacebookIE._extract_urls(webpage) if facebook_urls: diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py deleted file mode 100644 index 829c71960..000000000 --- a/youtube_dl/extractor/novamov.py +++ /dev/null @@ -1,212 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - NO_DEFAULT, - sanitized_Request, - urlencode_postdata, -) - - -class NovaMovIE(InfoExtractor): - IE_NAME = 'novamov' - IE_DESC = 'NovaMov' - - _VALID_URL_TEMPLATE = r'''(?x) - http:// - (?: - (?:www\.)?%(host)s/(?:file|video|mobile/\#/videos)/| - (?:(?:embed|www)\.)%(host)s/embed(?:\.php|/)?\?(?:.*?&)?\bv= - ) - (?P<id>[a-z\d]{13}) - ''' - _VALID_URL = _VALID_URL_TEMPLATE % {'host': r'novamov\.com'} - - _HOST = 'www.novamov.com' - - _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>' - _FILEKEY_REGEX = r'flashvars\.filekey=(?P<filekey>"?[^"]+"?);' - _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>' - _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>' - _URL_TEMPLATE = 'http://%s/video/%s' - - _TEST = None - - def _check_existence(self, webpage, video_id): - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - def _real_extract(self, url): - video_id = self._match_id(url) - - url = self._URL_TEMPLATE % (self._HOST, video_id) - - webpage = self._download_webpage( - url, video_id, 'Downloading video page') - - self._check_existence(webpage, video_id) - - def extract_filekey(default=NO_DEFAULT): - filekey = self._search_regex( - self._FILEKEY_REGEX, webpage, 'filekey', default=default) - if filekey is not default and (filekey[0] != '"' or filekey[-1] != '"'): - return self._search_regex( - r'var\s+%s\s*=\s*"([^"]+)"' % re.escape(filekey), webpage, 'filekey', default=default) - else: - return filekey - - filekey = extract_filekey(default=None) - - if not filekey: - fields = self._hidden_inputs(webpage) - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', webpage, - 'post url', default=url, group='url') - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(url, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(fields)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - request.add_header('Referer', post_url) - webpage = self._download_webpage( - request, video_id, 'Downloading continue to the video page') - self._check_existence(webpage, video_id) - - filekey = extract_filekey() - - title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False) - - api_response = self._download_webpage( - 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id, - 'Downloading video api response') - - response = compat_urlparse.parse_qs(api_response) - - if 'error_msg' in response: - raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True) - - video_url = response['url'][0] - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': description - } - - -class WholeCloudIE(NovaMovIE): - IE_NAME = 'wholecloud' - IE_DESC = 'WholeCloud' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} - - _HOST = 'www.wholecloud.net' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' - _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' - - _TEST = { - 'url': 'http://www.wholecloud.net/video/559e28be54d96', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': '559e28be54d96', - 'ext': 'flv', - 'title': 'dissapeared image', - 'description': 'optical illusion dissapeared image magic illusion', - } - } - - -class NowVideoIE(NovaMovIE): - IE_NAME = 'nowvideo' - IE_DESC = 'NowVideo' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} - - _HOST = 'www.nowvideo.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<h4>([^<]+)</h4>' - _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' - - _TEST = { - 'url': 'http://www.nowvideo.sx/video/f1d6fce9a968b', - 'md5': '12c82cad4f2084881d8bc60ee29df092', - 'info_dict': { - 'id': 'f1d6fce9a968b', - 'ext': 'flv', - 'title': 'youtubedl test video BaWjenozKc', - 'description': 'Description', - }, - } - - -class VideoWeedIE(NovaMovIE): - IE_NAME = 'videoweed' - IE_DESC = 'VideoWeed' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'videoweed\.(?:es|com)'} - - _HOST = 'www.videoweed.es' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' - _URL_TEMPLATE = 'http://%s/file/%s' - - _TEST = { - 'url': 'http://www.videoweed.es/file/b42178afbea14', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': 'b42178afbea14', - 'ext': 'flv', - 'title': 'optical illusion dissapeared image magic illusion', - 'description': '' - }, - } - - -class CloudTimeIE(NovaMovIE): - IE_NAME = 'cloudtime' - IE_DESC = 'CloudTime' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'cloudtime\.to'} - - _HOST = 'www.cloudtime.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<div[^>]+class=["\']video_det["\'][^>]*>\s*<strong>([^<]+)</strong>' - - _TEST = None - - -class AuroraVidIE(NovaMovIE): - IE_NAME = 'auroravid' - IE_DESC = 'AuroraVid' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'auroravid\.to'} - - _HOST = 'www.auroravid.to' - - _FILE_DELETED_REGEX = r'This file no longer exists on our servers!<' - - _TESTS = [{ - 'url': 'http://www.auroravid.to/video/4rurhn9x446jj', - 'md5': '7205f346a52bbeba427603ba10d4b935', - 'info_dict': { - 'id': '4rurhn9x446jj', - 'ext': 'flv', - 'title': 'search engine optimization', - 'description': 'search engine optimization is used to rank the web page in the google search engine' - }, - 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' - }, { - 'url': 'http://www.auroravid.to/embed/?v=4rurhn9x446jj', - 'only_matching': True, - }] From 186d185b6ecdee102866777121d6abe9ed7f59ba Mon Sep 17 00:00:00 2001 From: Malte Kiefer <malte.kiefer@mailgermania.de> Date: Wed, 22 May 2019 22:46:20 +0200 Subject: [PATCH 13/30] [streamcloud] Reduce waiting time to 6 seconds (#21092) --- youtube_dl/extractor/streamcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 4a410611d..b97bb4374 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -45,7 +45,7 @@ class StreamcloudIE(InfoExtractor): value="([^"]*)" ''', orig_webpage) - self._sleep(12, video_id) + self._sleep(6, video_id) webpage = self._download_webpage( url, video_id, data=urlencode_postdata(fields), headers={ From bbf1defe586f4b4cb7b35aa3da67c5dc786d9a2c Mon Sep 17 00:00:00 2001 From: Georgi Saev <georgi.saev@gmail.com> Date: Wed, 22 May 2019 23:51:50 +0300 Subject: [PATCH 14/30] [bitchute] Fix uploader extraction (#21076) --- youtube_dl/extractor/bitchute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 4f39424f5..1d69dafbd 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -65,8 +65,9 @@ class BitChuteIE(InfoExtractor): webpage, default=None) or self._html_search_meta( 'twitter:image:src', webpage, 'thumbnail') uploader = self._html_search_regex( - r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage, - 'uploader', fatal=False) + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) return { 'id': video_id, From 2c53c0ebc63b7fbb36d05491d5d3796d3e511e26 Mon Sep 17 00:00:00 2001 From: NRTICN <50528161+NRTICN@users.noreply.github.com> Date: Wed, 22 May 2019 20:56:54 +0000 Subject: [PATCH 15/30] [pornhub] Use https (#21061) --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index bf8f0be88..cb59d526f 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -170,7 +170,7 @@ class PornHubIE(PornHubBaseIE): def dl_webpage(platform): self._set_cookie(host, 'platform', platform) return self._download_webpage( - 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id), + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') From afd4985f72a6641907aee1cd0b4b42da524b0ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 06:06:49 +0700 Subject: [PATCH 16/30] [travis] Force dist to Ubuntu Trusty by default According to https://blog.travis-ci.com/2019-04-15-xenial-default-build-environment Ubuntu Xenial is now default, but it lacks python 2.6, 3.2 and 3.3 support needed by tests --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 82e81d078..6d16c2955 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ python: - "3.6" - "pypy" - "pypy3" +dist: trusty env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download From 9c5f2988b91609d49f7010ac580376f42e01d4f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 23:38:01 +0700 Subject: [PATCH 17/30] [criterion] Remove extractor (closes #21195) --- youtube_dl/extractor/criterion.py | 39 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 40 deletions(-) delete mode 100644 youtube_dl/extractor/criterion.py diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py deleted file mode 100644 index f7815b905..000000000 --- a/youtube_dl/extractor/criterion.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class CriterionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P<id>[0-9]+)-.+' - _TEST = { - 'url': 'http://www.criterion.com/films/184-le-samourai', - 'md5': 'bc51beba55685509883a9a7830919ec3', - 'info_dict': { - 'id': '184', - 'ext': 'mp4', - 'title': 'Le Samouraï', - 'description': 'md5:a2b4b116326558149bef81f76dcbb93f', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - final_url = self._search_regex( - r'so\.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage) - thumbnail = self._search_regex( - r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;', - webpage, 'thumbnail url') - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5aee96c2..7705f9bdd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -240,7 +240,6 @@ from .condenast import CondeNastIE from .corus import CorusIE from .cracked import CrackedIE from .crackle import CrackleIE -from .criterion import CriterionIE from .crooksandliars import CrooksAndLiarsIE from .crunchyroll import ( CrunchyrollIE, From 8af49fc276b2cf2154b9342de4b4cd66f9d17af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 23:48:06 +0700 Subject: [PATCH 18/30] [pornflip] Remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/pornflip.py | 101 ----------------------------- 2 files changed, 102 deletions(-) delete mode 100644 youtube_dl/extractor/pornflip.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7705f9bdd..eb5efd1e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -888,7 +888,6 @@ from .polskieradio import ( from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE -from .pornflip import PornFlipIE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py deleted file mode 100644 index 025985fbc..000000000 --- a/youtube_dl/extractor/pornflip.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, -) -from ..utils import ( - int_or_none, - try_get, - unified_timestamp, -) - - -class PornFlipIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', - 'md5': '98c46639849145ae1fd77af532a9278c', - 'info_dict': { - 'id': 'wz7DfNhMmep', - 'ext': 'mp4', - 'title': '2 Amateurs swallow make his dream cumshots true', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 112, - 'timestamp': 1481655502, - 'upload_date': '20161213', - 'uploader_id': '106786', - 'uploader': 'figifoto', - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', - 'only_matching': True, - }, { - 'url': 'https://www.pornflip.com/v/EkRD6-vS2-s', - 'only_matching': True, - }, { - 'url': 'https://www.pornflip.com/embed/EkRD6-vS2-s', - 'only_matching': True, - }, { - 'url': 'https://www.pornflip.com/v/NG9q6Pb_iK8', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.pornflip.com/v/%s' % video_id, video_id) - - flashvars = compat_parse_qs(self._search_regex( - r'<embed[^>]+flashvars=(["\'])(?P<flashvars>(?:(?!\1).)+)\1', - webpage, 'flashvars', group='flashvars')) - - title = flashvars['video_vars[title]'][0] - - def flashvar(kind): - return try_get( - flashvars, lambda x: x['video_vars[%s]' % kind][0], compat_str) - - formats = [] - for key, value in flashvars.items(): - if not (value and isinstance(value, list)): - continue - format_url = value[0] - if key == 'video_vars[hds_manifest]': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - continue - height = self._search_regex( - r'video_vars\[video_urls\]\[(\d+)', key, 'height', default=None) - if not height: - continue - formats.append({ - 'url': format_url, - 'format_id': 'http-%s' % height, - 'height': int_or_none(height), - }) - self._sort_formats(formats) - - uploader = self._html_search_regex( - (r'<span[^>]+class="name"[^>]*>\s*<a[^>]+>\s*<strong>(?P<uploader>[^<]+)', - r'<meta[^>]+content=(["\'])[^>]*\buploaded by (?P<uploader>.+?)\1'), - webpage, 'uploader', fatal=False, group='uploader') - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'thumbnail': flashvar('big_thumb'), - 'duration': int_or_none(flashvar('duration')), - 'timestamp': unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')), - 'uploader_id': flashvar('author_id'), - 'uploader': uploader, - 'view_count': int_or_none(flashvar('views')), - 'age_limit': 18, - } From f856816b940d95e995c2b9995d01097ab144a1af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 23:52:11 +0700 Subject: [PATCH 19/30] [extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169) --- youtube_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 937237b3f..9c3e9eec6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -67,6 +67,7 @@ from ..utils import ( sanitized_Request, sanitize_filename, str_or_none, + strip_or_none, unescapeHTML, unified_strdate, unified_timestamp, @@ -2480,7 +2481,7 @@ class InfoExtractor(object): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = media_attributes.get('src') + src = strip_or_none(media_attributes.get('src')) if src: _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) @@ -2490,7 +2491,7 @@ class InfoExtractor(object): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = dict_get(s_attr, ('src', 'data-video-src', 'data-src')) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) if not src: continue f = parse_content_type(s_attr.get('type')) @@ -2533,7 +2534,7 @@ class InfoExtractor(object): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') if not kind or kind in ('subtitles', 'captions'): - src = track_attributes.get('src') + src = strip_or_none(track_attributes.get('src')) if not src: continue lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') From 53cd37bac50e7a927deba5b67a4412301b96230d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 May 2019 23:58:35 +0700 Subject: [PATCH 20/30] [utils] Improve strip_or_none --- test/test_utils.py | 13 +++++++++++++ youtube_dl/utils.py | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9ef0e422b..71980b3fc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -73,6 +73,7 @@ from youtube_dl.utils import ( smuggle_url, str_to_int, strip_jsonp, + strip_or_none, timeconvert, unescapeHTML, unified_strdate, @@ -752,6 +753,18 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + def test_strip_or_none(self): + self.assertEqual(strip_or_none(' abc'), 'abc') + self.assertEqual(strip_or_none('abc '), 'abc') + self.assertEqual(strip_or_none(' abc '), 'abc') + self.assertEqual(strip_or_none('\tabc\t'), 'abc') + self.assertEqual(strip_or_none('\n\tabc\n\t'), 'abc') + self.assertEqual(strip_or_none('abc'), 'abc') + self.assertEqual(strip_or_none(''), '') + self.assertEqual(strip_or_none(None), None) + self.assertEqual(strip_or_none(42), None) + self.assertEqual(strip_or_none([]), None) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9be9b2e76..ead9bd862 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1951,8 +1951,8 @@ def bool_or_none(v, default=None): return v if isinstance(v, bool) else default -def strip_or_none(v): - return None if v is None else v.strip() +def strip_or_none(v, default=None): + return v.strip() if isinstance(v, compat_str) else default def url_or_none(url): From 11ec06de7f01bced1f4f4b6484e5190cbb9ed9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 May 2019 00:35:46 +0700 Subject: [PATCH 21/30] [24video] Add support for 24video.site (closes #21193) --- youtube_dl/extractor/twentyfourvideo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 4b3b3e705..d16f55500 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sexy?|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sexy?|tube|adult|site))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -42,6 +42,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'http://www.24video.tube/video/view/2363750', 'only_matching': True, + }, { + 'url': 'https://www.24video.site/video/view/2640421', + 'only_matching': True, }] def _real_extract(self, url): From f4cc2ca503239aadd5cb75f83cb873bc5816dfdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 May 2019 00:38:06 +0700 Subject: [PATCH 22/30] [24video] Add support for porno.24video.net (closes #21194) --- youtube_dl/extractor/twentyfourvideo.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index d16f55500..1d66eeaff 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,18 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sexy?|tube|adult|site))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?P<host> + (?:(?:www|porno)\.)?24video\. + (?:net|me|xxx|sexy?|tube|adult|site) + )/ + (?: + video/(?:(?:view|xml)/)?| + player/new24_play\.swf\?id= + ) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -45,6 +56,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://www.24video.site/video/view/2640421', 'only_matching': True, + }, { + 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', + 'only_matching': True, }] def _real_extract(self, url): From 3fe774722b0855d572b6885ff20fb6f4f96f3e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20P=C3=B6schel?= <github@basicmaster.de> Date: Sun, 19 May 2019 18:11:17 +0200 Subject: [PATCH 23/30] [srgssrplay] Add support for popupvideoplayer URLs --- youtube_dl/extractor/srgssr.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index bb73eb1d5..ff1b67876 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -106,7 +106,7 @@ class SRGSSRIE(InfoExtractor): class SRGSSRPlayIE(InfoExtractor): IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/(?:[^/]+/|popup)(?P<type>video|audio)(?:/[^?]+|player)\?id=(?P<id>[0-9a-f\-]{36}|\d+)' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', @@ -163,6 +163,17 @@ class SRGSSRPlayIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 'md5': 'f6247aa7c905b81c9ba7f50fb22e2fbd', + 'info_dict': { + 'id': 'c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 'ext': 'mp4', + 'upload_date': '20190122', + 'title': 'Erster Selfie-Stick (1983)', + 'description': 'md5:23a6b40024e583137e4137f5946543c1', + 'timestamp': 1548155133, + } }] def _real_extract(self, url): From 25b83c2a0e29c75372f0ce26d2b4ecf493e8b28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 May 2019 00:43:22 +0700 Subject: [PATCH 24/30] [srgssrplay] Improve _VALID_URL (closes #21155) --- youtube_dl/extractor/srgssr.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index ff1b67876..170dce87f 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -106,7 +106,16 @@ class SRGSSRIE(InfoExtractor): class SRGSSRPlayIE(InfoExtractor): IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/(?:[^/]+/|popup)(?P<type>video|audio)(?:/[^?]+|player)\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|play)\.)? + (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/ + (?: + [^/]+/(?P<type>video|audio)/[^?]+| + popup(?P<type_2>video|audio)player + ) + \?id=(?P<id>[0-9a-f\-]{36}|\d+) + ''' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', @@ -165,18 +174,13 @@ class SRGSSRPlayIE(InfoExtractor): } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', - 'md5': 'f6247aa7c905b81c9ba7f50fb22e2fbd', - 'info_dict': { - 'id': 'c4dba0ca-e75b-43b2-a34f-f708a4932e01', - 'ext': 'mp4', - 'upload_date': '20190122', - 'title': 'Erster Selfie-Stick (1983)', - 'description': 'md5:23a6b40024e583137e4137f5946543c1', - 'timestamp': 1548155133, - } + 'only_matching': True, }] def _real_extract(self, url): - bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + bu = mobj.group('bu') + media_type = mobj.group('type') or mobj.group('type_2') + media_id = mobj.group('id') # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From 0d297518904f8537e8103052d19f0940a272413d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 May 2019 23:14:47 +0100 Subject: [PATCH 25/30] [youtube] improve DRM protected videos detection(#1774) --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 06005f8d2..5f1957a59 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1789,9 +1789,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - if video_info.get('license_info'): - raise ExtractorError('This video is DRM protected.', expected=True) - video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} @@ -1927,7 +1924,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data: + if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): continue stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) # Unsupported FORMAT_STREAM_TYPE_OTF @@ -2323,6 +2320,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) self.mark_watched(video_id, video_info, player_response) From ead467a9c1cab3f26279ba9c57cf18598507ba79 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 28 May 2019 04:58:12 +0100 Subject: [PATCH 26/30] [rtp] fix extraction(closes #15099) --- youtube_dl/extractor/rtp.py | 83 ++++++++++++++----------------------- 1 file changed, 30 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 533ee27cb..02986f442 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -1,9 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, +) class RTPIE(InfoExtractor): @@ -18,10 +20,6 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, @@ -33,57 +31,36 @@ class RTPIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_meta( 'twitter:title', webpage, display_name='title', fatal=True) - description = self._html_search_meta('description', webpage) - thumbnail = self._og_search_thumbnail(webpage) - player_config = self._search_regex( - r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config') - config = self._parse_json(player_config, video_id) - - path, ext = config.get('file').rsplit('.', 1) - formats = [{ - 'format_id': 'rtmp', - 'ext': ext, - 'vcodec': config.get('type') == 'audio' and 'none' or None, - 'preference': -2, - 'url': 'rtmp://{streamer:s}/{application:s}'.format(**config), - 'app': config.get('application'), - 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), - 'page_url': url, - 'rtmp_live': config.get('live', False), - 'player_url': 'http://programas.rtp.pt/play/player.swf?v3', - 'rtmp_real_time': True, - }] - - # Construct regular HTTP download URLs - replacements = { - 'audio': { - 'format_id': 'mp3', - 'pattern': r'^nas2\.share/wavrss/', - 'repl': 'http://rsspod.rtp.pt/podcasts/', - 'vcodec': 'none', - }, - 'video': { - 'format_id': 'mp4_h264', - 'pattern': r'^nas2\.share/h264/', - 'repl': 'http://rsspod.rtp.pt/videocasts/', - 'vcodec': 'h264', - }, - } - r = replacements[config['type']] - if re.match(r['pattern'], config['file']) is not None: - formats.append({ - 'format_id': r['format_id'], - 'url': re.sub(r['pattern'], r['repl'], config['file']), - 'vcodec': r['vcodec'], - }) - - self._sort_formats(formats) + config = self._parse_json(self._search_regex( + r'(?s)RTPPlayer\(({.+?})\);', webpage, + 'player config'), video_id, js_to_json) + file_url = config['file'] + ext = determine_ext(file_url) + if ext == 'm3u8': + file_key = config.get('fileKey') + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=file_key) + if file_key: + formats.append({ + 'url': 'https://cdn-ondemand.rtp.pt' + file_key, + 'preference': 1, + }) + self._sort_formats(formats) + else: + formats = [{ + 'url': file_url, + 'ext': ext, + }] + if config.get('mediaType') == 'audio': + for f in formats: + f['vcodec'] = 'none' return { 'id': video_id, 'title': title, 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), } From 33b2218b2f67b4c06d45e28035968320e0a0bf05 Mon Sep 17 00:00:00 2001 From: bitraid <bitraid@protonmail.ch> Date: Tue, 28 May 2019 15:31:11 +0300 Subject: [PATCH 27/30] [LiveLeak] Check if the original videos exist (closes #21206) (#21208) --- youtube_dl/extractor/liveleak.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 5df14bb41..4ac437c8b 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -82,6 +82,10 @@ class LiveLeakIE(InfoExtractor): }, { 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', 'only_matching': True, + }, { + # No original video + 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804', + 'only_matching': True, }] @staticmethod @@ -134,11 +138,13 @@ class LiveLeakIE(InfoExtractor): orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) if a_format['url'] != orig_url: format_id = a_format.get('format_id') - formats.append({ - 'format_id': 'original' + ('-' + format_id if format_id else ''), - 'url': orig_url, - 'preference': 1, - }) + format_id = 'original' + ('-' + format_id if format_id else '') + if self._is_valid_url(orig_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': orig_url, + 'preference': 1, + }) self._sort_formats(formats) info_dict['formats'] = formats From 26a87972a9dcd4eac6112c06372745d6d0983f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 29 May 2019 04:42:19 +0700 Subject: [PATCH 28/30] [viki] Switch to HTTPS (closes #21001) --- youtube_dl/extractor/viki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 546de95d8..b0dcdc0e6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -21,7 +21,7 @@ from ..utils import ( class VikiBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' - _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' _APP_VERSION = '2.2.5.1428709186' @@ -377,7 +377,7 @@ class VikiChannelIE(VikiBaseIE): for video in page['response']: video_id = video['id'] entries.append(self.url_result( - 'http://www.viki.com/videos/%s' % video_id, 'Viki')) + 'https://www.viki.com/videos/%s' % video_id, 'Viki')) if not page['pagination']['next']: break From 0e2dd3fcbce3a01dc4f6ce3911f5f279c3827edf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 1 Jun 2019 11:16:44 +0100 Subject: [PATCH 29/30] [vrv] extract adaptive_hls formats(closes #21243) --- youtube_dl/extractor/vrv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index b698bf66c..c814a8a4a 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -130,7 +130,7 @@ class VRVIE(VRVBaseIE): self._TOKEN_SECRET = token_credentials['oauth_token_secret'] def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash'): + if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): return [] stream_id_list = [] if audio_lang: @@ -140,7 +140,7 @@ class VRVIE(VRVBaseIE): format_id = stream_format if stream_id_list: format_id += '-' + '-'.join(stream_id_list) - if stream_format == 'hls': + if 'hls' in stream_format: adaptive_formats = self._extract_m3u8_formats( url, video_id, 'mp4', m3u8_id=format_id, note='Downloading %s information' % format_id, From c5eb75b35a709baa134fb96e774a4b712cc65e93 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 1 Jun 2019 15:12:30 +0100 Subject: [PATCH 30/30] [prosiebensat1] add support for new API(closes #21272) --- youtube_dl/extractor/prosiebensat1.py | 197 ++++++++++++++++---------- 1 file changed, 121 insertions(+), 76 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7d11c2b9b..e19a470a5 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -16,6 +16,11 @@ from ..utils import ( class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + def _extract_video_info(self, url, clip_id): client_location = url @@ -31,93 +36,128 @@ class ProSiebenSat1BaseIE(InfoExtractor): if video.get('is_protected') is True: raise ExtractorError('This video is DRM protected.', expected=True) - duration = float_or_none(video.get('duration')) - source_ids = [compat_str(source['id']) for source in video['sources']] - - client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() - - sources = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, - clip_id, 'Downloading sources JSON', query={ - 'access_token': self._TOKEN, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': self._CLIENT_NAME, - }) - server_id = sources['server_id'] - - def fix_bitrate(bitrate): - bitrate = int_or_none(bitrate) - if not bitrate: - return None - return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - formats = [] - for source_id in source_ids: - client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() - urls = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, - clip_id, 'Downloading urls JSON', fatal=False, query={ + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + server_token = (self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False) or {}).get('server_token') + if server_token: + urls = (self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': server_token, + 'video_id': clip_id, + }, fatal=False) or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ 'access_token': self._TOKEN, 'client_id': client_id, 'client_location': client_location, 'client_name': self._CLIENT_NAME, - 'server_id': server_id, - 'source_ids': source_id, }) - if not urls: - continue - if urls.get('status_code') != 0: - raise ExtractorError('This video is unavailable', expected=True) - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() - for source in urls_sources: - source_url = source.get('url') - if not source_url: + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: continue - protocol = source.get('protocol') - mimetype = source.get('mimetype') - if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - source_url, clip_id, f4m_id='hds', fatal=False)) - elif mimetype == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - source_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif mimetype == 'application/dash+xml': - formats.extend(self._extract_mpd_formats( - source_url, clip_id, mpd_id='dash', fatal=False)) - else: - tbr = fix_bitrate(source['bitrate']) - if protocol in ('rtmp', 'rtmpe'): - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'tbr': tbr, - 'ext': 'flv', - 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), - }) + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) else: - formats.append({ - 'url': source_url, - 'tbr': tbr, - 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), - }) + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) self._sort_formats(formats) return { - 'duration': duration, + 'duration': float_or_none(video.get('duration')), 'formats': formats, } @@ -344,6 +384,11 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): _TOKEN = 'prosieben' _SALT = '01!8d8F_)r9]4s[qeuXfP%' _CLIENT_NAME = 'kolibri-2.0.19-splec4' + + _ACCESS_ID = 'x_prosiebenmaxx-de' + _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' + _IV = 'Aeluchoc6aevechuipiexeeboowedaok' + _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"',