From 1c45ff5572e0844b2ad26c2c0d477edc81e6b5b0 Mon Sep 17 00:00:00 2001 From: tsia Date: Mon, 2 Mar 2020 19:27:40 +0100 Subject: [PATCH 1/5] [vimeo] Fix subtitles URLs (#24209) --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 1da4ced96..8cd611e1e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -33,6 +33,7 @@ from ..utils import ( unified_timestamp, unsmuggle_url, urlencode_postdata, + urljoin, unescapeHTML, ) @@ -191,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], + 'url': urljoin('https://vimeo.com', tt['url']), }] thumbnails = [] From 3b5399ce0f85f46ce856d47c725a437c72dcce6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Mar 2020 01:40:48 +0700 Subject: [PATCH 2/5] [servus] Add support for new URL schema (closes #23475, closes #23583, closes #24142) --- youtube_dl/extractor/servus.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index e579d42cf..9401bf2cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,9 +7,18 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P[aA]{2}-\w+|\d+-\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P[aA]{2}-\w+|\d+-\d+) + ''' _TESTS = [{ - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -18,6 +27,10 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, From 0e30a7b9732dbecc63527df6037c5fbea964b1fd Mon Sep 17 00:00:00 2001 From: jxu <7989982+jxu@users.noreply.github.com> Date: Mon, 2 Mar 2020 13:46:00 -0500 Subject: [PATCH 3/5] [youtube:playlist] Fix tests (closes #23872) (#23885) --- youtube_dl/extractor/youtube.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eacaa5ecd..e06290427 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2495,20 +2495,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = _VIDEO_RE_TPL % r'(?P[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2518,7 +2521,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', From ac379fa236c01ed1d3601f013d755066b92709a4 Mon Sep 17 00:00:00 2001 From: 3risian <59593325+3risian@users.noreply.github.com> Date: Tue, 7 Jan 2020 18:34:51 +1100 Subject: [PATCH 4/5] [peertube] Improve extraction --- youtube_dl/extractor/peertube.py | 87 +++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d3a83ea2b..307712196 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, + str_or_none, try_get, unified_timestamp, url_or_none, @@ -423,26 +424,30 @@ class PeerTubeIE(InfoExtractor): (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', - 'md5': '80f24ff364cc9d333529506a263e7feb', + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', 'info_dict': { - 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', - 'title': 'wow', - 'description': 'wow such video, so gif', + 'title': 'What is PeerTube?', + 'description': '**[Want to help to translate this video?](https://weblate.framasoft.org/projects/what-is-peertube-video/)**\r\n\r\n**Take back the control of your videos! [#JoinPeertube](https://joinpeertube.org)**\r\n*A decentralized video hosting network, based on free/libre software!*\r\n\r\n**Animation Produced by:** [LILA](https://libreart.info) - [ZeMarmot Team](https://film.zemarmot.net)\r\n*Directed by* Aryeom\r\n*Assistant* Jehan\r\n**Licence**: [CC-By-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)\r\n\r\n**Sponsored by** [Framasoft](https://framasoft.org)\r\n\r\n**Music**: [Red Step Forward](http://play.dogmazic.net/song.php?song_id=52491) - CC-By Ken Bushima\r\n\r\n**Movie Clip**: [Caminades 3: Llamigos](http://www.caminandes.com/) CC-By Blender Institute\r\n\r\n**Video sources**: https://gitlab.gnome.org/Jehan/what-is-peertube/', 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 'timestamp': 1519297480, - 'upload_date': '20180222', - 'uploader': 'Luclu7', - 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', - 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', - 'license': 'Unknown', - 'duration': 3, + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, 'view_count': int, 'like_count': int, 'dislike_count': int, - 'tags': list, - 'categories': list, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', @@ -484,6 +489,23 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries + def _get_subtitles(self, host, video_id): + video_captions = self._download_json( + 'https://%s/api/v1/videos/%s/captions' % (host, video_id), video_id, fatal=False) + if not isinstance(video_captions, dict): + return None + + subtitles = {} + for entry in video_captions.get('data'): + language_id = try_get(entry, lambda x: x['language']['id'], compat_str) + caption_path = str_or_none(entry.get('captionPath')) + if language_id and caption_path: + caption_url = urljoin('https://%s' % host, entry.get('captionPath')) + subtitles.setdefault(language_id, []).append({ + 'url': caption_url, + }) + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') @@ -513,10 +535,25 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - def account_data(field): - return try_get(video, lambda x: x['account'][field], compat_str) + video_description = self._download_json( + 'https://%s/api/v1/videos/%s/description' % (host, video_id), video_id, fatal=False) - category = try_get(video, lambda x: x['category']['label'], compat_str) + description = None + if isinstance(video_description, dict): + description = str_or_none(video_description.get('description')) + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -528,14 +565,17 @@ class PeerTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video.get('description'), + 'description': description, 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName'), - 'uploader_id': account_data('uuid'), - 'uploder_url': account_data('url'), - 'license': try_get( - video, lambda x: x['licence']['label'], compat_str), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -544,4 +584,5 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, + 'subtitles': subtitles } From 1e1c1960aa154a6e257e83e94e86ee6dc8b0b362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Mar 2020 03:01:23 +0700 Subject: [PATCH 5/5] [peertube] Fix issues and improve extraction (closes #23657) --- youtube_dl/extractor/peertube.py | 56 +++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 307712196..48fb95416 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -416,6 +416,7 @@ class PeerTubeIE(InfoExtractor): peertube\.cpy\.re )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P[^:]+):| @@ -430,7 +431,7 @@ class PeerTubeIE(InfoExtractor): 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', 'title': 'What is PeerTube?', - 'description': '**[Want to help to translate this video?](https://weblate.framasoft.org/projects/what-is-peertube-video/)**\r\n\r\n**Take back the control of your videos! [#JoinPeertube](https://joinpeertube.org)**\r\n*A decentralized video hosting network, based on free/libre software!*\r\n\r\n**Animation Produced by:** [LILA](https://libreart.info) - [ZeMarmot Team](https://film.zemarmot.net)\r\n*Directed by* Aryeom\r\n*Assistant* Jehan\r\n**Licence**: [CC-By-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)\r\n\r\n**Sponsored by** [Framasoft](https://framasoft.org)\r\n\r\n**Music**: [Red Step Forward](http://play.dogmazic.net/song.php?song_id=52491) - CC-By Ken Bushima\r\n\r\n**Movie Clip**: [Caminades 3: Llamigos](http://www.caminandes.com/) CC-By Blender Institute\r\n\r\n**Video sources**: https://gitlab.gnome.org/Jehan/what-is-peertube/', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', 'thumbnail': r're:https?://.*\.(?:jpg|png)', 'timestamp': 1538391166, 'upload_date': '20181001', @@ -489,21 +490,29 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries - def _get_subtitles(self, host, video_id): - video_captions = self._download_json( - 'https://%s/api/v1/videos/%s/captions' % (host, video_id), video_id, fatal=False) - if not isinstance(video_captions, dict): - return None + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return subtitles = {} - for entry in video_captions.get('data'): - language_id = try_get(entry, lambda x: x['language']['id'], compat_str) - caption_path = str_or_none(entry.get('captionPath')) - if language_id and caption_path: - caption_url = urljoin('https://%s' % host, entry.get('captionPath')) - subtitles.setdefault(language_id, []).append({ - 'url': caption_url, - }) + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) return subtitles def _real_extract(self, url): @@ -511,8 +520,8 @@ class PeerTubeIE(InfoExtractor): host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') - video = self._download_json( - 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + video = self._call_api( + host, video_id, '', note='Downloading video JSON') title = video['name'] @@ -535,12 +544,15 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - video_description = self._download_json( - 'https://%s/api/v1/videos/%s/description' % (host, video_id), video_id, fatal=False) + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) description = None - if isinstance(video_description, dict): - description = str_or_none(video_description.get('description')) + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') subtitles = self.extract_subtitles(host, video_id) @@ -569,10 +581,10 @@ class PeerTubeIE(InfoExtractor): 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), - 'uploader_id': str(account_data('id', int)), + 'uploader_id': str_or_none(account_data('id', int)), 'uploader_url': url_or_none(account_data('url', compat_str)), 'channel': channel_data('displayName', compat_str), - 'channel_id': str(channel_data('id', int)), + 'channel_id': str_or_none(channel_data('id', int)), 'channel_url': url_or_none(channel_data('url', compat_str)), 'language': data('language', 'id', compat_str), 'license': data('licence', 'label', compat_str),