From 4b9fdaaad3a5b71088304895bcbcf73d21e602b4 Mon Sep 17 00:00:00 2001
From: "Sylvain Saubier (ResponSyS)"
Date: Fri, 27 Oct 2017 01:16:09 +0200
Subject: [PATCH] [ENSSavoirs] Add new extractor

---
 youtube_dl/extractor/common.py     |  2 +-
 youtube_dl/extractor/enssavoirs.py | 70 ++++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 youtube_dl/extractor/enssavoirs.py

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index a69240693..5713531c4 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -180,7 +180,7 @@ class InfoExtractor(object):
     The following fields are optional:
 
     alt_title:      A secondary title of the video.
-    display_id      An alternative identifier for the video, not necessarily
+    display_id:     An alternative identifier for the video, not necessarily
                     unique, but available before title. Typically, id is
                     something like "4234987", title "Dancing naked mole rats",
                     and display_id "dancing-naked-mole-rats"
diff --git a/youtube_dl/extractor/enssavoirs.py b/youtube_dl/extractor/enssavoirs.py
new file mode 100644
index 000000000..ada8717c5
--- /dev/null
+++ b/youtube_dl/extractor/enssavoirs.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ENSSavoirsIE(InfoExtractor):
+    """Extract the downloadable video of a savoirs.ens.fr "expose" page."""
+
+    _VALID_URL = r'https?://(?:www\.)?savoirs\.ens\.fr/expose\.php\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://savoirs.ens.fr/expose.php?id=2516#foo',
+        'md5': 'a95f51212dd2d104fb5655bdf1d03071',
+        'info_dict': {
+            'id': '2516',
+            'ext': 'mp4',
+            'format_id': 'mp4',
+            # TODO ::: Mon 23 Oct 2017 10:38:07 PM CEST
+            # how to extract?
+            #'formats': {
+            #    'format_id': 'mp4',
+            #    'tbr': 1612,
+            #    'fps': 25,
+            #    'acodec': 'aac',
+            #    'vcodec': 'h264',
+            #    'width': 360,
+            #    'height': 640,
+            #    'filesize_approx': '600M'
+            # },
+            'title': 'Chiffrer mieux pour (dé)chiffrer plus',
+            'thumbnail': r're:^https?://(?:www\.)?savoirs\.ens\.fr/uploads/images/exposes/2516\.jpg$',
+            'creator': 'Anne Canteaut',
+            # TODO ::: Mon 23 Oct 2017 10:38:34 PM CEST
+            # hard to extract
+            #'release_date': 20160413,
+            #'description': "some long long text in "
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict scraped from the expose page at ``url``."""
+        # TODO does this default to HTTP and not HTTPS?
+        url_base = "//savoirs.ens.fr/"
+
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The link's href is appended to url_base, so it is presumably site-relative.
+        media_url = url_base + self._html_search_regex(r'<a[^>]+href=["\'](?P<media_url>[^"\']+)["\'][^>]*>Télécharger\sla\svidéo', webpage, 'media_url')
+        title = self._html_search_regex(r'<span[^>]+class=["\']titrePageExpose["\'][^>]*>(?P<title>[^<]+)</span>', webpage, 'title')
+        # Whatever follows the last dot of the media URL is used as ext and format_id.
+        ext = media_url.split('.')[-1]
+        # The thumbnail URL is built from the video id, not scraped from the page.
+        thumbnail = "%suploads/images/exposes/%s.jpg" % (url_base, video_id)
+        # "ENS Savoirs" is the supplied default for pages without a speaker span.
+        creator = self._html_search_regex(r'<span[^>]+class=["\']exposeConferencierNom["\'][^>]*>(?P<creator>[^<]+)</span>', webpage, 'creator', fatal=False, default="ENS Savoirs")
+
+        return {
+            'id': video_id,
+            'url': media_url,
+            'title': title,
+            # Fails (need to extract text of <div id=description> and remove html tags)
+            #'description': self._og_search_description(webpage),
+            'ext': ext,
+            'format_id': ext,
+            'thumbnail': thumbnail,
+            'creator': creator,
+            #'formats': { 'format_id': ext }
+            #'release_date': release_date,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 09b20a39a..c4dae1eb4 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -315,6 +315,7 @@ from .ellentv import (
 from .elpais import ElPaisIE
 from .embedly import EmbedlyIE
 from .engadget import EngadgetIE
+from .enssavoirs import ENSSavoirsIE
 from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE