[ENSSavoirs] Add new extractor

2024-11-22 16:44:32 +01:00 · 2017-10-27 01:16:09 +02:00 · 2017-10-27 01:16:09 +02:00 · 4b9fdaaad3
commit 4b9fdaaad3
parent 55c727a547
3 changed files with 65 additions and 1 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -180,7 +180,7 @@ class InfoExtractor(object):
    The following fields are optional:
    alt_title:      A secondary title of the video.
-    display_id      An alternative identifier for the video, not necessarily
+    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
--- a/youtube_dl/extractor/enssavoirs.py
+++ b/youtube_dl/extractor/enssavoirs.py
@ -0,0 +1,63 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 class ENSSavoirsIE(InfoExtractor):
     """Extractor for talk pages on savoirs.ens.fr (``expose.php?id=<digits>``).

     The media URL is scraped from the page's video download link, the
     title and speaker name from dedicated ``<span>`` elements, and the
     thumbnail URL is derived from the numeric talk id.
     """

     _VALID_URL = r'https?://(?:www\.)?savoirs\.ens\.fr/expose\.php\?id=(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://savoirs.ens.fr/expose.php?id=2516#foo',
         'md5': 'a95f51212dd2d104fb5655bdf1d03071',
         'info_dict': {
             'id': '2516',
             'ext': 'mp4',
             'format_id': 'mp4',
             'title': 'Chiffrer mieux pour (dé)chiffrer plus',
             'thumbnail': r're:^https?://(?:www\.)?savoirs\.ens\.fr/uploads/images/exposes/2516\.jpg$',
             'creator': 'Anne Canteaut',
             # TODO: fields below are not extracted yet and therefore not
             # asserted. Values noted by the original author for this talk:
             #   stream details: tbr 1612, fps 25, acodec aac, vcodec h264,
             #                   width 360, height 640, filesize_approx 600M
             #   release_date:   20160413
             #   description:    the text content of <div id=description>
         }
     }]

     def _real_extract(self, url):
         """Build the info dict for the talk page at *url*.

         Downloads the page and scrapes three values from it: the video
         download link, the talk title and the speaker name. Only the
         speaker lookup supplies a default ("ENS Savoirs"); the other two
         are searched without one.

         Returns a dict with the keys ``id``, ``url``, ``title``, ``ext``,
         ``format_id``, ``thumbnail`` and ``creator``.
         """
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

         # Anchor on the canonical host but inherit the scheme of the page
         # that was requested, so no scheme-less URL ends up in the result.
         base_url = compat_urlparse.urljoin(url, '//savoirs.ens.fr/')

         # Resolve the href against the base instead of concatenating
         # strings: this also copes with root-relative or absolute links.
         media_href = self._html_search_regex(
             r'<a[^>]+href=["\'](?P<media_url>[^"\']+)["\'][^>]*>Télécharger\sla\svidéo</a>',
             webpage, 'media_url')
         media_url = compat_urlparse.urljoin(base_url, media_href)

         title = self._html_search_regex(
             r'<span[^>]+class=["\']titrePageExpose["\'][^>]*>(?P<title>[^<]+)</span>',
             webpage, 'title')

         # Take the extension from the URL path only, so that a query
         # string or fragment can never leak into ext/format_id.
         ext = compat_urlparse.urlparse(media_url).path.split('.')[-1]

         thumbnail = compat_urlparse.urljoin(
             base_url, 'uploads/images/exposes/%s.jpg' % video_id)

         # `default` alone is enough: in youtube-dl's `_search_regex` a
         # supplied default takes precedence over `fatal`.
         creator = self._html_search_regex(
             r'<span[^>]+class=["\']exposeConferencierNom["\'][^>]*>(?P<creator>[^<]+)</span>',
             webpage, 'creator', default='ENS Savoirs')

         # TODO: also return 'description' (needs the text of
         # <div id=description> with its HTML tags removed), 'release_date'
         # and detailed 'formats' metadata.
         return {
             'id': video_id,
             'url': media_url,
             'title': title,
             'ext': ext,
             'format_id': ext,
             'thumbnail': thumbnail,
             'creator': creator,
         }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -315,6 +315,7 @@ from .ellentv import (
 from .elpais import ElPaisIE
 from .embedly import EmbedlyIE
 from .engadget import EngadgetIE
 from .enssavoirs import ENSSavoirsIE
 from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE