1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-26 10:24:33 +01:00

[ENSSavoirs] Add new extractor

This commit is contained in:
Sylvain Saubier (ResponSyS) 2017-10-27 01:16:09 +02:00 committed by ResponSySS
parent 55c727a547
commit 4b9fdaaad3
3 changed files with 65 additions and 1 deletions

View File

@ -180,7 +180,7 @@ class InfoExtractor(object):
The following fields are optional: The following fields are optional:
alt_title: A secondary title of the video. alt_title: A secondary title of the video.
display_id An alternative identifier for the video, not necessarily display_id: An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats", something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats" and display_id "dancing-naked-mole-rats"

View File

@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class ENSSavoirsIE(InfoExtractor):
    """Extractor for lecture pages on savoirs.ens.fr (``expose.php?id=N``).

    The numeric ``id`` query parameter is used as the video id.  The media
    URL and the title are scraped from the page HTML; the thumbnail URL is
    built from the video id.
    """

    _VALID_URL = r'https?://(?:www\.)?savoirs\.ens\.fr/expose\.php\?id=(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://savoirs.ens.fr/expose.php?id=2516#foo',
        'md5': 'a95f51212dd2d104fb5655bdf1d03071',
        'info_dict': {
            'id': '2516',
            'ext': 'mp4',
            'format_id': 'mp4',
            'title': 'Chiffrer mieux pour (dé)chiffrer plus',
            'thumbnail': r're:^https?://(?:www\.)?savoirs\.ens\.fr/uploads/images/exposes/2516\.jpg$',
            'creator': 'Anne Canteaut',
            # TODO: not extracted yet, so not checked here:
            #   - detailed format info (tbr, fps, codecs, resolution, size)
            #   - release_date (20160413 for this video)
            #   - description (text of <div id=description>)
        }
    }]

    def _real_extract(self, url):
        """Scrape the lecture page at *url* and return its info dict.

        The returned dict carries ``id``, ``url``, ``title``, ``ext``,
        ``format_id``, ``thumbnail`` and ``creator``.  The media link and
        the title are looked up without a default value; the creator falls
        back to the site name when the speaker element is not found.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Reuse the scheme of the page that was requested instead of
        # emitting scheme-less ("//host/...") URLs.
        scheme = 'https' if url.startswith('https:') else 'http'
        url_base = '%s://savoirs.ens.fr/' % scheme

        media_path = self._html_search_regex(
            r'<a[^>]+href=["\'](?P<media_url>[^"\']+)["\'][^>]*>Télécharger\sla\svidéo</a>',
            webpage, 'media_url')
        # The download link is expected to be relative to the site root;
        # leave it untouched should it already be an absolute URL.
        if media_path.startswith(('http://', 'https://')):
            media_url = media_path
        else:
            media_url = url_base + media_path

        title = self._html_search_regex(
            r'<span[^>]+class=["\']titrePageExpose["\'][^>]*>(?P<title>[^<]+)</span>',
            webpage, 'title')

        creator = self._html_search_regex(
            r'<span[^>]+class=["\']exposeConferencierNom["\'][^>]*>(?P<creator>[^<]+)</span>',
            webpage, 'creator', default="ENS Savoirs")

        # Take the extension from the path only, so that a query string or
        # fragment on the media URL does not end up in it.
        media_file = media_url.split('?', 1)[0].split('#', 1)[0]
        ext = media_file.rpartition('.')[2]

        thumbnail = '%suploads/images/exposes/%s.jpg' % (url_base, video_id)

        return {
            'id': video_id,
            'url': media_url,
            'title': title,
            # TODO: 'description' needs the text of <div id=description>
            # with its HTML tags removed; the og:description lookup fails.
            'ext': ext,
            'format_id': ext,
            'thumbnail': thumbnail,
            'creator': creator,
            # TODO: 'release_date' is not extracted yet.
        }

View File

@ -315,6 +315,7 @@ from .ellentv import (
from .elpais import ElPaisIE from .elpais import ElPaisIE
from .embedly import EmbedlyIE from .embedly import EmbedlyIE
from .engadget import EngadgetIE from .engadget import EngadgetIE
from .enssavoirs import ENSSavoirsIE
from .eporner import EpornerIE from .eporner import EpornerIE
from .eroprofile import EroProfileIE from .eroprofile import EroProfileIE
from .escapist import EscapistIE from .escapist import EscapistIE