[SenateFloorGranicus] Add new extractor

2024-11-22 16:44:32 +01:00 · 2019-08-20 19:14:13 -07:00 · 2019-08-20 19:14:13 -07:00 · 71b4157df3
commit 71b4157df3
parent 820215f0e3
2 changed files with 78 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -994,6 +994,7 @@ from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .scrippsnetworks import ScrippsNetworksWatchIE
 from .seeker import SeekerIE
 from .senatefloor import SenateFloorGranicusIE
 from .senateisvp import SenateISVPIE
 from .sendtonews import SendtoNewsIE
 from .servus import ServusIE
--- a/youtube_dl/extractor/senatefloor.py
+++ b/youtube_dl/extractor/senatefloor.py
@ -0,0 +1,77 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class SenateFloorGranicusIE(InfoExtractor):
    """Extractor for U.S. Senate floor videos hosted by granicus.com.

    Three kinds of URL are accepted; each is reduced to the next one:

    1. Clip URLs carrying ``view_id`` and ``clip_id`` query parameters, e.g.
       https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385
       These are redirected to the matching player page (2).
    2. Player pages, e.g. https://floor.senate.gov/videos/3385/player
       (the iframe src of the embedded video).  The page is downloaded and
       the granicus stream reference found in it is turned into a direct
       mp4 URL (3).
    3. Direct mp4 URLs on archive-media.granicus.com, which are returned
       as-is.
    """

    # Only the direct mp4 alternative defines the ``id`` group; the other two
    # alternatives expose ``clip_id`` / ``player_id`` so that _real_extract
    # can dispatch on whichever group matched.
    _VALID_URL = r'''(?x)
        https?://
        (?:
            archive-media\.granicus\.com.*?/OnDemand/[0-9a-z-]+/(?P<id>[0-9a-z-]+_[0-9a-f-]+)\.mp4|
            floor\.senate\.gov/
            (?:
                .*?[?]view_id=[0-9]+&clip_id=(?P<clip_id>[0-9]+)|
                videos/(?P<player_id>[0-9]+)/player
            )
        )'''
    _TESTS = [{
        'url': 'http://archive-media.granicus.com:443/OnDemand/senate/senate_ff605d76-86c3-4e8d-9991-9f32efd782de.mp4',
        'info_dict': {
            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
            'ext': 'mp4',
            'title': 'The United States Senate',
        },
    }, {
        'url': 'https://floor.senate.gov/videos/3385/player',
        'info_dict': {
            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
            'ext': 'mp4',
            'title': 'Senate Floor Proceedings - 2019-08-13',
        },
    }, {
        'url': 'https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385',
        'info_dict': {
            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
            'ext': 'mp4',
            'title': 'Senate Floor Proceedings - 2019-08-13',
        },
    }, {
        'url': 'http://floor.senate.gov/ASX.php?view_id=2&clip_id=3385&sn=floor.senate.gov',
        'info_dict': {
            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
            'ext': 'mp4',
            'title': 'Senate Floor Proceedings - 2019-08-13',
        },
    }]

    def _real_extract(self, url):
        """Resolve *url* one step towards the downloadable mp4.

        Returns a ``url`` result for clip URLs, a ``url_transparent`` result
        (carrying the page title and the real video id) for player pages,
        and a plain info dict for direct mp4 URLs.
        """
        default_title = 'The United States Senate'
        mobj = re.match(self._VALID_URL, url)

        clip_id = mobj.group('clip_id')
        if clip_id:
            # https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385
            # http://floor.senate.gov/ASX.php?view_id=2&clip_id=894&sn=floor.senate.gov
            # %-formatting: auto-numbered '{}' fields raise ValueError on
            # Python 2.6.
            return {
                '_type': 'url',
                'url': 'https://floor.senate.gov/videos/%s/player' % clip_id,
            }

        player_id = mobj.group('player_id')
        if player_id:
            # https://floor.senate.gov/videos/3385/player
            # Dispatching on the matched group (rather than on how the URL
            # ends) keeps player URLs with a trailing query string or
            # fragment out of the direct mp4 branch below.
            webpage = self._download_webpage(url, player_id)
            title = self._html_search_regex(
                r'<title>(.+?)</title>', webpage, 'title',
                default=default_title)
            video_id = self._html_search_regex(
                r'src="//archive-stream\.granicus\.com/OnDemand/_definst_/mp4:[0-9a-z-]+/(?P<id>[0-9a-z-]+_[0-9a-f-]+)\.mp4',
                webpage, 'id')
            # The directory on the archive host is taken from the part of
            # the video id before the first underscore
            # (e.g. 'senate' for 'senate_ff605d76-...').
            return {
                '_type': 'url_transparent',
                'url': 'http://archive-media.granicus.com:443/OnDemand/%s/%s.mp4' % (
                    video_id.split('_')[0], video_id),
                'id': video_id,
                'title': title,
            }

        # Neither clip_id nor player_id matched, so this is a direct mp4 URL
        # and the 'id' group is populated.
        return {
            'url': url,
            'id': mobj.group('id'),
            'title': default_title,
            # TODO more properties? (see youtube_dl/extractor/common.py)
        }