[SenateFloorGranicus] Add new extractor

2024-11-22 16:44:32 +01:00 · 2019-08-20 19:14:13 -07:00 · 2019-08-20 19:14:13 -07:00 · 71b4157df3
commit 71b4157df3
parent 820215f0e3
2 changed files with 78 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -994,6 +994,7 @@ from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .scrippsnetworks import ScrippsNetworksWatchIE
 from .seeker import SeekerIE
+from .senatefloor import SenateFloorGranicusIE
 from .senateisvp import SenateISVPIE
 from .sendtonews import SendtoNewsIE
 from .servus import ServusIE
--- a/youtube_dl/extractor/senatefloor.py
+++ b/youtube_dl/extractor/senatefloor.py
@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+
+
+class SenateFloorGranicusIE(InfoExtractor):
+    """Extractor for US Senate floor videos hosted by granicus.com.
+
+    Three kinds of URL are accepted:
+    * a direct granicus mp4 URL, returned as-is with a generic title;
+    * a player page such as https://floor.senate.gov/videos/3385/player
+      (the iframe src of the embedded video), which is downloaded to find
+      the page title and the granicus video id;
+    * a clip URL such as
+      https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385
+      which is handed over to the matching player page.
+    """
+
+    _VALID_URL = r'''(?x)https?://(?:
+        archive-media\.granicus\.com(?::[0-9]+)?/OnDemand/[0-9a-z-]+/
+            (?P<id>[0-9a-z-]+_[0-9a-f-]+)\.mp4|
+        floor\.senate\.gov/(?:
+            .*?[?]view_id=[0-9]+&clip_id=(?P<clip_id>[0-9]+)|
+            videos/(?P<player_id>[0-9]+)/player))'''
+    _TESTS = [{
+        'url': 'http://archive-media.granicus.com:443/OnDemand/senate/senate_ff605d76-86c3-4e8d-9991-9f32efd782de.mp4',
+        'info_dict': {
+            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
+            'ext': 'mp4',
+            'title': 'The United States Senate',
+        },
+    }, {
+        'url': 'https://floor.senate.gov/videos/3385/player',
+        'info_dict': {
+            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
+            'ext': 'mp4',
+            'title': 'Senate Floor Proceedings - 2019-08-13',
+        },
+    }, {
+        'url': 'https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385',
+        'info_dict': {
+            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
+            'ext': 'mp4',
+            'title': 'Senate Floor Proceedings - 2019-08-13',
+        },
+    }, {
+        'url': 'http://floor.senate.gov/ASX.php?view_id=2&clip_id=3385&sn=floor.senate.gov',
+        'info_dict': {
+            'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
+            'ext': 'mp4',
+            'title': 'Senate Floor Proceedings - 2019-08-13',
+        },
+    }]
+
+    def _real_extract(self, url):
+        # Exactly one of the three named groups is set; it names the URL kind.
+        mobj = re.match(self._VALID_URL, url)
+        video_id, clip_id, player_id = mobj.group('id', 'clip_id', 'player_id')
+        title = 'The United States Senate'
+        if clip_id:
+            # Clip URLs carry no media themselves; defer to the player page.
+            player_url = 'https://floor.senate.gov/videos/%s/player' % clip_id
+            return {'_type': 'url', 'url': player_url}
+        if player_id:
+            webpage = self._download_webpage(url, player_id)
+            title = self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'title', default=title)
+            video_id = self._html_search_regex(
+                r'src="//archive-stream\.granicus\.com/OnDemand/_definst_/mp4:[0-9a-z-]+/(?P<id>[0-9a-z-]+_[0-9a-f-]+)\.mp4',
+                webpage, 'id')
+            # %-formatting: auto-numbered '{}' fields need Python 2.7 or later.
+            url = 'http://archive-media.granicus.com:443/OnDemand/%s/%s.mp4' % (
+                video_id.split('_')[0], video_id)
+            return {'_type': 'url_transparent', 'url': url, 'id': video_id, 'title': title}
+        return {'url': url, 'id': video_id, 'title': title}