[SenateFloorGranicus] Add new extractor

This commit is contained in:
Barbara Miller 2019-08-20 19:14:13 -07:00
parent 820215f0e3
commit 71b4157df3
2 changed files with 78 additions and 0 deletions

View File

@ -994,6 +994,7 @@ from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE
from .scrippsnetworks import ScrippsNetworksWatchIE
from .seeker import SeekerIE
from .senatefloor import SenateFloorGranicusIE
from .senateisvp import SenateISVPIE
from .sendtonews import SendtoNewsIE
from .servus import ServusIE

View File

@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class SenateFloorGranicusIE(InfoExtractor):
"""extractor for videos at https://www.senate.gov/floor/ hosted by
granicus.com
granicus.com urls are present in content at urls like
https://floor.senate.gov/videos/3385/player
which is the iframe src url for the embedded video at
https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3388 )
"""
_VALID_URL = r'https?://(?:archive-media.granicus.com.*?/OnDemand/[0-9a-z-]+/(?P<id>[0-9a-z-]+_[0-9a-f-]+).mp4|floor.senate.gov/(?:.*?[?]view_id=(?:[0-9]+)&clip_id=(?P<clip_id>[0-9]+).*|videos/[0-9]+/player))'
_TESTS = [
{'url': 'http://archive-media.granicus.com:443/OnDemand/senate/senate_ff605d76-86c3-4e8d-9991-9f32efd782de.mp4',
'info_dict': {
'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
'ext': 'mp4',
'title': 'The United States Senate'}
},
{'url': 'https://floor.senate.gov/videos/3385/player',
'info_dict': {
'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
'ext': 'mp4',
'title': 'Senate Floor Proceedings - 2019-08-13'}
},
{'url': 'https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385',
'info_dict': {
'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
'ext': 'mp4',
'title': 'Senate Floor Proceedings - 2019-08-13'}
},
{'url': 'http://floor.senate.gov/ASX.php?view_id=2&clip_id=3385&sn=floor.senate.gov',
'info_dict': {
'id': 'senate_ff605d76-86c3-4e8d-9991-9f32efd782de',
'ext': 'mp4',
'title': 'Senate Floor Proceedings - 2019-08-13'}
}
]
def _real_extract(self, url):
video_id = self._match_id(url)
title = 'The United States Senate'
m = re.match(self._VALID_URL, url)
if m and m.group('clip_id'):
# https://floor.senate.gov/MediaPlayer.php?view_id=2&clip_id=3385
# http://floor.senate.gov/ASX.php?view_id=2&clip_id=894&sn=floor.senate.gov
return {
'_type': 'url',
'url': 'https://floor.senate.gov/videos/{}/player'.format(m.group('clip_id')),
}
elif url.endswith('player'):
# https://floor.senate.gov/videos/3385/player
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title',
default=title)
video_id = self._html_search_regex(
r'src="//archive-stream.granicus.com/OnDemand/_definst_/mp4:[0-9a-z-]+/(?P<id>[0-9a-z-]+_[0-9a-f-]+).mp4',
webpage, 'id')
return {
'_type': 'url_transparent',
'url': 'http://archive-media.granicus.com:443/OnDemand/{}/{}.mp4'.format(video_id.split('_')[0], video_id),
'id': video_id,
'title': title,
}
# we found an mp4!
return {
'url': url,
'id': video_id,
'title': title
# TODO more properties? (see youtube_dl/extractor/common.py)
}