From 30787f7259c4e6a08f691cc691f14fa0c8fe4b87 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 3 Oct 2015 19:28:48 +0100 Subject: [PATCH 1/4] [cspan] correct the clip info extraction --- youtube_dl/extractor/cspan.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..994e080d5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -18,22 +18,21 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '067803f994e049b455a58b16e5aab442', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. 
Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { @@ -44,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,36 +56,33 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) - - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - files = data['video']['files'] try: capfile = data['video']['capfile']['#text'] @@ -112,12 +108,12 @@ class CSpanIE(InfoExtractor): if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } From 355c7ad361aa3c8a57ff83e3f702a496dce59e65 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 17 Oct 2015 21:30:38 +0100 Subject: [PATCH 2/4] [cspan] handle error messages and extract qualities --- youtube_dl/extractor/cspan.py | 67 +++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 994e080d5..c74b35fd9 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,16 +9,21 @@ from ..utils import ( 
find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE +def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') + + class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '067803f994e049b455a58b16e5aab442', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', @@ -28,7 +33,7 @@ class CSpanIE(InfoExtractor): 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -37,7 +42,7 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', @@ -71,8 +76,10 @@ class CSpanIE(InfoExtractor): return self.url_result(surl, 'SenateISVP', video_id, title) data = self._download_json( - 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), - video_id) + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -83,28 +90,36 @@ class CSpanIE(InfoExtractor): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - files = data['video']['files'] - try: - capfile = 
data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) From 4bf56141950f3c24000381403417d20095f04460 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 20 Oct 2015 07:43:39 +0100 Subject: [PATCH 3/4] [cspan] move get_text_attr to CSpanIE --- youtube_dl/extractor/cspan.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index c74b35fd9..388460a32 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -14,10 +14,6 @@ from ..utils import ( from 
.senateisvp import SenateISVPIE -def get_text_attr(d, attr): - return d.get(attr, {}).get('#text') - - class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' @@ -60,6 +56,9 @@ class CSpanIE(InfoExtractor): } }] + def get_text_attr(self, d, attr): + return d.get(attr, {}).get('#text') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -79,7 +78,7 @@ class CSpanIE(InfoExtractor): 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) + raise ExtractorError('%s said: %s' % (self.IE_NAME, self.get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -91,17 +90,17 @@ class CSpanIE(InfoExtractor): thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] - capfile = get_text_attr(data, 'capfile') + capfile = self.get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: formats.append({ - 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), - 'url': unescapeHTML(get_text_attr(quality, 'file')), - 'height': int_or_none(get_text_attr(quality, 'height')), - 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + 'format_id': '%s-%sp' % (self.get_text_attr(quality, 'bitrate'), self.get_text_attr(quality, 'height')), + 'url': unescapeHTML(self.get_text_attr(quality, 'file')), + 'height': int_or_none(self.get_text_attr(quality, 'height')), + 'tbr': int_or_none(self.get_text_attr(quality, 'bitrate')), }) self._sort_formats(formats) entries.append({ @@ -112,7 +111,7 @@ class CSpanIE(InfoExtractor): 'formats': 
formats, 'description': description, 'thumbnail': thumbnail, - 'duration': int_or_none(get_text_attr(f, 'length')), + 'duration': int_or_none(self.get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile, From 2a776f978849e0c66f70133747e7fd244f516f7f Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 28 Nov 2015 20:22:31 +0100 Subject: [PATCH 4/4] [cspan] change into a function --- youtube_dl/extractor/cspan.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 388460a32..7b685d157 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -56,9 +56,6 @@ class CSpanIE(InfoExtractor): } }] - def get_text_attr(self, d, attr): - return d.get(attr, {}).get('#text') - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -74,11 +71,14 @@ class CSpanIE(InfoExtractor): surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) + def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') + data = self._download_json( 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, self.get_text_attr(data, 'error')), expected=True) + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -90,17 +90,17 @@ class CSpanIE(InfoExtractor): thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] - capfile = self.get_text_attr(data, 'capfile') + capfile = get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: 
formats.append({ - 'format_id': '%s-%sp' % (self.get_text_attr(quality, 'bitrate'), self.get_text_attr(quality, 'height')), - 'url': unescapeHTML(self.get_text_attr(quality, 'file')), - 'height': int_or_none(self.get_text_attr(quality, 'height')), - 'tbr': int_or_none(self.get_text_attr(quality, 'bitrate')), + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), }) self._sort_formats(formats) entries.append({ @@ -111,7 +111,7 @@ class CSpanIE(InfoExtractor): 'formats': formats, 'description': description, 'thumbnail': thumbnail, - 'duration': int_or_none(self.get_text_attr(f, 'length')), + 'duration': int_or_none(get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile,