Fix MIT extractor for Python 2.6

The HTML of the MIT page does not parse cleanly under Python 2.6 because
it contains script tags nested inside an actual script element.  The
offending markup sits inside an HTML comment block, so stripping all
comment blocks from the page before parsing it fixes the problem.
This commit is contained in:
Jeff Smith 2013-08-28 14:00:59 -05:00
parent 2891932bf0
commit b5ba7b9dcf
1 changed file with 7 additions and 9 deletions

View File

@@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage( raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id) 'http://techtv.mit.edu/videos/%s' % video_id, video_id)
embed_page = self._download_webpage( clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
note=u'Downloading embed page')
base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
embed_page, u'base url') raw_page, u'base url')
formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
u'video formats') u'video formats')
formats = json.loads(formats_json) formats = json.loads(formats_json)
formats = sorted(formats, key=lambda f: f['bitrate']) formats = sorted(formats, key=lambda f: f['bitrate'])
title = get_element_by_id('edit-title', webpage) title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', webpage)) description = clean_html(get_element_by_id('edit-description', clean_page))
thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
embed_page, u'thumbnail', flags=re.DOTALL) raw_page, u'thumbnail', flags=re.DOTALL)
return {'id': video_id, return {'id': video_id,
'title': title, 'title': title,