import datetime import itertools import json import re from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_urllib_parse, ExtractorError, ) class YahooIE(InfoExtractor): IE_DESC = u'Yahoo screen' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' _TEST = { u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', u'file': u'214727115.flv', u'md5': u'2e717f169c1be93d84d3794a00d4a325', u'info_dict': { u"title": u"Julian Smith & Travis Legg Watch Julian Smith" }, u'skip': u'Requires rtmpdump' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) if m_id is None: # TODO: Check which url parameters are required info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* .*?)\]\]>.* .*?)\ .*\]\]>.* = n: break mobj = re.search(r'(?Pscreen\.yahoo\.com/.*?-\d*?\.html)"', r) e = self.url_result('http://' + mobj.group('url'), 'Yahoo') res['entries'].append(e) if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): break return res