Patch summary (free text ahead of the first file header):
  * common.py: InfoExtractor.url_result() accepts an optional video_duration
    and, when it is not None, stores it under the 'duration' key.
  * youtube.py: extract_videos_from_page()/extract_videos_from_page_impl()
    additionally collect a per-entry duration and the video id a playlist
    link was found on; _process_page() emits 'Youtube' results for
    11-character ids and 'YoutubePlaylist' results for longer ids.
  * youtube.py: the search _VIDEO_RE gains optional plid, plcounter and
    duration groups.

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 021945a89..f5bb2d521 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -946,7 +946,7 @@ class InfoExtractor(object):
 
     # Methods for following #608
     @staticmethod
-    def url_result(url, ie=None, video_id=None, video_title=None):
+    def url_result(url, ie=None, video_id=None, video_title=None, video_duration=None):
         """Returns a URL that points to a page that should be processed"""
         # TODO: ie should be the class used for getting the info
         video_info = {'_type': 'url',
@@ -956,6 +956,8 @@ class InfoExtractor(object):
             video_info['id'] = video_id
         if video_title is not None:
             video_info['title'] = video_title
+        if video_duration is not None:
+            video_info['duration'] = video_duration
         return video_info
 
     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index bd1515380..3dc1daa6e 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -326,35 +326,56 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
 
 class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
     def _process_page(self, content):
-        for video_id, video_title in self.extract_videos_from_page(content):
-            yield self.url_result(video_id, 'Youtube', video_id, video_title)
+        for video_id, video_title, video_duration, playlist_video_id in self.extract_videos_from_page(content):
+            if len(video_id) == 11:
+                # Youtube video id found
+                yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration)
+            elif len(video_id) > 11:
+                # Youtube playlist id found
+                yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (playlist_video_id, video_id), 'YoutubePlaylist', video_id, video_title, video_duration)
 
-    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
+    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page):
         for mobj in re.finditer(video_re, page):
             # The link with index 0 is not the first video of the playlist (not sure if still actual)
             if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                 continue
-            video_id = mobj.group('id')
-            video_title = unescapeHTML(
-                mobj.group('title')) if 'title' in mobj.groupdict() else None
+            video_id_original = mobj.group('id')
+            video_id = video_id_original
+            playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None
+            if playlist_id is not None:
+                video_id = playlist_id
+            video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None
             if video_title:
                 video_title = video_title.strip()
             if video_title == '► Play all':
                 video_title = None
+            video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None
+            playlist_count = mobj.group('plcounter') if 'plcounter' in mobj.groupdict() else None
+            if playlist_id is not None and playlist_count is not None:
+                video_duration = playlist_count
+            if video_duration:
+                video_duration = video_duration.strip()
             try:
                 idx = ids_in_page.index(video_id)
                 if video_title and not titles_in_page[idx]:
                     titles_in_page[idx] = video_title
+                if video_duration and not durations_in_page[idx]:
+                    durations_in_page[idx] = video_duration
+                if playlist_id is not None and not playlist_video_id_in_page[idx]:
+                    playlist_video_id_in_page[idx] = video_id_original
             except ValueError:
                 ids_in_page.append(video_id)
                 titles_in_page.append(video_title)
+                durations_in_page.append(video_duration)
+                playlist_video_id_in_page.append(video_id_original)
 
     def extract_videos_from_page(self, page):
         ids_in_page = []
         titles_in_page = []
-        self.extract_videos_from_page_impl(
-            self._VIDEO_RE, page, ids_in_page, titles_in_page)
-        return zip(ids_in_page, titles_in_page)
+        playlist_video_id_in_page = []
+        durations_in_page = []
+        self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
+        return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
 
 
 class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
@@ -2764,6 +2785,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
     def extract_videos_from_page(self, page):
         ids_in_page = []
         titles_in_page = []
+        durations_in_page = []
+        playlist_video_id_in_page = []
 
         for item in re.findall(
                 r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
@@ -2774,20 +2797,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                 video_title = video_title.strip()
             ids_in_page.append(video_id)
             titles_in_page.append(video_title)
+            # TODO: ADD VIDEO DURATION HERE TOO?
+            durations_in_page.append(None)
+            playlist_video_id_in_page.append(None)
 
         # Fallback with old _VIDEO_RE
         self.extract_videos_from_page_impl(
-            self._VIDEO_RE, page, ids_in_page, titles_in_page)
+            self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
 
         # Relaxed fallbacks
         self.extract_videos_from_page_impl(
             r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
-            ids_in_page, titles_in_page)
+            ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
         self.extract_videos_from_page_impl(
             r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
-            ids_in_page, titles_in_page)
+            ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
 
-        return zip(ids_in_page, titles_in_page)
+        return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
 
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
@@ -3171,7 +3197,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
 
 
 class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link|href="\s*/watch)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?'
 
 
 class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):