[ytsearch] Add support for returning correct playlist results when searching for playlists. Add the playlist video count to playlist results. Add the video duration to video results.

This commit is contained in:
Crypto90 2020-10-24 12:09:12 +02:00
parent 416da574ec
commit 4dfb0763ba
2 changed files with 43 additions and 15 deletions

View File

@ -946,7 +946,7 @@ class InfoExtractor(object):
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
def url_result(url, ie=None, video_id=None, video_title=None, video_duration=None):
"""Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
@ -956,6 +956,8 @@ class InfoExtractor(object):
video_info['id'] = video_id
if video_title is not None:
video_info['title'] = video_title
if video_duration is not None:
video_info['duration'] = video_duration
return video_info
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):

View File

@ -326,35 +326,56 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
def _process_page(self, content):
for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title)
for video_id, video_title, video_duration, playlist_video_id in self.extract_videos_from_page(content):
if len(video_id) == 11:
# Youtube video id found
yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration)
elif len(video_id) > 11:
# Youtube playlist id found
yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (playlist_video_id, video_id), 'YoutubePlaylist', video_id, video_title, video_duration)
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page):
for mobj in re.finditer(video_re, page):
# The link with index 0 is not the first video of the playlist (not sure if still actual)
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
continue
video_id = mobj.group('id')
video_title = unescapeHTML(
mobj.group('title')) if 'title' in mobj.groupdict() else None
video_id_original = mobj.group('id')
video_id = video_id_original
playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None
if playlist_id is not None:
video_id = playlist_id
video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title:
video_title = video_title.strip()
if video_title == '► Play all':
video_title = None
video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None
playlist_count = mobj.group('plcounter') if 'plcounter' in mobj.groupdict() else None
if playlist_id is not None and playlist_count is not None:
video_duration = playlist_count
if video_duration:
video_duration = video_duration.strip()
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
titles_in_page[idx] = video_title
if video_duration and not durations_in_page[idx]:
durations_in_page[idx] = video_duration
if playlist_id is not None and not playlist_video_id_in_page[idx]:
playlist_video_id_in_page[idx] = video_id_original
except ValueError:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
durations_in_page.append(video_duration)
playlist_video_id_in_page.append(video_id_original)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
playlist_video_id_in_page = []
durations_in_page = []
self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
@ -2764,6 +2785,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
durations_in_page = []
playlist_video_id_in_page
for item in re.findall(
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
@ -2774,20 +2797,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
video_title = video_title.strip()
ids_in_page.append(video_id)
titles_in_page.append(video_title)
# TODO: ADD VIDEO DURATION HERE TOO?
durations_in_page.append(None)
playlist_video_id_in_page.append(None)
# Fallback with old _VIDEO_RE
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
# Relaxed fallbacks
self.extract_videos_from_page_impl(
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
self.extract_videos_from_page_impl(
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
ids_in_page, titles_in_page)
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
return zip(ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
@ -3171,7 +3197,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&amp;list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link|href="\s*/watch)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?'
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):