[ytsearch] Add support for returning correct playlist results when searching for playlists. Add the playlist video count to playlist results. Add the video duration to video results.
This commit is contained in:
parent
416da574ec
commit
4dfb0763ba
|
@ -946,7 +946,7 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
# Methods for following #608
|
# Methods for following #608
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def url_result(url, ie=None, video_id=None, video_title=None):
|
def url_result(url, ie=None, video_id=None, video_title=None, video_duration=None):
|
||||||
"""Returns a URL that points to a page that should be processed"""
|
"""Returns a URL that points to a page that should be processed"""
|
||||||
# TODO: ie should be the class used for getting the info
|
# TODO: ie should be the class used for getting the info
|
||||||
video_info = {'_type': 'url',
|
video_info = {'_type': 'url',
|
||||||
|
@ -956,6 +956,8 @@ class InfoExtractor(object):
|
||||||
video_info['id'] = video_id
|
video_info['id'] = video_id
|
||||||
if video_title is not None:
|
if video_title is not None:
|
||||||
video_info['title'] = video_title
|
video_info['title'] = video_title
|
||||||
|
if video_duration is not None:
|
||||||
|
video_info['duration'] = video_duration
|
||||||
return video_info
|
return video_info
|
||||||
|
|
||||||
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
|
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
|
||||||
|
|
|
@ -326,35 +326,56 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
||||||
def _process_page(self, content):
|
def _process_page(self, content):
|
||||||
for video_id, video_title in self.extract_videos_from_page(content):
|
for video_id, video_title, video_duration, playlist_video_id in self.extract_videos_from_page(content):
|
||||||
yield self.url_result(video_id, 'Youtube', video_id, video_title)
|
if len(video_id) == 11:
|
||||||
|
# Youtube video id found
|
||||||
|
yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration)
|
||||||
|
elif len(video_id) > 11:
|
||||||
|
# Youtube playlist id found
|
||||||
|
yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (playlist_video_id, video_id), 'YoutubePlaylist', video_id, video_title, video_duration)
|
||||||
|
|
||||||
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
|
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page):
|
||||||
for mobj in re.finditer(video_re, page):
|
for mobj in re.finditer(video_re, page):
|
||||||
# The link with index 0 is not the first video of the playlist (not sure if still actual)
|
# The link with index 0 is not the first video of the playlist (not sure if still actual)
|
||||||
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
|
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
|
||||||
continue
|
continue
|
||||||
video_id = mobj.group('id')
|
video_id_original = mobj.group('id')
|
||||||
video_title = unescapeHTML(
|
video_id = video_id_original
|
||||||
mobj.group('title')) if 'title' in mobj.groupdict() else None
|
playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None
|
||||||
|
if playlist_id is not None:
|
||||||
|
video_id = playlist_id
|
||||||
|
video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None
|
||||||
if video_title:
|
if video_title:
|
||||||
video_title = video_title.strip()
|
video_title = video_title.strip()
|
||||||
if video_title == '► Play all':
|
if video_title == '► Play all':
|
||||||
video_title = None
|
video_title = None
|
||||||
|
video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None
|
||||||
|
playlist_count = mobj.group('plcounter') if 'plcounter' in mobj.groupdict() else None
|
||||||
|
if playlist_id is not None and playlist_count is not None:
|
||||||
|
video_duration = playlist_count
|
||||||
|
if video_duration:
|
||||||
|
video_duration = video_duration.strip()
|
||||||
try:
|
try:
|
||||||
idx = ids_in_page.index(video_id)
|
idx = ids_in_page.index(video_id)
|
||||||
if video_title and not titles_in_page[idx]:
|
if video_title and not titles_in_page[idx]:
|
||||||
titles_in_page[idx] = video_title
|
titles_in_page[idx] = video_title
|
||||||
|
if video_duration and not durations_in_page[idx]:
|
||||||
|
durations_in_page[idx] = video_duration
|
||||||
|
if playlist_id is not None and not playlist_video_id_in_page[idx]:
|
||||||
|
playlist_video_id_in_page[idx] = video_id_original
|
||||||
except ValueError:
|
except ValueError:
|
||||||
ids_in_page.append(video_id)
|
ids_in_page.append(video_id)
|
||||||
titles_in_page.append(video_title)
|
titles_in_page.append(video_title)
|
||||||
|
durations_in_page.append(video_duration)
|
||||||
|
playlist_video_id_in_page.append(video_id_original)
|
||||||
|
|
||||||
def extract_videos_from_page(self, page):
|
def extract_videos_from_page(self, page):
|
||||||
ids_in_page = []
|
ids_in_page = []
|
||||||
titles_in_page = []
|
titles_in_page = []
|
||||||
self.extract_videos_from_page_impl(
|
playlist_video_id_in_page = []
|
||||||
self._VIDEO_RE, page, ids_in_page, titles_in_page)
|
durations_in_page = []
|
||||||
return zip(ids_in_page, titles_in_page)
|
self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
|
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
|
|
||||||
|
|
||||||
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
||||||
|
@ -2764,6 +2785,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
|
||||||
def extract_videos_from_page(self, page):
|
def extract_videos_from_page(self, page):
|
||||||
ids_in_page = []
|
ids_in_page = []
|
||||||
titles_in_page = []
|
titles_in_page = []
|
||||||
|
durations_in_page = []
|
||||||
|
playlist_video_id_in_page = []  # parallel to ids_in_page; must be initialised before the append/zip below
|
||||||
|
|
||||||
for item in re.findall(
|
for item in re.findall(
|
||||||
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
|
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
|
||||||
|
@ -2774,20 +2797,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
|
||||||
video_title = video_title.strip()
|
video_title = video_title.strip()
|
||||||
ids_in_page.append(video_id)
|
ids_in_page.append(video_id)
|
||||||
titles_in_page.append(video_title)
|
titles_in_page.append(video_title)
|
||||||
|
# TODO: ADD VIDEO DURATION HERE TOO?
|
||||||
|
durations_in_page.append(None)
|
||||||
|
playlist_video_id_in_page.append(None)
|
||||||
|
|
||||||
# Fallback with old _VIDEO_RE
|
# Fallback with old _VIDEO_RE
|
||||||
self.extract_videos_from_page_impl(
|
self.extract_videos_from_page_impl(
|
||||||
self._VIDEO_RE, page, ids_in_page, titles_in_page)
|
self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
|
|
||||||
# Relaxed fallbacks
|
# Relaxed fallbacks
|
||||||
self.extract_videos_from_page_impl(
|
self.extract_videos_from_page_impl(
|
||||||
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
|
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
|
||||||
ids_in_page, titles_in_page)
|
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
self.extract_videos_from_page_impl(
|
self.extract_videos_from_page_impl(
|
||||||
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
|
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
|
||||||
ids_in_page, titles_in_page)
|
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
|
|
||||||
return zip(ids_in_page, titles_in_page)
|
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||||
|
|
||||||
def _extract_mix(self, playlist_id):
|
def _extract_mix(self, playlist_id):
|
||||||
# The mixes are generated from a single video
|
# The mixes are generated from a single video
|
||||||
|
@ -3171,7 +3197,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
|
||||||
|
|
||||||
|
|
||||||
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
|
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
|
||||||
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
|
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link|href="\s*/watch)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?'
|
||||||
|
|
||||||
|
|
||||||
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
|
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
|
||||||
|
|
Loading…
Reference in New Issue