1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-11-26 10:24:33 +01:00

[simplecast] Add new extractor

This commit is contained in:
dmsummers 2020-02-20 14:33:05 -06:00
parent 97c822b3d5
commit 08c47b7af9
3 changed files with 338 additions and 0 deletions

View File

@ -991,6 +991,12 @@ from .shared import (
from .showroomlive import ShowRoomLiveIE from .showroomlive import ShowRoomLiveIE
from .sina import SinaIE from .sina import SinaIE
from .sixplay import SixPlayIE from .sixplay import SixPlayIE
from .simplecast import (
SimplecastIE,
SimplecastEmbedIE,
SimplecastEpisodeIE,
SimplecastPodcastIE,
)
from .skylinewebcams import SkylineWebcamsIE from .skylinewebcams import SkylineWebcamsIE
from .skynewsarabia import ( from .skynewsarabia import (
SkyNewsArabiaIE, SkyNewsArabiaIE,

View File

@ -2659,6 +2659,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for simplecast embeds
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:embed|player)\.simplecast\.com/[^"]+)"', webpage)
if matches:
return self.playlist_from_matches(matches, ie='SimplecastEmbed')
# Look for BBC iPlayer embed # Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches: if matches:

View File

@ -0,0 +1,327 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_unquote_to_bytes,
)
from ..utils import (
    clean_html,
    ExtractorError,
    int_or_none,
    OnDemandPagedList,
    strip_or_none,
    unified_timestamp,
    url_or_none,
)
class SimplecastIE(InfoExtractor):
    """Extractor for a single episode addressed by its Simplecast API URL.

    The API URL itself is downloaded as JSON and mapped onto an info dict.
    The other Simplecast extractors in this module resolve their URLs to
    this form and delegate here.
    """
    IE_NAME = 'simplecast'
    _VALID_URL = r'https://api\.simplecast\.com/episodes/(?P<id>[^?/]+)'
    _TEST = {
        'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': {
            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'ext': 'mp3',
            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
            'thumbnail': r're:^https?://.*\.jpg$',
            'episode_number': 1,
            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
            'season_number': 1,
            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
            'series': 'The RE:BIND.io Podcast',
            'duration': 5343,
            'timestamp': 1580979475,
            'upload_date': '20200206',
            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
            'channel_url': r're:^https://.+\.simplecast.com$',
        }
    }

    def _real_extract(self, url):
        """Download the episode JSON at *url* and build the info dict.

        Raises ExtractorError (expected) when the API answers with a 404
        status document.  ``id``, ``title`` and ``audio_file_url`` are
        mandatory keys of the response; everything else is optional.
        """
        display_id = self._match_id(url)

        # A 404 is tolerated at the HTTP level so that the JSON error
        # document can be inspected and turned into a readable message.
        meta = self._download_json(
            url, display_id,
            expected_status=404)
        if meta.get('status') == 404:
            raise ExtractorError(
                'The requested episode could not be found', expected=True)

        # `dict.get(key, {})` only covers a *missing* key; an explicit JSON
        # null would come back as None and break the chained `.get()`.
        season = meta.get('season') or {}
        podcast = meta.get('podcast') or {}

        # The short summary goes first, followed by the (HTML-stripped)
        # long description when both are available.
        summary = strip_or_none(meta.get('description'))
        long_description = strip_or_none(
            clean_html(meta.get('long_description')))
        description = summary or long_description
        if summary and long_description:
            description = summary + '\n\n' + long_description

        # The season ID is only exposed as the last path component of the
        # season's API href.
        season_id = None
        season_href = url_or_none(season.get('href'))
        if season_href:
            season_match = re.match(
                r'https?://api\.simplecast\.com/seasons/(?P<id>[^/?#]+)',
                season_href)
            if season_match:
                season_id = season_match.group('id')

        # The channel URL is the scheme + host part of the public episode
        # page, provided that page lives on a *.simplecast.com host.
        channel_url = None
        webpage_url = url_or_none(meta.get('episode_url'))
        if webpage_url:
            channel_match = re.match(
                r'(?P<wp>https?://[^/]+\.simplecast\.com)',
                webpage_url)
            if channel_match:
                channel_url = channel_match.group('wp')

        return {
            'id': meta['id'],
            'display_id': meta.get('slug') or display_id,
            'title': meta['title'],
            'url': meta['audio_file_url'],
            'webpage_url': webpage_url,
            'channel_url': channel_url,
            'series': strip_or_none(podcast.get('title')),
            'season_number': int_or_none(season.get('number')),
            'season_id': season_id,
            # Fall back to the podcast artwork when the episode has none.
            'thumbnail': url_or_none(
                meta.get('image_url')
                or podcast.get('image_url')),
            'episode_id': meta.get('id'),
            'episode_number': int_or_none(meta.get('number')),
            'description': description,
            'timestamp': unified_timestamp(meta.get('published_at')),
            'duration': int_or_none(meta.get('duration')),
        }
class SimplecastEmbedIE(InfoExtractor):
    """Extractor for embedded Simplecast players.

    Handles ``player.simplecast.com/<id>`` URLs, whose path is used directly
    as the episode ID, and ``embed.simplecast.com/<id>`` URLs, which are
    first resolved by following the HTTP redirect they issue.
    """
    IE_NAME = 'simplecast:embed'
    _VALID_URL = r'https?://(?:embed|player)\.simplecast\.com/(?P<id>[^?]+)'
    _TESTS = [{
        'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': {
            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'ext': 'mp3',
            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
            'thumbnail': r're:^https?://.*\.jpg$',
            'episode_number': 1,
            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
            'season_number': 1,
            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
            'series': 'The RE:BIND.io Podcast',
            'duration': 5343,
            'timestamp': 1580979475,
            'upload_date': '20200206',
            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
            'channel_url': r're:^https://.+\.simplecast.com$',
        }
    }, {
        'url': 'https://embed.simplecast.com/0bab9525',
        'md5': '580b1cc614c8ba33d929aaeeb38c274b',
        'info_dict': {
            'id': '565b3059-5227-4439-86e7-3eb1d43bf209',
            'ext': 'mp3',
            'title': 'Talib Kweli',
            'thumbnail': r're:^https?://.*\.jpg$',
            'episode_number': 22,
            'episode_id': '565b3059-5227-4439-86e7-3eb1d43bf209',
            'description': 'md5:d9d22ebcc84b6938efd3896fd18cc047',
            'season_number': 1,
            'season_id': 'b9df4072-7bd7-4704-b314-8dba46369de4',
            'series': 'Armchair Expert with Dax Shepard',
            'duration': 10435,
            'timestamp': 1528707600,
            'upload_date': '20180611',
            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
            'channel_url': r're:^https://.+\.simplecast.com$',
        }
    }]

    def _real_extract(self, url):
        """Resolve an embed/player URL and delegate to the real extractor.

        Raises ExtractorError when an ``embed.`` URL does not redirect to a
        URL this extractor can make progress on.
        """
        embed_id = self._match_id(url)

        if re.match(r'https?://embed\.', url):
            # Short embed IDs are not episode IDs; the final URL after
            # redirects is what carries the usable ID.
            resolved_url = self._request_webpage(
                'https://embed.simplecast.com/{0}'.format(embed_id), embed_id,
                'Resolving ID', 'Unable to resolve ID').geturl()
            # Without a redirect the resolved URL would be handed straight
            # back to this branch and the extractor would keep delegating
            # to itself, so bail out instead.
            if (re.match(r'https?://embed\.', resolved_url)
                    or not self.suitable(resolved_url)):
                raise ExtractorError(
                    'Unable to resolve embed ID {0}'.format(embed_id))
            return self.url_result(resolved_url, 'SimplecastEmbed')

        # player.simplecast.com/<episode id>
        return self.url_result(
            'https://api.simplecast.com/episodes/{0}'.format(embed_id),
            'Simplecast', embed_id)
class SimplecastEpisodeIE(InfoExtractor):
    """Extractor for public episode pages on ``<podcast>.simplecast.com``.

    The page URL is posted to the API's episode search endpoint to obtain
    the episode ID, and extraction is then delegated to the ``Simplecast``
    extractor.
    """
    IE_NAME = 'simplecast:episode'
    _VALID_URL = r'https?://(?!(?:cdn|embed|player|api|www)\.).*\.simplecast\.com/episodes/(?P<id>[^?]+)'
    _TEST = {
        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': {
            'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
            'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'ext': 'mp3',
            'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
            'thumbnail': r're:^https?://.*\.jpg$',
            'episode_number': 1,
            'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
            'description': 'md5:72c89d3ae63a77a6c55ce8becf170f2e',
            'season_number': 1,
            'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
            'series': 'The RE:BIND.io Podcast',
            'duration': 5343,
            'timestamp': 1580979475,
            'upload_date': '20200206',
            'webpage_url': r're:^https://.+\.simplecast.com/episodes/[^?/]+',
            'channel_url': r're:^https://.+\.simplecast.com$',
        }
    }

    def _real_extract(self, url):
        """Look up the episode ID for a public episode page URL.

        Raises ExtractorError (expected) when the search endpoint answers
        with a 404 status document.
        """
        # The trailing slash is dropped before both the ID match and the
        # search request.
        url = url.rstrip('/')
        display_id = self._match_id(url)

        # The URL is percent-decoded before being sent, and the body is
        # serialised with json.dumps so that quotes or backslashes in the
        # URL cannot produce a malformed JSON document.
        search_payload = json.dumps({
            'url': compat_urllib_parse_unquote_to_bytes(url).decode(
                'utf-8', 'replace'),
        }).encode('utf-8')

        search_result = self._download_json(
            'https://api.simplecast.com/episodes/search', display_id,
            'Looking up episode info', 'Unable to look up episode info',
            data=search_payload,
            headers={'Content-Type': 'application/json;charset=utf-8'},
            expected_status=404)
        if search_result.get('status') == 404:
            raise ExtractorError(
                'The requested episode could not be found', expected=True)

        episode_id = search_result['id']
        self.to_screen('{0}: Real ID is {1}'.format(display_id, episode_id))
        return self.url_result(
            'https://api.simplecast.com/episodes/{0}'.format(episode_id),
            'Simplecast', episode_id, search_result.get('title'))
class SimplecastPodcastIE(InfoExtractor):
    """Extractor for a whole podcast hosted on ``<podcast>.simplecast.com``.

    The site URL is resolved to a podcast ID through the API's site search
    endpoint; the episodes of the podcast's single season are then exposed
    as a lazily paged playlist whose entries point at the ``Simplecast``
    extractor.  Podcasts with more than one season are rejected.
    """
    IE_NAME = 'simplecast:podcast'
    # The negative lookahead ends in `\.` (as in SimplecastEpisodeIE) so
    # that only the reserved host names themselves are excluded, not every
    # subdomain that merely starts with one of them.
    _VALID_URL = r'https?://(?!(?:cdn|embed|player|api|www)\.)(?P<id>.+)\.simplecast\.com(?:/episodes/?)?'
    # Number of episodes requested per API page.
    _PAGE_SIZE = 25
    # Total page count for progress messages; '?' until a page reports it.
    _PAGE_TOTAL = '?'
    _TEST = {
        'url': 'https://the-re-bind-io-podcast.simplecast.com',
        'playlist_mincount': 3,
        'info_dict': {
            'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
            'description': 'md5:5b83928525a22effaee9dd5c2addc378',
            'title': 'The RE:BIND.io Podcast',
        }
    }

    @classmethod
    def suitable(cls, url):
        """Match podcast URLs, leaving episode page URLs to SimplecastEpisodeIE."""
        if SimplecastEpisodeIE.suitable(url):
            return False
        return super(SimplecastPodcastIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a paged playlist of all episodes of the podcast at *url*.

        Raises ExtractorError when the podcast cannot be found, when it has
        no season or more than one season, or when the season ID cannot be
        derived from the season's href.
        """
        podcast_id = self._match_id(url)

        # Serialise with json.dumps (after percent-decoding the URL) so
        # that quotes or backslashes cannot yield a malformed JSON body.
        search_payload = json.dumps({
            'url': compat_urllib_parse_unquote_to_bytes(url).decode(
                'utf-8', 'replace'),
        }).encode('utf-8')
        search_result = self._download_json(
            'https://api.simplecast.com/sites/search', podcast_id,
            'Looking up podcast info', 'Unable to look up podcast info',
            data=search_payload,
            headers={'Content-Type': 'application/json;charset=utf-8'},
            expected_status=404)
        if search_result.get('status') == 404:
            raise ExtractorError(
                'The requested podcast could not be found', expected=True)
        series_id = search_result['podcast']['id']

        # Podcast-level metadata is optional (fatal=False): a failed
        # download simply leaves these fields unset.
        pod_meta = self._download_json(
            'https://api.simplecast.com/podcasts/{0}'.format(series_id),
            podcast_id, 'Downloading podcast metadata',
            'Unable to download podcast metadata',
            fatal=False)
        series_description = series_thumbnail = series_license = None
        if isinstance(pod_meta, dict):
            series_description = strip_or_none(pod_meta.get('description'))
            series_license = strip_or_none(pod_meta.get('copyright'))
            series_thumbnail = pod_meta.get('image_url')

        season_list = self._download_json(
            'https://api.simplecast.com/podcasts/{0}/seasons'.format(series_id),
            podcast_id, 'Downloading season list',
            'Unable to download season list')
        seasons = season_list.get('collection') or []
        if len(seasons) > 1:
            # Haven't seen anything with multiple seasons,
            # not sure how to handle that
            raise ExtractorError(
                'Support for podcasts with more than one season'
                ' has not been implemented')
        if not seasons:
            # Fail here rather than with a NameError on season_id once the
            # playlist pages are evaluated lazily.
            raise ExtractorError(
                'No seasons found for this podcast', expected=True)

        season = seasons[0]
        season_number = int_or_none(season.get('number'))
        # The season ID is the last path component of the season's API href.
        season_match = re.match(
            r'https?://api\.simplecast\.com/seasons/(?P<id>[^/?#]+)',
            season.get('href') or '')
        if not season_match:
            raise ExtractorError('Unable to determine season ID')
        season_id = season_match.group('id')

        series = strip_or_none(search_result['podcast'].get('title'))
        subdomain = search_result.get('subdomain')
        channel_url = None
        if subdomain:
            channel_url = 'https://{0}.simplecast.com'.format(subdomain)

        def get_page(page_num):
            """Yield url-type entries for the zero-based page *page_num*."""
            episode_list = self._download_json(
                'https://api.simplecast.com/seasons/{0}/episodes?limit={1}&offset={2}'.format(
                    season_id, self._PAGE_SIZE, self._PAGE_SIZE * page_num),
                podcast_id,
                'Downloading episode list page {0} of {1}'.format(
                    page_num + 1, self._PAGE_TOTAL),
                'Unable to download episode list page {0} of {1}'.format(
                    page_num + 1, self._PAGE_TOTAL))
            # Remember the reported page count for later progress messages.
            self._PAGE_TOTAL = (
                int_or_none((episode_list.get('pages') or {}).get('total'))
                or '?')
            for episode in episode_list['collection']:
                yield {
                    '_type': 'url',
                    'ie_key': 'Simplecast',
                    'url': 'https://api.simplecast.com/episodes/{0}'.format(episode['id']),
                    'id': episode['id'],
                    'display_id': episode.get('slug'),
                    'title': episode.get('title'),
                    # NOTE(review): read from the site search result, so
                    # the value is identical for every entry - confirm this
                    # is intended rather than a per-episode URL.
                    'webpage_url': url_or_none(search_result.get('episode_url')),
                    'channel_id': search_result.get('id'),
                    'channel_url': channel_url,
                    'series': series,
                    'season_id': season_id,
                    'season_number': season_number,
                    'thumbnail': url_or_none(episode.get('image_url') or series_thumbnail),
                    'episode_id': episode['id'],
                    'episode_number': int_or_none(episode.get('number')),
                    'description': strip_or_none(episode.get('description')),
                    'timestamp': unified_timestamp(episode.get('published_at')),
                    'license': series_license,
                }

        return self.playlist_result(
            OnDemandPagedList(get_page, self._PAGE_SIZE),
            series_id, series, series_description)